Compare commits

..

567 Commits

Author SHA1 Message Date
Aliaksandr Valialkin
3545633934 docs/CHANGELOG.md: cut v1.95.1 2023-11-16 20:31:59 +01:00
Aliaksandr Valialkin
e9d86d7e52 vendor: run make vendor-update 2023-11-16 20:20:27 +01:00
Aliaksandr Valialkin
aefd744abb dashboards: remove path!="/favicon.ico" filter from requests rate graphs
The `path!="/favicon.ico"` filter has little sense, since there are many other special paths,
which may be filtered out - /metrics, /flags, /health, /ping, /robots.txt, /-/healthy, /-/ready, /reload, etc.
See /lib/httpserver/httpserver.go for more details.
It will be hard or impossible to maintain filters for all these paths, so it is better to drop this filter
in order to simplify queries and improve the consistency of these queries.
2023-11-16 19:28:49 +01:00
Aliaksandr Valialkin
aae06e003e app/vmselect: simplify code a bit after 63e0f16062
Use only a single call to prometheus.WriteErrorResponse() inside sendPrometheusError
2023-11-16 18:14:33 +01:00
Aliaksandr Valialkin
84b3c1d3bc docs/FAQ.md: add a link to https://docs.victoriametrics.com/#monitoring in questions where this is needed 2023-11-16 17:45:30 +01:00
Aliaksandr Valialkin
2ea03cf80d lib/handshake: add SetReadDeadline and SetWriteDeadline implementations additionally to SetDeadline
This is a follow-up for 27a5461785

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5327
2023-11-16 16:48:05 +01:00
Roman Khavronenko
1fbd0dd9d8 lib/handshake: check for deadline in Read and Write methods (#5327)
The buffered connection could have exceeded the underlying connection
deadline during reading or writing to an internal buffer.
With this change, buffered connection struct additionally checks
for a deadline in Read/Write methods.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-16 16:47:46 +01:00
Github Actions
8924fea33a Automatic update operator docs from VictoriaMetrics/operator@bc8b02f (#5331) 2023-11-16 16:29:37 +01:00
Roman Khavronenko
8dfc874be3 docs/vmalert: clarify deduplication recommendations for HA setup (#5336)
Please see discussion here https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5279

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-16 16:26:57 +01:00
Aliaksandr Valialkin
61035419d5 docs/CHANGELOG.md: remove duplicate word query after 2cbdb1db22 2023-11-16 16:24:03 +01:00
Aliaksandr Valialkin
2cbdb1db22 app/vmselect/promql: properly handle duplicate series when merging cached results with the results obtained from the database
evalRollupFuncNoCache() may return time series with identical labels (aka duplicate series)
when performing queries satisfying all the following conditions:

- It must select time series with multiple metric names. For example, {__name__=~"foo|bar"}
- The series selector must be wrapped into rollup function, which drops metric names. For example, rate({__name__=~"foo|bar"})
- The rollup function must be wrapped into aggregate function, which has no streaming optimization.
  For example, quantile(0.9, rate({__name__=~"foo|bar"})

In this case VictoriaMetrics shouldn't return `cannot merge series: duplicate series found` error.
Instead, it should fall back to query execution with disabled cache.

Also properly store the merged results. Previously they were incorrectly stored because of a typo
introduced in the commit 41a0fdaf39

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5332
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5337
2023-11-16 16:01:40 +01:00
hagen1778
d389a4fcf3 dashboards: use version instead of short_version in annotations
`version` label won't show the difference if various flavors of the same
version were deployed. But `short_version` will.

For example, on the sandbox env we test VM builds before new version release.
Without this change, the version update won't be visible on dashboard.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-16 09:26:47 +01:00
Github Actions
ec7cac3641 Automatic update Grafana datasource docs from VictoriaMetrics/grafana-datasource@95d0711 (#5329) 2023-11-16 11:12:32 +04:00
Aliaksandr Valialkin
0a28c8e91b docs/Articles.md: add https://hackernoon.com/unleashing-vm-histograms-for-ruby-migrating-from-prometheus-to-victoriametrics-with-vm-client 2023-11-16 00:53:44 +01:00
Yury Molodov
98e73a4022 vmui: change autocomplete hotkey to Alt/Option + A (#5328) 2023-11-15 23:33:10 +01:00
Aliaksandr Valialkin
17c45d1206 docs/vmbackup.md: fix links to https://docs.victoriametrics.com/vmbackup.html#permanent-deletion-of-objects-in-s3-compatible-storages
This is a follow-up for 2fc7e9f47e
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5121
2023-11-15 23:26:32 +01:00
Aliaksandr Valialkin
85bf63078c docs/vmagent.md: refer to proper command-line flag: -remoteWrite.shardByURL.labels instead of -remoteWrite.shardByURLLabels
This is a follow-up for ed70a40669

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4942
2023-11-15 23:03:13 +01:00
Aliaksandr Valialkin
7d4873bcef docs: mention that VictoriaMetrics and vmagent support data ingestion via New Relic protocol now
This is a follow-up for f60c08a7bd
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3520
2023-11-15 22:54:29 +01:00
Aliaksandr Valialkin
a79a439dac docs/Release-Guide.md: point to the proper location for CHANGELOG.md at github.com/VictoriaMetrics/operator repository 2023-11-15 21:55:45 +01:00
Aliaksandr Valialkin
4c25ee3597 deployment: update reference to VictoriaLogs from v0.4.1-victorialogs to v0.4.2-victorialogs 2023-11-15 20:46:53 +01:00
Aliaksandr Valialkin
ebd76588da docs/VictoriaLogs/README.md: cut v0.4.2-victorialogs 2023-11-15 20:43:57 +01:00
Aliaksandr Valialkin
9b248e3b2f deployment: update references to VictoriaMetrics components from v1.94.0 to v1.95.0 2023-11-15 20:35:11 +01:00
Aliaksandr Valialkin
0ea35215f6 app/vmalert-tool: add missing multiarch directory
This is needed for 'make publish-vmalert-tool'
2023-11-15 18:11:50 +01:00
Aliaksandr Valialkin
91f5c24f82 docs/CHANGELOG.md: cut v1.95.0 release 2023-11-15 17:45:52 +01:00
Aliaksandr Valialkin
8d9e365512 vendor: update github.com/klasuspost/compress from v1.17.2 to v1.17.3
See https://github.com/klauspost/compress/releases/tag/v1.17.3
2023-11-15 17:18:22 +01:00
Aliaksandr Valialkin
741013a33f docs/CHANGELOG.md: document v1.93.8 LTS release 2023-11-15 17:12:44 +01:00
Aliaksandr Valialkin
d9a7dea9a1 lib/querytracer: add missing blank comment line after 3121d76bee 2023-11-15 16:10:43 +01:00
Aliaksandr Valialkin
e837968e49 docs/stream-aggregation.md: clarify that stream aggregation is applied after all the configured relabeling
This is a follow-up after 68d2cb203d
2023-11-15 15:54:15 +01:00
Aliaksandr Valialkin
5bfa2a3e97 docs/CHANGELOG.md: document v1.87.11 LTS release 2023-11-15 15:53:05 +01:00
hagen1778
68d2cb203d docs/stream-aggr: specify the relabeling order during aggregation
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-15 14:28:03 +01:00
Aliaksandr Valialkin
02624c41eb app/vmctl/README.md: sync with docs/vmctl.md after 7b2e2a23c2 2023-11-15 12:25:33 +01:00
Aliaksandr Valialkin
ef2113e5df Makefile: remove package-base dependency from publish rule, since this dep is set inside all the publish-* dependencies
This is a follow-up for d4099a75be
2023-11-15 12:23:31 +01:00
John Belmonte
7b2e2a23c2 vmctl README.md typo (#5326) 2023-11-15 11:47:25 +04:00
John Belmonte
efd5d1f2dd relabeling.md: fix link (#5325) 2023-11-15 11:32:09 +04:00
Aliaksandr Valialkin
201fb6ec5c vendor: run make vendor-update 2023-11-14 22:45:07 +01:00
Aliaksandr Valialkin
3076c1f400 lib/ingestserver: properly log the number of closed connections
Previously there was off-by-one error, which resulted in logging len(conns-1) connections instead of len(conns)

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4922
2023-11-14 21:53:24 +01:00
Aliaksandr Valialkin
6a533023b1 docs/CHANGELOG.md: consistently prepend command-line flags with a single dash 2023-11-14 21:44:19 +01:00
Aliaksandr Valialkin
0bf48a5d2a docs/Cluster-VictoriaMetrics.md: clarify how -storage.vminsertConnsShutdownDuration command-line flag works 2023-11-14 21:41:47 +01:00
hagen1778
feff13851c docs: clarify vmalert flag changes
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-14 21:18:58 +01:00
Nikolay
3121d76bee lib/querytracer: makes package concurrent safe to use (#5322)
* lib/querytracer: makes package concurrent safe to use
it must fix various issues with concurrent code usage.
Especially, when it's not reasonable to wait for all goroutines to be finished

* wip

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-11-14 20:59:08 +01:00
Aliaksandr Valialkin
cb106bdf39 lib/logger: increase default -loggerMaxArgLen command-line flag value from 500 to 1000
The 500 chars limit for the maximum arg lengths during logging appeared to be too low for some cases
2023-11-14 19:52:27 +01:00
Artem Navoiev
5d61a7327d docs: update Grafana setup section, use more direct link and add noti… (#5287) 2023-11-14 14:34:01 +01:00
hagen1778
d3ae2b2f62 dashboards: update description for RSS and anonymous memory panels to be consistent for single-node, cluster and vmagent dashboards.
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-14 09:50:06 +01:00
hagen1778
d6ae082598 deployment/dashboards: respect job and instance filters for alerts annotation in cluster and single-node dashboards
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-14 09:38:15 +01:00
Aliaksandr Valialkin
fbc6289a21 app/vmselect/promql: typo fixes after 7cf7740d18 2023-11-14 03:34:37 +01:00
Aliaksandr Valialkin
f9bd265249 lib/ingestserver: typo fix after f7834767c1
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4922
2023-11-14 03:26:26 +01:00
Aliaksandr Valialkin
7cf7740d18 app/vmselect/promql: properly handle instant query optimization conrner cases for min_over_time() and max_over_time()
- If min_over_time(m[offset] @ timestamp) <= min_over_time(m[offset] @ (timestamp-window)),
  then the optimization can be applied.

- If max_over_time(m[offset] @ timestamp) >= max_over_time(m[offset] @ (timestamp-window)),
  then the optimization can be applied.
2023-11-14 02:58:08 +01:00
Yury Molodov
0116a333fb vmui: reduced the number of server requests (#5253)
* vmui: reduced the number of server requests

* run `make vmui-update vmui-logs-update`

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-11-14 01:50:00 +01:00
Aliaksandr Valialkin
43e3302803 docs/CHANGELOG.md: document 0e056ddb2d
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5203
2023-11-14 01:24:05 +01:00
Yury Molodov
0e056ddb2d vmui: fix trailing slash in serverURL (#5271)
* vmui: add function to autoremove slash at the end of serverURL (#5203)

* vmui: change removeTrailingSlash func
2023-11-14 01:21:16 +01:00
Noah Labrecque
12aefa8a4b fix: apply correct bounds to sf and tf (#5274) 2023-11-14 01:17:16 +01:00
Zakhar Bessarab
37997abd14 vmcluster: re-routing enhancement (#5293)
* app/vmstorage: close vminsert connections gradually before stopping storage

Implements graceful shutdown approach suggested here - https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4922#issuecomment-1768146878

Test results for this can be found here - https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4922#issuecomment-1790640274

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* app/vmstorage: update graceful shutdown logic

- close connections from vminsert in determenistic order
- update flag description
- lower default timeout to 25 seconds. 25 seconds value was chosen because the lowest default value used in default configuration deployments is 30s(default value in Kubernetes and ansible-playbooks).

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/cluster: add information about re-routing enhancement during restart

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/changelog: add entry for new command-line flag

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* {app/vmstorage,lib/ingestserver}: address review feedback

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/cluster: add note to update workload scheduler timeout

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* wip

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-11-14 01:03:44 +01:00
Aliaksandr Valialkin
cef7a39ba3 lib/logstorage: always check the previous indexBlockHeader for blocks with matching tenantID and/or streamID
The previous indexBlockHeader may contain blocks for the matching tenantID and/or streamID,
so it must be scanned unconditionally during the search.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5295
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4856

This is a follow-up for 89dcbc2fe7
2023-11-13 23:13:53 +01:00
XLONG96
89dcbc2fe7 lib/logstorage: fix streamID and tenantID search (#4856) (#5295) 2023-11-13 23:09:39 +01:00
Aliaksandr Valialkin
8eed04b2c6 app/vmauth: add ability to drop the specified number of /-delimited prefix parts from request path
This can be done via `drop_src_path_prefix_parts` option at `url_map` and `user` levels.

See https://docs.victoriametrics.com/vmauth.html#dropping-request-path-prefix
2023-11-13 22:32:22 +01:00
Aliaksandr Valialkin
0feaeca3c1 lib/protoparser/promremotewrite: fall back to Snappy decoding if zstd decoding fails
This case is possible after the following steps:

1. vmagent tries to perform handshake with the -remoteWrite.url in order to determine whether
   the remote storage supports zstd-compressed data.
2. The remote storage is unavailable during the handshake. In this case vmagent falls back to Snappy compression
   for the data sent to the remote storage.
3. vmagent compresses the collected data into blocks with Snappy and puts these blocks to persistent queue on disk.
4. The remote storage becomes available.
5. vmagent restarts, performs the handshake with the remote storage and detects that it supports zstd-compressed data.
6. vmagent starts sending Snappy-compressed data from persistent queue to the remote storage,
   while falsely advertizing it sends zstd-compressed data.
7. The remote storage receives Snappy-compressed data and fails unpacking it with zstd.

The solution is to just fall back to Snappy decompression if zstd decompression fails.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5301
2023-11-13 21:19:08 +01:00
Aliaksandr Valialkin
8af56ea2ed lib/htmlcomponents: use relative links for the top page and for favicon.ico
This allows hiding VictoriaMetrics components behind proxies with arbitrary path prefixes.
For example, vmagent HTTP handlers can be served via /vmagent/ path prefix:

- http://proxy/vmagent/targets
- http://proxy/vmagent/service-discovery

The path prefix can be arbitrary. For example, below are vmagent urls
for /tenantID/vmagent/ path prefix:

- http://proxy/tenantID/vmagent/targets
- http://proxy/tenantID/vmagent/service-discovery

While at it, consistently serve favicon.ico from any path directory.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5306
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5307
2023-11-13 20:29:05 +01:00
Aliaksandr Valialkin
cf23dc6480 all: cleanup: remove // +build ... lines, since they are no longer needed after Go1.17, and the minimum supported Go version for VictoriaMetrics source code is Go1.20 2023-11-13 19:12:51 +01:00
Aliaksandr Valialkin
b7fb7c5f77 vendor: run make vendor-update 2023-11-13 18:50:16 +01:00
Aliaksandr Valialkin
3e93fa61ad lib/regexutil: properly handle alternate regexps surrounded by .+ or .*
Previously the following regexps were improperly handled:

  .+foo|bar.+
  .*foo|bar.*

This could lead to unexpected regexp match results.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5297

Thanks to @Haleygo for the initial attempt to fix the issue at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5308
2023-11-13 18:23:38 +01:00
Aliaksandr Valialkin
c8d901424f docs/VictoriaLogs/CHANGELOG.md: follow-up for 66527c5981
Document the change

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5312
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5300
2023-11-13 10:39:48 +01:00
Yury Molodov
66527c5981 vmui: ui logs enhancements (#5312)
* vmui/logs: fix time sorting #5300

* vmui/logs: add base query validation

* vmui/logs: add a message for empty results
2023-11-13 10:36:52 +01:00
Aliaksandr Valialkin
6340911d38 lib/stringsutil: add tests for LimitStringLen() function 2023-11-13 10:32:33 +01:00
Dmytro Kozlov
4722b70c89 lib/stringsutil: fix failing test (#5313)
We have failed test on master branch.

```
--- FAIL: TestFormatLogMessage (0.00s)
    logger_test.go:24: unexpected result; got
        "foo: abcde, \"foo bar baz\", xx"
        want
        "foo: a..e, \"f..z\", xx"
```
if failed because maxArgs maxLen <= 4 in the  `LimitStringLen` in that case we always will return the income string
but in the test we limit the maxLen by value 4
```
f("foo: %s, %q, %s", []interface{}{"abcde", fmt.Errorf("foo bar baz"), "xx"}, 4, `foo: a..e, "f..z", xx`)
2023-11-13 09:51:49 +01:00
Aliaksandr Valialkin
ba058a4514 docs/CHANGELOG.md: remove trailing whitespace after bffd30b57a 2023-11-13 09:24:29 +01:00
Aliaksandr Valialkin
25ff811d78 docs/vmauth.md: add missing dashes in front of command-line flags at the Backend TLS setup section
Dashes must be consistently used in front of command-line flags across the documentation.

This is a follow up for 61594d2bd8
2023-11-13 09:12:57 +01:00
Aliaksandr Valialkin
eded218e8c app/vmauth: properly pass Host header to backends
Previously the `Host` header was remained unchanged when passing it in requests to backends.
This may improperly work if the backend uses host-based routing.

While at it, allows http/2.0 requests to backends. While VictoriaMetrics components
do not accept http/2.0 requests, other backends can require such requests.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5240
2023-11-13 09:05:39 +01:00
Aliaksandr Valialkin
61594d2bd8 app/vmauth: follow-up for 323f3720ed
- Re-use identically configured http.Transport across multiple users.
  This fixes handling of the limit on the number of connection, which can be established per each backend
  via -maxIdleConnsPerBackend command-line flag. This limit stopped working after 323f3720ed

- Add docs about backend TLS setup at https://docs.victoriametrics.com/vmauth.html#backend-tls-setup

- Add ability to disable backend TLS verification for all the users via -backend.tlsInsecureSkipVerify command-line flag.
  This flag may be useful when -auth.config contains big number of users, and every user must disable backend TLS verification.

- Add ability to specify TLS Root CA via tls_ca_file option at per-user basis and via -backend.tlsCAFile command-line flag
  across all the users.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5240
2023-11-13 08:33:10 +01:00
Aliaksandr Valialkin
ec72b750f2 docs/Articles.md: typo fix 2023-11-13 08:05:16 +01:00
Aliaksandr Valialkin
bfec8a3751 app/vmauth: improve docs a bit after 323f3720ed
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5240
2023-11-11 12:49:28 +01:00
Aliaksandr Valialkin
0df8489e0a app/vmagent/README.md: sync with docs/vmagent.md after 930d26b2ff 2023-11-11 12:31:32 +01:00
Aliaksandr Valialkin
230230cf0b lib/logger: add -loggerMaxArgLen command-line flag for fine-tuning the maximum length of logged args 2023-11-11 12:30:08 +01:00
Aliaksandr Valialkin
80213f07fa app/vmselect/promql: optimize instant queries with min_over_time() and max_over_time() rollup functions
This is a follow-up for 41a0fdaf39
2023-11-11 12:10:03 +01:00
Aliaksandr Valialkin
2db1a664e1 deployment: update Go builder from Go1.21.3 to Go1.21.4
See https://github.com/golang/go/issues?q=milestone%3AGo1.21.4+label%3ACherryPickApproved
2023-11-10 22:28:44 +01:00
Aliaksandr Valialkin
010dc15d16 lib/blockcache: do not cache entries, which were attempted to be accessed 1 or 2 times
Previously entries which were accessed only 1 time weren't cached.
It has been appeared that some rarely executed heavy queries may read indexdb block twice
in a row instead of once. There is no need in caching such a block then.
This change should eliminate cache size spikes for indexdb/dataBlocks when such heavy queries are executed.

Expose -blockcache.missesBeforeCaching command-line flag, which can be used for fine-tuning
the number of cache misses needed before storing the block in the caching.
2023-11-10 22:28:03 +01:00
Aliaksandr Valialkin
22498c5087 docs/Articles.md: sort third-party articles by importance 2023-11-10 21:26:51 +01:00
Aliaksandr Valialkin
dc668b8246 docs/Articles.md: add a link to https://blog.cloudflare.com/introducing-http-traffic-anomalies-notifications/ 2023-11-10 20:59:57 +01:00
Aliaksandr Valialkin
a9a26c20b5 docs/Single-server-VictoriaMetrics.md: make High availability section more clear 2023-11-10 20:58:32 +01:00
Aliaksandr Valialkin
d407d13e7b Makefile: update golangci-lint version from v1.54.2 to v1.55.1
See https://github.com/golangci/golangci-lint/releases/tag/v1.55.1
2023-11-10 20:23:48 +01:00
PhracturedBlue
2474281f1b Support building images via podman (#4978) 2023-11-09 00:50:21 -08:00
Zakhar Bessarab
73a1862182 docs/changelog: document vmbackupmanager bugfix (#5303)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-11-08 18:51:14 +01:00
Artem Navoiev
930d26b2ff docs: vmagent change the codeblock languages
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-11-08 18:12:29 +01:00
Github Actions
2b15f2fc8c Automatic update Grafana datasource docs from VictoriaMetrics/grafana-datasource@cc18249 (#5305) 2023-11-08 18:19:12 +04:00
Github Actions
746cc93e95 Automatic update Grafana datasource docs from VictoriaMetrics/grafana-datasource@52bdb4a (#5304) 2023-11-08 17:59:10 +04:00
Roman Khavronenko
bffd30b57a app/vmalert: update remote-write process (#5284)
* app/vmalert: update remote-write process

* automatically retry remote-write requests on closed connections. The change should reduce the amount of logs produced in environments with short-living connections or environments without support of keep-alive on network balancers.
* increment `vmalert_remotewrite_errors_total` metric if all retries to send remote-write request failed. Before, this metric was incremented only if remote-write client's buffer is overloaded.
* increment `vmalert_remotewrite_dropped_rows_total` amd `vmalert_remotewrite_dropped_bytes_total` metrics if remote-write client's buffer is overloaded. Before, these metrics were incremented only after unsuccessful HTTP calls.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* Update docs/CHANGELOG.md

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Hui Wang <haley@victoriametrics.com>
2023-11-08 14:53:07 +08:00
Artem Navoiev
5afc6a5765 github actions: sync docs use the latest hugo version in CI
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-11-07 12:23:12 +01:00
Artem Navoiev
b51d16e74c docs: url example change the title h2->h3 h3->h4 for better indexing in search
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-11-07 09:54:32 +01:00
Artem Navoiev
e4f44bad91 docs: fix formatting in stream aggregation more
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-11-07 09:31:21 +01:00
Artem Navoiev
ea4ca70be1 docs: fix formatting in stream aggregation
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-11-07 09:27:44 +01:00
Github Actions
4d2b98b755 Automatic update operator docs from VictoriaMetrics/operator@b4b79da (#5291) 2023-11-06 16:04:12 +08:00
hagen1778
c07dc45786 app/vmalert: fix typo in remoteWrite.concurrency description
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-03 22:04:50 +01:00
Yury Molodov
f90d2ec843 vmui: display query error on Explore metrics page (#5272)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5202
2023-11-03 16:23:19 +01:00
hagen1778
054367c421 docs: make docs-sync after 323f3720ed
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-03 16:22:23 +01:00
Zakhar Bessarab
323f3720ed app/vmauth: add option to skip TLS verification (#5256)
Add `tls_insecure_skip_verify` option on per-user basis which allows to disable TLS verification for all requests to backend on behalf of this user.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5240

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-11-03 12:04:17 +01:00
Aliaksandr Valialkin
da24c129db vendor: run make vendor-update 2023-11-02 21:01:21 +01:00
Aliaksandr Valialkin
f2b373dd06 go.mod: pin the latest working version of golang.org/x/exp 2023-11-02 20:55:24 +01:00
Aliaksandr Valialkin
815fda8995 docs: update -help output after recent changes to VictoriaMetrics components 2023-11-02 20:27:10 +01:00
Aliaksandr Valialkin
65db6609eb docs/CHANGELOG.md: update the description of the optimization for SLO/SLI-like queries according to latest changes
See commits 4497a08e3d and 92826b0b4a
2023-11-02 20:05:05 +01:00
Roman Khavronenko
b5254199c6 app/vmalert: add label file pointing to the group's filename to metrics (#5281)
The filename should help identifying alerting rules belonging to specific groups
with identical names but different filenames.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5267

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-02 16:01:31 +01:00
hagen1778
6eb205f8b0 app/vmalert: verify alert name correctness in restore test
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-11-02 15:28:39 +01:00
Hui Wang
90d45574bf vmalert: reduce restore query request for each alerting rule (#5265)
reduce the number of queries for restoring alerts state on start-up. 
The change should speed up the restore process and reduce pressure on `remoteRead.url`.
2023-11-02 15:22:13 +01:00
Aliaksandr Valialkin
eeb25bc4a5 app/vmselect/promql: add missing trace message in rollupResultCache.GetSeries() 2023-11-02 09:22:48 +01:00
Aliaksandr Valialkin
dd33fc0c76 docs/CHANGELOG.md: typo fix: tis -> this 2023-11-02 08:33:40 +01:00
Aliaksandr Valialkin
536c1301fd docs/Single-server-VictoriaMetrics.md: document why data inside <-storageDataPath>/snapshots directory should be manipulated only via snapshot API 2023-11-02 08:30:39 +01:00
Aliaksandr Valialkin
87a86ec9db docs/CHANGELOG.md: document v1.93.7 LTS release 2023-11-02 08:21:00 +01:00
Aliaksandr Valialkin
ed70a40669 app/vmagent/remotewrite: add -remoteWrite.shardByURL.labels command-line flag
This command-line flag can be used for specifying a list of labels used for sharding
among -remoteWrite.url entries when -remoteWrite.shardByURL command-line flag is set.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4942
2023-11-01 23:08:54 +01:00
Alexander Marshalov
828ddd4e4f vmauth: add browser authorization request for http requests without… (#5234)
* vmauth: add browser authorization request for http requests without credentials to a route that is not in the `unauthorized_user` section (when `unauthorized_user` is specified).

* add link to issue in CHANGELOG

* Extend vmauth docs

* wip

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-11-01 20:59:46 +01:00
Aliaksandr Valialkin
4497a08e3d app/vmselect/promql: reduce the minimum lookbehind window for enabling SLO/SLI optimizations from 24 hours to 6 hours
This reduction is based on production testing.

Also expose -search.minWindowForInstantRollupOptimization command-line flag, so users could fine-tune this arg for their needs
2023-11-01 20:18:44 +01:00
Aliaksandr Valialkin
f59dda3223 app/vmselect: run make quicktemplate-gen after b8739bc00b 2023-11-01 17:52:56 +01:00
Aliaksandr Valialkin
b8739bc00b app/vmselect: return stats.seriesFetched as string instead of number
vmalert expects string value for stats.seriesFetched, so it is impossible
switching to number without breaking compatibility with old vmalert releases :(

It is still unclear why stats.seriesFetched has string type in the first place...
2023-11-01 17:49:59 +01:00
Github Actions
b44b74d118 Automatic update operator docs from VictoriaMetrics/operator@49826be (#5270)
Co-authored-by: Alexander Marshalov <_@marshalov.org>
2023-11-01 17:45:56 +01:00
Artem Navoiev
324ecbeed6 add Try new Docs button in the current docs
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-11-01 17:22:49 +01:00
Aliaksandr Valialkin
da887b49e7 app/vmui: show query execution duration in the header of query input field
This should simplify the process of query optimization
2023-11-01 16:43:51 +01:00
Hui Wang
e482eeff58 vmalert: support specifying full http url in notifier static_configs target (#5261)
* vmalert: support specifying full http or https urls in notifier static_configs target address
* show right label results in ui
2023-11-01 19:53:50 +08:00
Aliaksandr Valialkin
92826b0b4a app/vmselect/promql: apply SLO-like optimization to all the count_*_over_time() functions
This is a follow-up for 41a0fdaf39
2023-11-01 09:58:16 +01:00
Aliaksandr Valialkin
0189081490 app/vmselect/promql: typo fix, which could lead to panic during range query execution
The panic is:

  BUG: unexpected values after merging new values

This is a follow-up for 41a0fdaf39
2023-11-01 09:58:15 +01:00
Github Actions
3e76c3bd50 Automatic update operator docs from VictoriaMetrics/operator@57c1bf6 (#5266) 2023-11-01 16:33:25 +08:00
Aliaksandr Valialkin
c4c6ee9485 app/vmui: fix non-working Disable cache checkbox at JSON and Table views 2023-10-31 22:58:06 +01:00
Aliaksandr Valialkin
a70818f72f app/vmselect/promql: properly calculate rollup result if lookbehind window isn't set
This is a follow-up for 41a0fdaf39
2023-10-31 22:22:37 +01:00
Aliaksandr Valialkin
ea81f6fc36 app/vmselect/promql: add outliers_iqr(q) and outlier_iqr_over_time(m[d]) functions
These functions allow detecting anomalies in series and samples using Interquartile range method.
See Outliers section at https://en.wikipedia.org/wiki/Interquartile_range for more details.
2023-10-31 22:10:31 +01:00
Aliaksandr Valialkin
fba93dbe0b vendor: run make vendor-update 2023-10-31 20:19:51 +01:00
Aliaksandr Valialkin
41a0fdaf39 app/vmselect/promql: optimize repeated SLI-like instant queries with lookbehind windows >= 1d
Repeated instant queries with long lookbehind windows, which contain one of the following rollup functions,
are optimized via partial result caching:

- sum_over_time()
- count_over_time()
- avg_over_time()
- increase()
- rate()

The basic idea of optimization is to calculate

  rf(m[d] @ t)

as

  rf(m[offset] @ t) + rf(m[d] @ (t-offset)) - rf(m[offset] @ (t-d))

where rf(m[d] @ (t-offset)) is cached query result, which was calculated previously

The offset may be in the range of up to 1 hour.
2023-10-31 19:25:23 +01:00
Aliaksandr Valialkin
c96fc05f3e docs/Cluster-VictoriaMetrics.md: sync with cluster branch after 9d8f93050c 2023-10-31 19:13:11 +01:00
Aliaksandr Valialkin
51aab7bb17 app/vmselect/promql: wrap too long line after a950873fff 2023-10-31 18:59:10 +01:00
Aliaksandr Valialkin
714af89b13 lib/httpserver: follow-up for 0638bbe69c
- Replace spaces with underscores in the `reason` label value for the vm_http_request_errors_total metric
  in order be consistent with Prometheus-like naming

- Clarify the description for the change at docs/CHANGELOG.md

Updates https://github.com/victoriaMetrics/victoriaMetrics/issues/4590
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5166
2023-10-31 18:52:39 +01:00
Aliaksandr Valialkin
98699f203b lib/persistentqueue: properly re-create flock.lock file inside directory if persistent queue is broken.
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5249

Thanks to @Sniper91 for the bugreport and initial fix at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5233
2023-10-31 18:38:32 +01:00
Aliaksandr Valialkin
efb6ac27c2 lib/httpserver: call Request.Header() only once instead of calling it each time a new request header is set
This is a follow-up for ad839aa492
2023-10-31 18:38:32 +01:00
Artem Navoiev
68f82b1c06 github actions: fix typo in hugo version
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-31 17:52:35 +01:00
Artem Navoiev
1020faa7b4 github actions: use 0.119 hugo version as far latest contains bug
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-31 17:50:12 +01:00
Aliaksandr Valialkin
aade16f534 docs/Cluster-VictoriaMetrics.md: clarify the description on why -dosnwampling.period must be set at both vmstorage and vmselect
This is a follow-up for ca7457d906
2023-10-31 16:53:46 +01:00
Aliaksandr Valialkin
a71efce784 docs/Single-server-VictoriaMetrics.md: cosmetic fixes after 23369321f1 2023-10-31 16:42:29 +01:00
Aliaksandr Valialkin
4ac95b6f49 docs/CHANGELOG.md: move the description for -http.header.* command-line flags from SECURITY to FEATURE
The SECURITY label should be applied only to changes, which fix security issues.
The change at ad839aa492 adds new command-line flags, which can be used
for improving security in some cases. They do not fix any security issues.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5111
2023-10-31 16:23:08 +01:00
Aliaksandr Valialkin
7ac49162c6 lib/storage: follow-up for 29cebd82fb
Use atomic.CompareAndSwapUint32() instead of atomic.LoadUint32() followed by atomic.StoreUint32().
This makes the code more clear.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5159
2023-10-31 16:08:54 +01:00
hagen1778
f6208965ce dashboards/cluster: fix description about max threshold for Concurrent selects panel.
Before, it was mistakenly implying that `max` is equal to the double of available CPUs.

Addresses https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5214

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-31 16:05:33 +01:00
Roman Khavronenko
a950873fff app/vmselect: expose vm_memory_intensive_queries_total counter metric (#5208)
The new metric gets increased each time `-search.logQueryMemoryUsage` memory limit
is exceeded by a query. This metric should help to identify expensive and heavy queries
without inspecting the logs.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-31 13:31:09 +01:00
hagen1778
a8051d48c4 docs: follow-up for 0638bbe69c
0638bbe69c
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-31 12:54:30 +01:00
venkatbvc
0638bbe69c vmauth: add counter metrics for auth successes and failures (#5166)
New labels `reason="wrong basic auth creds"` and `reason="wrong auth key"` were
added to metric `vm_http_request_errors_total`  to help identify auth errors.

https://github.com/victoriaMetrics/victoriaMetrics/issues/4590

Co-authored-by: Rao, B V Chalapathi <b_v_chalapathi.rao@nokia.com>
Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
2023-10-31 12:48:02 +01:00
hagen1778
aaf9e3d526 dashboards/vmalert: add new panel Missed evaluations
The new panel supposed to indicate alerting groups that miss their evaluations.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-31 10:35:19 +01:00
hagen1778
9866974a53 deployment/alerts: add TooManyMissedIterations alerting rule
The new rule for vmalert supposed to detect groups that miss their
evaulations due to slow queries.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-31 10:35:18 +01:00
hagen1778
8874b525b7 dashboards: fix Errors rate to Alertmanager filter
The panel `Errors rate to Alertmanager` had `group` label filter
applied to the expression, while the metric `vmalert_alerts_send_errors_total`
doesn't have that label. This resulted into always empty results.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-31 10:16:45 +01:00
Roman Khavronenko
ca7457d906 docs: explain motivation behind having -downsampling.period on vmselect (#5205)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-30 19:03:36 +01:00
Roman Khavronenko
23369321f1 docs: mention information loss when downsampling gauges (#5204)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-30 15:29:06 +01:00
Hui Wang
abcb21aa5e vmalert: fix alert firing state in replay mode (#5192)
fix possible missing firing states for alerting rules in replay mode
Before if one firing stage is bigger than single query request range, like rule with a big `for`, alerting rule won't able to be detected as firing.

Co-authored-by: hagen1778 <roman@victoriametrics.com>
2023-10-30 13:54:18 +01:00
hagen1778
e964df8039 docs/troubleshooting: mention issue with un-ordered labels
See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5219#issuecomment-1773441711

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-30 13:53:14 +01:00
hagen1778
a64b37cf24 docs: rm mention of default values for security HTTP headers
The headers, their corresponding flags are mentioned at
https://docs.victoriametrics.com/#security

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-30 11:46:17 +01:00
Dima Lazerka
ad839aa492 lib/httpserver: add flags to specify HSTS / Frame-Options / CSP headers for httpserver (#5111)
support `Strict-Transport-Security`, `Content-Security-Policy` and `X-Frame-Options`
HTTP headers in all VictoriaMetrics components. 
The values for headers can be specified by users via the following flags: 
`-http.header.hsts`, `-http.header.csp` and `-http.header.frameOptions`.

Co-authored-by: hagen1778 <roman@victoriametrics.com>
2023-10-30 11:33:38 +01:00
Roman Khavronenko
29cebd82fb lib/storage: log warning about RO mode only on state change (#5191)
Before, vmstorage would log the same message each second producing excessive
amount of logs.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5159

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-30 10:52:57 +01:00
Aliaksandr Valialkin
9149353a36 app/vmui: change the order of tables at Top queries tab
Move the most interesting table - queries with the most summary time to execute - to the top
2023-10-28 11:56:16 +02:00
Aliaksandr Valialkin
613b545dfd lib/promscrape/discovery/kubernetes: propagate possible errors at newAPIWatcher() to the caller
This allows substituting FATAL panics with recoverable runtime errors such as missing or invalid TLS CA file
and/or missing/invalid /var/run/secrets/kubernetes.io/serviceaccount/namespace file.
Now these errors are logged instead of PANIC'ing, so they can be fixed by updating the corresponding files
without the need to restart vmagent.

This is a follow-up for 90427abc65
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5243
2023-10-27 20:24:46 +02:00
Hui Wang
90427abc65 lib/promscrape/discovery/kubernetes: avoid possible panic if given caFile under kubernetes.SDConfig.HTTPClientConfig is not exist (#5243)
follow up d5a599badc
2023-10-27 20:20:22 +02:00
Aliaksandr Valialkin
632d788b63 lib/promscrape/discovery/kubernetes: stop all the url watchers, which belong to a particular groupWatcher, at once
Previously url watchers for pod, service and node objects could be mistakenly closed
when service discovery was set up only for endpoints and endpointslice roles,
since watchers for these roles may start start pod, service and node url watchers
with nil apiWatcher passed to groupWatcher.startWatchersForRole().

Now all the url watchers, which belong to a particular groupWatcher, are stopped at once
when this groupWatcher has no apiWatcher subscribers.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5216

The issue has been introduced in v1.93.5 when addressing https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4850
2023-10-27 13:51:35 +02:00
Hui Wang
7c90ce39cb do not print redundant error logs when failed to scrape consul or no… (#5239)
* do not print redundant error logs when failed to scrape consul or nomad target
prometheus performs the same because it uses consul lib which just drops the error(1806bcb38c/api/api.go (L1134))
2023-10-27 13:31:55 +08:00
hagen1778
3aec7eb44f app/vmalert: remove unclear comment
The timestamp alignment should be applied as a last step
to keep the timestamp consistent.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-26 15:41:35 +02:00
Daria Karavaieva
b60bb1d98a model list - isolation forest (#5235)
* model list - isolation forest

* curse of dimensionality

* isol forest definition change, minor fixes

* blank line fix
2023-10-26 12:25:54 +02:00
Aliaksandr Valialkin
68b1b3c4d4 Makefile: move build commands for vmalert-tool closer to vmalert
This should simplify maintenance of Makefile commands related to vmalert.

This is a follow-up for dc28196237
2023-10-26 08:07:50 +02:00
Aliaksandr Valialkin
cdbc06a639 lib/promscrape: do not add a suggestion for enabling TCP6 in error message when the dial address is TCPv4 2023-10-25 17:57:56 -07:00
Dima Lazerka
8b41b506c2 Revert "lib/promscrape: do not add a suggestion for enabling TCP6 in error message when the dial address is TCPv4"
It broke CI (lint)

This reverts commit 5464376d16.
2023-10-25 16:24:31 -07:00
Aliaksandr Valialkin
5464376d16 lib/promscrape: do not add a suggestion for enabling TCP6 in error message when the dial address is TCPv4 2023-10-26 00:29:51 +02:00
Aliaksandr Valialkin
ac933cc423 lib/promscrape: properly track the number of updated service discovery routines inside Config.mustRestart()
This is a follow-up for d5a599badc
2023-10-26 00:06:29 +02:00
Aliaksandr Valialkin
612dcf231a lib/promauth: typo fix in the error message after d5a599badc: obtaine -> obtain 2023-10-25 23:38:00 +02:00
Aliaksandr Valialkin
d5a599badc lib/promauth: follow-up for e16d3f5639
- Make sure that invalid/missing TLS CA file or TLS client certificate files at vmagent startup
  don't prevent from processing the corresponding scrape targets after the file becomes correct,
  without the need to restart vmagent.
  Previously scrape targets with invalid TLS CA file or TLS client certificate files
  were permanently dropped after the first attempt to initialize them, and they didn't
  appear until the next vmagent reload or the next change in other places of the loaded scrape configs.

- Make sure that TLS CA is properly re-loaded from file after it changes without the need to restart vmagent.
  Previously the old TLS CA was used until vmagent restart.

- Properly handle errors during http request creation for the second attempt to send data to remote system
  at vmagent and vmalert. Previously failed request creation could result in nil pointer dereferencing,
  since the returned request is nil on error.

- Add more context to the logged error during AWS sigv4 request signing before sending the data to -remoteWrite.url at vmagent.
  Previously it could miss details on the source of the request.

- Do not create a new HTTP client per second when generating OAuth2 token needed to put in Authorization header
  of every http request issued by vmagent during service discovery or target scraping.
  Re-use the HTTP client instead until the corresponding scrape config changes.

- Cache error at lib/promauth.Config.GetAuthHeader() in the same way as the auth header is cached,
  e.g. the error is cached for a second now. This should reduce load on CPU and OAuth2 server
  when auth header cannot be obtained because of temporary error.

- Share tls.Config.GetClientCertificate function among multiple scrape targets with the same tls_config.
  Cache the loaded certificate and the error for one second. This should significantly reduce CPU load
  when scraping big number of targets with the same tls_config.

- Allow loading TLS certificates from HTTP and HTTPs urls by specifying these urls at `tls_config->cert_file` and `tls_config->key_file`.

- Improve test coverage at lib/promauth

- Skip unreachable or invalid files specified at `scrape_config_files` during vmagent startup, since these files may become valid later.
  Previously vmagent was exitting in this case.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959
2023-10-25 23:19:37 +02:00
Aliaksandr Valialkin
c22e3e7b1d lib/promscrape/discovery/kubernetes/kubeconfig_test.go: make TestParseKubeConfigSuccess test code easier to follow 2023-10-25 23:17:18 +02:00
Aliaksandr Valialkin
eed5206376 lib/promauth: properly parse string contents for ca, cert and key fields at tls_config
Previously yaml parser wasn't accepting string values for these fields,
because it was mistakenly expecting a list of uint8 values instead.
2023-10-25 23:12:21 +02:00
Aliaksandr Valialkin
4afcb2a689 lib/promscrape: move duplicate code from functions, which collect ScrapeWork lists for distinct SD types into Config.getScrapeWorkGeneric()
This removes more than 200 lines of duplicate code
2023-10-25 23:03:40 +02:00
Aliaksandr Valialkin
cb34d4440c app/vmalert/config: fix flacky test TestParseBad
It could return either `failed to read` or `failed to parse` errors depending
on whether the given url can be loaded or not under the current environment
2023-10-25 21:30:55 +02:00
Aliaksandr Valialkin
42dd71bb63 all: consistently use %w instead of %s in when error is passed to fmt.Errorf()
This allows consistently using errors.Is() for verifying whether the given error wraps some other known error.
2023-10-25 21:24:03 +02:00
Aliaksandr Valialkin
305c96e384 lib/workingsetcache: fix outdated comments for Load() and New() functions 2023-10-25 21:04:20 +02:00
hagen1778
c07909a20b app/vmalert: fix typo in tests
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-25 16:28:27 +02:00
hagen1778
eed0c3c6b0 app/vmalert: fix tests after a216fe6728
a216fe6728
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-25 16:25:26 +02:00
hagen1778
a216fe6728 app/vmalert: follow-up after c9375cac5e
c9375cac5e

Descriptions were updated in attempt to make it more clear for readers,
re-phrasing and linking missing docs.

`eval_delay` was added to tests to verify it can be unmarshalled.

`eval_delay` is now applied before timestamp alignment to make it more predictable.
Before, if delay < interval the timestamp won't be aligned.

`eval_delay` and `eval_offset` was added to API output.

`PreviouslySentSeriesToRW` converted to private `previouslySentSeriesToRW`.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-25 13:07:13 +02:00
Hui Wang
c9375cac5e vmalert: add -rule.evalDelay flag and eval_delay as group attribute (#5185)
Also mark `-datasource.lookback` as will be deprecated, see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155.
2023-10-25 11:54:18 +02:00
hagen1778
4e0a779efe deployment/alerts: update TooHighMemoryUsage annotation
The memory usage isn't measured on 5m interval anymore.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-24 09:53:44 +02:00
hagen1778
003ef3a518 deployment/alerts: make TooHighMemoryUsage more tolerable to spikes
Using `min_over_time` should reduce the amount of false positives when
component is running in near-the-threshold state. Now it should trigger
only if all collected samples were above the threshold on 10m interval.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-24 09:39:46 +02:00
hagen1778
685f9c3c98 deployment/alerts: make RemoteWriteConnectionIsSaturated expr readable
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-24 09:31:57 +02:00
Artem Navoiev
3e0236b4a3 github-actions: build search index during the doc sync (#5224)
* github-actions: build search index during the doc sync

Signed-off-by: Artem Navoiev <tenmozes@gmail.com>

---------

Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-21 12:08:32 -07:00
Alexander Marshalov
33484d3365 lib/streamaggr: respect streamAgg.dropInput with empty stream aggr config (#5213)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5207
2023-10-20 15:55:58 +02:00
krakazyabra
239849db5d docs/case-studies: update Wedos info (#5211) 2023-10-19 22:38:40 +02:00
Aliaksandr Valialkin
2f21c0c119 docs: use https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest instead of https://github.com/VictoriaMetrics/VictoriaMetrics/releases link where needed
The https://github.com/VictoriaMetrics/VictoriaMetrics/releases link may show non-latest
releases at the top, such as LTS releases or VictoriaLogs releases.
So it is better to use https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest link,
which always redirect to the latest available release of VictoriaMetrics.
2023-10-18 20:06:25 +02:00
Roman Khavronenko
b8b6e120ff app/vmselect: limit the number of parallel workers by 32 (#5195)
* app/vmselect: limit the number of parallel workers by 32

The change should improve performance and memory usage during query processing
on machines with big number of CPU cores. The number of parallel workers for
query processing is controlled via `-search.maxWorkersPerQuery` command-line flag.
By default, the number of workers is limited by the number of available CPU cores,
but not more than 32. The limit can be increased via `-search.maxWorkersPerQuery`.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* wip

- The `-search.maxWorkersPerQuery` command-line flag doesn't limit resource usage,
  so move it from the `resource usage limits` to `troubleshooting` chapter at docs/Single-server-VictoriaMetrics.md

- Make more clear the description for the `-search.maxWorkersPerQuery` command-line flag

- Add the description of `-search.maxWorkersPerQuery` to docs/Cluster-VictoriaMetrics.md

- Limit the maximum value, which can be passed to `-search.maxWorkersPerQuery`, to GOMAXPROCS,
  because bigger values may worsen query performance and increase CPU usage

- Improve the the description of the change at docs/CHANGELOG.md. Mark it as FEATURE instead of BUGFIX,
  since it is closer to a feature than to a bugfix.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-10-18 19:51:37 +02:00
Yury Molodov
ad9e2b21ee vmui: update dependencies (#5194) 2023-10-18 18:38:32 +02:00
Aliaksandr Valialkin
38b8872a47 docs/Articles.md: add an article https://rtfm.co.ua/en/victoriametrics-vmauth-proxy-authentication-and-authorization/ 2023-10-18 18:29:26 +02:00
Aliaksandr Valialkin
ad871dd9ed docs/FAQ.md: add questions on why VictoriaMetrics doesnt rebalance data and doesnt restore replication factor between vmstorage nodes 2023-10-18 18:06:26 +02:00
Aliaksandr Valialkin
ea4758f5cd docs/FAQ.md: refresh the answer to the question about how does VictoriaMetrics compare to competing solutions
- Mention Grafana Mimir
- Fix broken links
2023-10-18 09:13:57 +02:00
Github Actions
0a6932acfe Automatic update operator docs from VictoriaMetrics/operator@2c826bb (#5188) 2023-10-17 15:57:11 +02:00
hagen1778
fd2d07ba33 lib/storage: follow-up after 188cfe3a85
188cfe3a85

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5159

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-17 15:45:14 +02:00
Ilya Trefilov
188cfe3a85 lib/storage: do not create tsid if metric contains stale marker(#5069) (#5174)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5069
2023-10-17 15:30:58 +02:00
Hui Wang
e16d3f5639 fix inconsistent behaviors with prometheus when scraping (#5153)
* fix inconsistent behaviors with prometheus when scraping

1. address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959. skip job with wrong syntax in `scrape_configs` with error logs instead of exiting;
2. show error messages on vmagent /targets ui if there are wrong auth configs in `scrape_configs`, previously will print error logs and do scrape without auth header;
3. don't send requests if there are wrong auth configs in:
    1. vmagent remoteWrite;
    2. vmalert datasource/remoteRead/remoteWrite/notifier.

* add changelogs

* address review comments

* fix ut
2023-10-17 17:58:19 +08:00
hagen1778
3e2f09541e docs: mention key concepts as querying example
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5169

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-17 11:01:17 +02:00
hagen1778
c2d252c045 dashboards/vmalert: respect job and instance filters in No data errors
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-17 09:40:39 +02:00
hagen1778
edba9f6266 dashboards/vmalert: use desc sorting for tooltips on panels
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-17 09:31:09 +02:00
Aliaksandr Valialkin
f89051f83f docs/Articles.md: add newly appeared articles about VictoriaMetrics at medium.com
- https://sarthak-acoustic.medium.com/solving-metrics-at-scale-with-victoriametrics-ac9c306826c3
- https://medium.com/@seifeddinerajhi/victoriametrics-a-comprehensive-guide-comparing-it-to-prometheus-and-implementing-kubernetes-03eb8feb0cc2
2023-10-17 00:55:35 +02:00
Aliaksandr Valialkin
14f3d844fe docs/CHANGELOG.md: document v1.93.6 LTS release
See https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.6
2023-10-17 00:53:18 +02:00
Aliaksandr Valialkin
daaf2b0e61 docs/CHANGELOG.md: document v1.87.10 release
See https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.10
2023-10-16 23:25:38 +02:00
Aliaksandr Valialkin
da77f4deeb app/vmselect/promql: add labels_equal(q, "label1", "label2", ...) function
This function returns q series, which have identical values for the listed labels
"label1", "label2", ...

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5148
2023-10-16 21:50:11 +02:00
Aliaksandr Valialkin
484b5ed12f docs/MetricsQL.md: typo fix after bdb743c88d
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5071
2023-10-16 21:09:36 +02:00
Aliaksandr Valialkin
6c3dd16a16 app/vmagent/remotewrite: move sas var initialization closer to the place where it is used
This makes the code sligthtly easier to understand.

This is a follow-up for 1d3d989be5

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5170
2023-10-16 20:52:56 +02:00
Aliaksandr Valialkin
bdb743c88d app/vmselect/promql: add drop_empty_series() function for dropping empty series before performing additional calculations
This can be useful in the following queries:

   drop_empty_series(temperature <= 30) default 40

This query drops temperature series with all the values bigger than 30 on the selected time range,
while replacing gaps in the remaining series with 40.

The query without drop_empty_series:

  (temperature <= 30) default 40

would leave all the temperature series with all the values bigger than 30 on the selected time range,
and replace all their values with 40. This is not what could be epxected in some cases
like here - https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5071
2023-10-16 20:44:56 +02:00
hagen1778
1d3d989be5 app/vmagent/remotewrite: follow-up after 4f102ff945
4f102ff945
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-16 16:00:24 +02:00
luosjde
4f102ff945 vmagent: fix streamaggr config reload bug
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5170

Authored-by: luoshaojun01 <luoshaojun01@baidu.com>
2023-10-16 15:57:24 +02:00
Aliaksandr Valialkin
b8c267075e lib/promscrape: add a link to https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets in descriptions for -promscrape.cluster.* command-line flags
This should help users figuring out the purpose of -promscrape.cluster.* command-line flags
2023-10-16 14:46:22 +02:00
Aliaksandr Valialkin
e25318e8ca vendor: run make vendor-update 2023-10-16 14:28:52 +02:00
Aliaksandr Valialkin
fc98b62760 lib/promutils, app/vmalert-tool/unittest: move promutils.Duration.ParseTime() to app/vmalert-tool/unittest.durationToTime()
The ParseTime() function looks strange, since it converts relative duration to absolute time since Unix Epoch.
In most scenarios such a conversion is used by mistake.

It is better to do not expose such a function for public use and hide it inside the package where it is needed,
e.g. inside app/vmalert-tool/unittest.

This is a follow-up for dc28196237
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2945
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4789
2023-10-16 14:19:31 +02:00
Aliaksandr Valialkin
6f98b9c221 Revert "docs victorialogs use relative links"
This reverts commit 3d7a77bf82.

Reason for revert: relative links do not work properly at GitHub code
and at GitHub wiki. For example, the following page contains broken links
before reverting this commit:

https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/docs/VictoriaLogs/CHANGELOG.md

It is always better to use absolute links thank relative links, since the page contents
can be copy-n-pasted to other pages, which are located in vastly different directories,
and all the links will remain working.
2023-10-16 13:40:43 +02:00
Aliaksandr Valialkin
56b9c0b717 docs/CaseStudies.md: typ fix: vmgent -> vmagent
This is a follow-up for f5c46b8176
2023-10-16 13:33:04 +02:00
Aliaksandr Valialkin
07150359b2 docs/vmbackup.md: clarify documentation about -deleteAllObjectVersions command-line flag
Updates 2fc7e9f47e
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5121
2023-10-16 12:09:46 +02:00
Alexander Marshalov
b248413a07 fixed error when creating a full backup using the -origin flag (#5180)
* fixed error when creating a full backup using the `-origin` flag (#5144)

* Update docs/CHANGELOG.md

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-10-16 12:02:51 +02:00
Haleygo
8b6ccad41d fix ingesting stale point, follow up fe8cc573d1 (#5179) 2023-10-16 09:05:37 +02:00
Github Actions
7abdbbc8e2 Automatic update operator docs from VictoriaMetrics/operator@79298bf (#5177) 2023-10-16 10:45:46 +08:00
Aliaksandr Valialkin
97a7128e4b app/vmselect/promql: do not use unsafe conversion from bytes slice to string when storing a value by map key
The assigned map key shouldn't change over time, otherwise the map won't work properly.

This is a follow-up for 1f91f22b5f
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087
2023-10-16 01:56:57 +02:00
Aliaksandr Valialkin
2c334ed953 app/{vmagent,vminsert}: follow-up for NewRelic data ingestion protocol support
This is a follow-up for f60c08a7bd

Changes:

- Make sure all the urls related to NewRelic protocol start from /newrelic . Previously some urls were started from /api/v1/newrelic

- Remove /api/v1 part from NewRelic urls, since it has no sense

- Remove automatic transformation from CamelCase to snake_case for NewRelic labels and metric names,
  since it may complicate the transition from NewRelic to VictoriaMetrics. Preserve all the metric names and label names,
  so users could query metrics and labels by the same names which are used in NewRelic.
  The automatic transformation from CamelCase to snake_case can be added later as a special action for relabeling rules if needed.

- Properly update per-tenant data ingestion stats at app/vmagent/newrelic/request_handler.go . Previously it was always zero.

- Fix NewRelic urls in vmagent when multitenant data ingestion is enabled. Previously they were mistakenly started from `/`.

- Document NewRelic data ingestion url at docs/Cluster-VictoriaMetrics.md

- Remove superflouos memory allocations at lib/protoparser/newrelic

- Improve tests at lib/protoparser/newrelic/*

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3520
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4712
2023-10-16 00:25:25 +02:00
Aliaksandr Valialkin
ddbe713470 docs/Single-server-VictoriaMetrics.md: add a link to the original issue, which describes how to run VictoriaMetrics as Windows service
This is a follow-up for cc7d5b7bab

The original issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3781 contains
up-to-date information on how to run VictoriaMetrics components as Windows service,
plus it may contain additional information about this case such as https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3781#issuecomment-1708092680 ,
so it is better to refer this issue from the docs.
2023-10-15 19:33:39 +02:00
Roman Khavronenko
3594214a16 lib/vmselect: bump maxSearchQuerySize to 5MB (#5158)
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5154#issuecomment-1757216612

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5154

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-15 19:24:38 +02:00
Artem Navoiev
8dff1c696f docs fix broken links
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 15:39:49 +02:00
Artem Navoiev
a200aaf0ef docs fix broken links in operator
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 15:17:17 +02:00
Artem Navoiev
b808fe959a docs vlogs changelog fix broken link
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 15:10:28 +02:00
Artem Navoiev
065a6b020b docs fix 404 external links
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 15:03:13 +02:00
Artem Navoiev
f5c46b8176 docs fix bad links
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 14:56:06 +02:00
Artem Navoiev
3d7a77bf82 docs victorialogs use relative links
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 05:42:29 -07:00
Artem Navoiev
83d64aa621 docs/vlogs/dataingetion remoev unknown language from codeblock
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 13:54:41 +02:00
Artem Navoiev
8d3b732ce0 docs/vmlogs fix aliases
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 13:45:57 +02:00
hagen1778
60eab8e2ef docs: fix typo
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-14 07:46:00 +02:00
hagen1778
fe8cc573d1 docs: remove extra / in the end of the link
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-14 07:43:40 +02:00
Artem Navoiev
1ca0b86397 docs remove (( from the link to fix them
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-14 00:14:41 +02:00
Artem Navoiev
84cf96228f docs/alert-tools add yaml output type for the correct displaying
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-13 09:39:27 -07:00
Zakhar Bessarab
b246b0e55f deployment/logs-benchmark: add suite for Loki (#5165)
* deployment/logs-benchmark: add suite for Loki

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* deployment/logs-benchmark: update go image to 1.21.3

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* deployment/logs-benchmark: split command to run elk and loki

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-10-13 17:13:41 +04:00
Aliaksandr Valialkin
d1638db849 docs/Cluster-VictoriaMetrics.md: follow-up after f42c12e69a
Return back accidentally deleted text regarding KISS principles used in VictoriaMetrics development
2023-10-13 15:00:09 +02:00
Aliaksandr Valialkin
3f5a41e35e lib/license: cleanups and prettifications for log messages and docs related to licensing
- Make more clear the docs at docs/enterprise.md, so readers could figure out faster
  on how to obtain enterprise key and how to pass it to VictoriaMetrics Enterprise components.

- Fix examples at docs/enterprise.md, which were referring to non-existing `-license-file` command-line flag.
  The `-licenseFile` command-line flag must be used instead.

- Improve the description of `-license*` command-line flags, so users could understand
  faster how to use them.

- Improve the warning message, which is emitted when the deprecated -eula command-line flag is passed,
  so the user could figure out how to switch faster to -license* command-line flags.

- Disallow running VictoriaMetrics components with both -license and -licenseFile command-line flags.

- Disallow running VictoriaMetrics components when -licensFile points to an empty file.

- Consistently use the phrase "This flag is available only in Enterprise binaries" across
  all the enterprise-specific command-line flags.

- Remove unneeded level of indirection for `noLicenseMessage` and `expiredMessage` string contants
  in order to improve code readability and maintainability.

- Remove unneded `return` statements after `logger.Fatalf()` calls, since these calls exit the app and never return.

- Make sure that the info log message about successful license verification is emitted
  when the license is verified successfully. Previously the error message could be logged
  when the license payload is invalid or if it misses some required features.
2023-10-13 14:56:09 +02:00
Artem Navoiev
36b0cea3f3 docs: mention the free trial for enterprise
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
(cherry picked from commit 636d4ea196)
2023-10-13 14:31:37 +02:00
Haleygo
dc28196237 vmalert-tool: implement unittest (#4789)
1. split package rule under /app/vmalert, expose needed objects
2. add vmalert-tool with unittest subcmd

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2945
2023-10-13 13:54:33 +02:00
Artem Navoiev
98a5007d32 fix title for managed qs guide
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-13 11:10:58 +02:00
Aliaksandr Valialkin
4f4bc515e0 Revert "improve .gitignore"
This reverts commit a8345bb1b9

Reason for revert: VictoriaMetrics binaries are consistently created inside `bin` directory at the root of the repository
when running `make <vm-app>` according to https://docs.victoriametrics.com/#how-to-build-from-sources

If some dev environments create binaries inside random directories, then it is better to provide docs
at https://docs.victoriametrics.com/#how-to-build-from-sources on how to setup these IDEs, so they
consistently create binaries at bin/* directory at the root of the repository instead of trying to add
random ignore rules inside .gitignore.

As for the data directories created by VictoriaMetrics components, they may be created at random places too,
so there is little sense in trying to add ignore rules for all these directories inside .gitignore.
It is better to document that the built binaries must be consistently started from the repository root,
so data directories are created at the repository root. The .gitignore already contains rule
for blocking common data directories, which can be created by VictoriaMetrics components at the repository root.
2023-10-12 20:14:12 +02:00
Aliaksandr Valialkin
964e6ccc17 docs/Single-server-VictoriaMetrics.md: remove question mark added by accident in the commit 3d5d62e38a 2023-10-12 20:01:18 +02:00
Aliaksandr Valialkin
930a36df40 app/vmui: small UX enhancements
- Reduce vertical space usage, so more information is available on the screen without the need to scroll.
- Show information for lines with higher values at the top of the legend under the graph.
  This should simplify graph analysis when it contains many lines.
2023-10-12 19:54:19 +02:00
Aliaksandr Valialkin
0b66840520 vendor: run make vendor-update 2023-10-12 11:17:53 +02:00
Aliaksandr Valialkin
d984598e30 deployment/docker: update Go builder from Go1.21.1 to Go1.21.3
See https://github.com/golang/go/issues?q=milestone%3AGo1.21.2+label%3ACherryPickApproved
and https://github.com/golang/go/issues?q=milestone%3AGo1.21.3+label%3ACherryPickApproved
2023-10-12 09:41:41 +02:00
Aliaksandr Valialkin
31f7ef0811 app/{vmselect,vlselect}: enable caching of static contents from /vmui/static/ folder at client side
This should improve repated VMUI page load times on slow networks

See https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/
2023-10-12 09:33:40 +02:00
Zakhar Bessarab
8c4ce186d3 docs/vmbackupmaanger: reference permanent objects deletion (#5157)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-10-11 12:38:05 +02:00
hagen1778
d43566605b dasbhoards: fix vminsert/vmstorage/vmselect metrics filtering
Fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used
to display data from many sub-clusters with unique job names.
Before, only one specific job could have been accounted for component-specific panels,
instead of all available jobs for the component.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-11 12:09:04 +02:00
Zakhar Bessarab
2fc7e9f47e lib/backup: add -deleteAllObjectVersions command-line flag (#5147)
New flag enforces removal of all versions of the object in remote object storage.

See:
- https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5121
- https://docs.victoriametrics.com/vmbackup.html#permanent-deletion-of-objects-in-s3-compatible-storages
2023-10-10 14:13:23 +02:00
Yury Molodov
6dc5306c9b vmui: transfer Top Queries time interval #5097 (#5145)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5097
2023-10-10 13:58:39 +02:00
Dmytro Kozlov
a6d06474f4 docs/managed-victoriametrics: update documentation (#5135)
Co-authored-by: Ivan Yatskevich <ivan@yatskevich.com>
2023-10-10 13:48:22 +02:00
Nikolay
1f91f22b5f app/vmselect: reduce lock contention for heavy aggregation requests (#5119)
reduce lock contention for heavy aggregation requests
previously lock contetion may happen on machine with big number of CPU due to enabled string interning. sync.Map was a choke point for all aggregation requests.
Now instead of interning, new string is created. It may increase CPU and memory usage for some cases.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087
2023-10-10 13:45:20 +02:00
Haleygo
2aa0f5fc41 vmalert: add evalAlignment for rule group and fix evalutaion timstamp (#5066)
* vmalert: add `query_time_alignment` for rule group

1. add `eval_alignment` attribute for group which by default is true. So group rule query stamp will be aligned with interval and propagated to ALERT metrics and the messages for alertmanager;
2. deprecate `datasource.queryTimeAlignment` flag.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049
2023-10-10 12:41:19 +02:00
Dmytro Kozlov
244c887825 app/vmalert: hide sensetive info in the vmalert (#5059)
Strip sensitive information such as auth headers or passwords from datasource, remote-read, 
remote-write or notifier URLs in log messages or UI. This behavior is by default and is controlled via 
`-datasource.showURL`, `-remoteRead.showURL`, `remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. 

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044
2023-10-10 11:40:27 +02:00
hagen1778
b57756734e docs: follow-up after 636d4ea196
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-10 11:25:23 +02:00
Yury Molodov
c5044cdba9 vmui: enhancement of autocomplete feature (#5051)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4993
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3006
2023-10-10 10:38:08 +02:00
Artem Navoiev
636d4ea196 docs: mention the free trial for enterprise
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-10 01:31:48 -07:00
Zakhar Bessarab
7183621d84 docs/enterprise: fix rendering of example alert (#5137)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-10-09 18:57:25 +04:00
Artem Navoiev
dc79b25771 docs: fix markdown for k8s managed guide
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-09 16:25:45 +02:00
hagen1778
cb13ce9147 docs: add YT video to "How to reduce expenses on monitoring"
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-09 11:14:51 +02:00
Artem Navoiev
b3cc22b159 docs: update the license flags description
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-05 10:41:30 -07:00
Artem Navoiev
a8345bb1b9 improve .gitignore
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-05 19:30:05 +02:00
hagen1778
7cf615a73b app/(vminsert|vmagent): fix label names for newrelic metrics
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-05 14:59:02 +02:00
Dmytro Kozlov
f60c08a7bd app/(vminsert|vmagent): add support for new relic infrastructure agent (#4712)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
2023-10-05 14:39:51 +02:00
Zakhar Bessarab
cc7d5b7bab docs: add example service configuration for windows (#5129)
Use service configuration example from https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3781#issuecomment-1424671624

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-10-05 13:55:27 +02:00
Denys Holius
3d5d62e38a docs: update social links 2023-10-05 13:53:08 +02:00
Github Actions
b576acff5e Automatic update operator docs from VictoriaMetrics/operator@26d1b3b (#5130) 2023-10-05 12:43:28 +02:00
Aliaksandr Valialkin
09e6606b38 docs/VictoriaLogs/Roadmap.md: add integration with Grafana 2023-10-05 11:40:45 +02:00
Zakhar Bessarab
490694bb01 docs/enterprise: add info about license flags (#5085)
* docs/enterprise: add info about license flags

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/enterprise: add more examples on how to run enterprise components

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/enterprise: address review feedback

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/enterprise: specify release version

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/enterprise: fix image tag in example

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/enterprise: add docker-compose example, fix secret name

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-10-05 13:05:57 +04:00
Artem Navoiev
8b90bb3bc0 docs: victorialogs change the doc ordering.2
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-05 10:40:55 +02:00
Artem Navoiev
c606d3efde docs: victorialogs change the doc ordering
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-05 10:24:19 +02:00
Github Actions
590bd50df4 Automatic update operator docs from VictoriaMetrics/operator@5da4bf6 (#5127) 2023-10-04 18:51:53 +02:00
Aliaksandr Valialkin
10677af9df docs/VictoriaLogs/CHANGELOG.md: add release date for v0.4.1-victorialogs 2023-10-04 17:57:04 +02:00
Github Actions
cd0683a3af Automatic update operator docs from VictoriaMetrics/operator@9fb79f4 (#5124) 2023-10-04 17:03:50 +02:00
Aliaksandr Valialkin
ade64b5783 docs/vmbackup.md: mention that -filestream.disableFadvise command-line flag can be used for reducing CPU usage on systems with big number of CPU cores
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5120

This is a follow-up for 75dd7b30ba
2023-10-04 16:38:15 +02:00
Zakhar Bessarab
d863d11b5a docs/vmbackupmanager: add missing step for restore flow in k8s (#5116)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-10-04 16:34:18 +02:00
Aliaksandr Valialkin
75dd7b30ba lib/filestream: add -filestream.disableFadvise syscall for unconditional disabling of fadvise syscall
This may be needed in rare cases when performing backups on systems with big number of CPU cores
and big value passed to -concurrency command-line flag.

See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5120
2023-10-04 16:19:46 +02:00
Artem Navoiev
94bdc91a56 docs: fix wieghts for logs folders
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-04 15:48:54 +02:00
Artem Navoiev
ce17f81b00 docs: remove id clashes for pages, properly cover victorialogs changelog page (#5123)
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-04 15:46:28 +02:00
Artem Navoiev
c21863a956 add hugo front matter for operator docs (#5122)
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-10-04 15:27:40 +02:00
Aliaksandr Valialkin
61935e3387 deployment/docker: update VictoriaLogs from v0.4.0-victorialogs to v0.4.1-victorialogs
See https://docs.victoriametrics.com/VictoriaLogs/CHANGELOG.html#v041
2023-10-04 14:43:42 +02:00
f41gh7
960f72368a cut v0.4.1-victorialogs release 2023-10-04 14:29:41 +02:00
Aliaksandr Valialkin
2dd9b69101 docs/Single-server-VictoriaMetrics.md: follow-up for 612ec2df6b
Move the description of JSON line format into a separate chapter

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5083
2023-10-03 17:47:21 +02:00
hagen1778
de651165bd alerting: account for vmauth component for alerts ServiceDown and TooManyRestarts
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-03 16:45:33 +02:00
Alexander Marshalov
7b35eaa853 hide deprecated docs from menu (#5095) 2023-10-03 14:28:33 +02:00
Zakhar Bessarab
e330ab7ad9 docs/victorialogs: changelog followup for 650d72f3 (#5114)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-10-03 14:26:03 +02:00
Zakhar Bessarab
b296c8e95a lib/logstorage: fix free space check (#5113)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-10-03 12:39:41 +02:00
hagen1778
612ec2df6b docs: add more details explaining JSON line format
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5083

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-03 11:10:49 +02:00
Aliaksandr Valialkin
3d99d4b6c0 deployment: update VictoriaLogs from v0.3.0-victorialogs to v0.4.0-victorialogs
See https://docs.victoriametrics.com/VictoriaLogs/CHANGELOG.html#v040
2023-10-03 02:33:47 +02:00
Aliaksandr Valialkin
53d2e9ac98 docs/VictoriaLogs/CHANGELOG.md: cut v0.4.0-victorialogs 2023-10-03 01:46:33 +02:00
Aliaksandr Valialkin
861d91dbea deployment: update VictoriaMetrics from v1.93.5 to v1.94.0
See https://docs.victoriametrics.com/CHANGELOG.html#v1940
2023-10-03 01:40:05 +02:00
Aliaksandr Valialkin
f13a96f42c docs/CHANGELOG.md: cut v1.94.0 2023-10-02 22:33:35 +02:00
Aliaksandr Valialkin
0387b0b39f docs/Articles.md: add a video from GopherCon 2023 for Writing a TSDB from scratch: performance optimization talk 2023-10-02 22:24:57 +02:00
Aliaksandr Valialkin
5c28923c11 deployment/docker: update Alpine from 3.18.3 to 3.18.4
See https://alpinelinux.org/posts/Alpine-3.18.4-released.html
2023-10-02 21:52:38 +02:00
Aliaksandr Valialkin
5eac0cdf42 vendor: run make vendor-update 2023-10-02 21:49:16 +02:00
Aliaksandr Valialkin
55e9a9e3a0 app/{vmselect,vlselect}: run make vmui-update vmui-logs-update 2023-10-02 21:44:03 +02:00
Yury Molodov
f39045eca6 vmui: add storage for query history (#5022)
* vmui: add storage for query history

* docs/vmui: add storage for query history
2023-10-02 21:41:03 +02:00
Roman Khavronenko
a4bd73ec7e lib/promscrape: make concurrency control optional (#5073)
* lib/promscrape: make concurrency control optional

Before, `-maxConcurrentInserts` was limiting all calls to `promscrape.Parse`
function: during ingestion and scraping. This behavior is incorrect.
Cmd-line flag `-maxConcurrentInserts` should have effect onl on ingestion.

Since both pipelines use the same `promscrape.Parse` function, we extend it
to make concurrency limiter optional. So caller can decide whether concurrency
should be limited or not.

This commit makes c53b5788b4
obsolete.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* Revert "dashboards: move `Concurrent inserts` panel to Troubleshooting section"

This reverts commit c53b5788b4.

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-02 21:32:11 +02:00
Yury Molodov
133425f0c0 vmui: improve the appearance of the trace (#5091) 2023-10-02 21:24:03 +02:00
Dmytro Kozlov
34961dd4b8 app/vmagent: fix check of the DataDog agent path requests when requests have trailing slashes (#5106)
* app/vmagent: fix check of the DataDog agent path requests when requests have trailing slashes

* app/vmagent: fix CHANGELOG.md description

* wip

* wip

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-10-02 21:18:03 +02:00
Aliaksandr Valialkin
859977d591 Revert "lib/promscrape: add metric vm_promscrape_scrapes_skipped_total (#5074)"
This reverts commit 74301cdbf5.

Reason for revert:

vmagent already provides better approach for detecting slow scrape targets via the following query:

    scrape_duration_seconds / scrape_timeout_seconds > 1

This query depends on automatically generated per-target metrics.
See https://docs.victoriametrics.com/vmagent.html#automatically-generated-metrics for more details.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5074
2023-10-02 20:59:56 +02:00
Aliaksandr Valialkin
71668637ce app/vmselect/promql: follow-up for 896c85a4a4
- Clarify the description of the change at docs/CHANGELOG.md
- Make sure that bitmap_*(X, NaN) returns NaN

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4996
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5021
2023-10-02 20:08:26 +02:00
Aliaksandr Valialkin
10f297e1e7 docs/Cluster-VictoriaMetrics.md: increase the minimum supported version of Go builder from 1.18 to 1.20
See the related commit 3da493ff62
2023-10-02 19:34:17 +02:00
Aliaksandr Valialkin
8dce4eb189 lib/logstorage: follow-up for 94627113db
- Move uniqueFields from rows to blockStreamMerger struct.
  This allows localizing all the references to uniqueFields inside blockStreamMerger.mustWriteBlock(),
  which should improve readability and maintainability of the code.

- Remove logging of the event when blocks cannot be merged because they contain more than maxColumnsPerBlock,
  since the provided logging didn't provide the solution for the issue with too many columns.
  I couldn't figure out the proper solution, which could be helpful for end user,
  so decided to remove the logging until we find the solution.

This commit also contains the following additional changes:

- It truncates field names longer than 128 chars during logs ingestion.
  This should prevent from ingesting bogus field names.
  This also should prevent from too big columnsHeader blocks,
  which could negatively affect search query performance,
  since columnsHeader is read on every scan of the corresponding data block.

- It limits the maximum length of const column value to 256.
  Longer values are stored in an ordinary columns.
  This helps limiting the size of columnsHeader blocks
  and improving search query performance by avoiding
  reading too long const columns on every scan of the corresponding data block.

- It deduplicates columns with identical names during data ingestion
  and background merging. Previously it was possible to pass columns with duplicate names
  to block.mustInitFromRows(), and they were stored as is in the block.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4762
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4969
2023-10-02 19:19:08 +02:00
Dmytro Kozlov
60694606f1 docs: add clarification of the retention filter usage (#5103)
docs: add clarification of the retention filter usage

Updated documentation regarding retention filter usage if duration is set lower than
`-retentionPeriod` flag value.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
2023-10-02 17:17:16 +02:00
Roman Khavronenko
74301cdbf5 lib/promscrape: add metric vm_promscrape_scrapes_skipped_total (#5074)
* lib/promscrape: add metric `vm_promscrape_scrapes_skipped_total`

add metric `vm_promscrape_scrapes_skipped_total`to show whether vmagent skips the scrapes.
This could happen if vmagent is overloaded or target is responding too slow for configured `scrape_interval`.

The follow-up commit should add a corresponding alerting rule and panel to vmagent dashboard.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* deployment/docker: add `TooManyScrapeSkips` alerting rule for vmagent

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* dashboards: add panels `Scrape duration 0.99 quantile` and `Skipped scrapes` to vmagent dashboard

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-02 17:12:12 +02:00
Aliaksandr Valialkin
7b33a27874 lib/logstorage: follow-up for 8a23d08c21
- Compare the actual free disk space to the value provided via -storage.minFreeDiskSpaceBytes
  directly inside the Storage.IsReadOnly(). This should work fast in most cases.
  This simplifies the logic at lib/storage.

- Do not take into account -storage.minFreeDiskSpaceBytes during background merges, since
  it results in uncontrolled growth of small parts when the free disk space approaches -storage.minFreeDiskSpaceBytes.
  The background merge logic uses another mechanism for determining whether there is enough
  disk space for the merge - it reserves the needed disk space before the merge
  and releases it after the merge. This prevents from out of disk space errors during background merge.

- Properly handle corner cases for flushing in-memory data to disk when the storage
  enters read-only mode. This is better than losing the in-memory data.

- Return back Storage.MustAddRows() instead of Storage.AddRows(),
  since the only case when AddRows() can return error is when the storage is in read-only mode.
  This case must be handled by the caller by calling Storage.IsReadOnly()
  before adding rows to the storage.
  This simplifies the code a bit, since the caller of Storage.MustAddRows() shouldn't handle
  errors returned by Storage.AddRows().

- Properly store parsed logs to Storage if parts of the request contain invalid log lines.
  Previously the parsed logs could be lost in this case.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4737
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4945
2023-10-02 16:52:23 +02:00
Aliaksandr Valialkin
10d9214980 lib/logstorage: run up to GOMAXPROCS flushers of old in-memory parts to disk
One flusher isn't enough under high data ingestion rate.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4775
2023-10-02 16:20:59 +02:00
Github Actions
256d800200 Automatic update operator docs from VictoriaMetrics/operator@44bdc27 (#5104) 2023-10-02 15:50:18 +02:00
Github Actions
62de314510 Automatic update operator docs from VictoriaMetrics/operator@c7125bd (#5102) 2023-10-02 14:50:08 +02:00
hagen1778
7e744f86cb app/vlinsert/loki: make fmt
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-10-02 14:07:27 +02:00
Aliaksandr Valialkin
da9ef90277 lib/logstorage: assist merging in-memory parts at data ingestion path if their number starts exceeding maxInmemoryPartsPerPartition
This is a follow-up for 9310e9f584 , which removed data ingestion pacing.
This can result in uncontrolled growth of in-memory parts under high data ingestion rate,
which, in turn, can result in unbounded RAM usage, OOM crashes and slow query performance.

While at it, consistently reset isInMerge field for parts passed to mergeParts() before returning from this function.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4775
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4828
2023-10-02 08:24:58 +02:00
Aliaksandr Valialkin
d41841c0c9 lib/{mergeset,storage}: consistently reset isInMerge field in parts passed to mergeParts() before returning from the function
While at it consistently check that the isInMerge field is set in all the parts passed to mergeParts()
2023-10-02 08:05:29 +02:00
Aliaksandr Valialkin
bf6ebc86fd docs/VictoriaLogs/CHANGELOG.md: remove duplicate lines about vl_http_request_duration_seconds metric
This is a follow-up after 8a23d08c21

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4945
2023-10-01 23:32:05 +02:00
Aliaksandr Valialkin
3ca6fea858 lib/{mergeset,storage}: perform at most one assisted merge per each call to addRows/addItems
This should reduce tail latency during data ingestion.

This shouldn't slow down data ingestion in the worst case, since assisted merges are spread among
distinct addRows/addItems calls after this change.
2023-10-01 22:19:46 +02:00
Aliaksandr Valialkin
7373d04d54 docs/Single-server-VictoriaMetrics.md: refer to active queries and top queries pages at VMUI instead of refering to the corresponding HTTP endpoints
"Active queries" and "Top queries" pages at VMUI are user-friendly than the corresponding HTTP endpoints
2023-10-01 21:54:37 +02:00
Aliaksandr Valialkin
7af9be92cc docs/vmalert.md: refer to -evaluationInterval command-line flag instead of evaluation_interval option, which isnt supported by vmalert
This is follow-up for 5c42c1218a
2023-10-01 21:54:29 +02:00
Aliaksandr Valialkin
cc3b1267f5 docs/Troubleshooting.md: describe how to optimize SLI/SLO queries with long lookbehind windows 2023-10-01 21:54:23 +02:00
Aliaksandr Valialkin
5e49b72126 docs/CHANGELOG.md: follow-up for f0e33700fc
Mention that the statistic inaccuracy is related to cardinality explorer
2023-10-01 21:33:31 +02:00
Aliaksandr Valialkin
dfb82bd126 deployment/docker/docker-compose-cluster.yml: follow-up for 4d1b572f46
Grafana and vmalert now depend on vmauth instead of individual vmselect nodes

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5057
2023-10-01 21:20:50 +02:00
Aliaksandr Valialkin
74ca4d09e7 deployment: update VictoriaMetrics version from v1.93.4 to v1.93.5
See https://docs.victoriametrics.com/CHANGELOG.html#v1935
2023-10-01 21:15:12 +02:00
Aliaksandr Valialkin
859859aa1c app/vmagent: follow-up for cfef814750
- Properly handle /insert/multitenant/api/put url for opentsdb handler at vmagent
- Document that the bug has been introduced in v1.93.2 at docs/CHANGELOG.md
- Add a link to multitenant url docs in bugfix description

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5061
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4910
2023-10-01 21:09:32 +02:00
Dmytro Kozlov
896c85a4a4 app/vmselect: fix bitmap_*() functions behavior (#5021)
Related issue: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4996

Signed-off-by: dmitryk-dk d.kozlov@victoriametrics.com

Signed-off-by: dmitryk-dk d.kozlov@victoriametrics.com
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-09-29 12:03:01 +02:00
Zakhar Bessarab
94627113db lib/logstorage: prevent from panic during background merge (#4969)
* lib/logstorage: prevent from panic during background merge

Fixes panic during background merge when resulting block would contain more columns than maxColumnsPerBlock.
Buffered data will be flushed and replaced by the next block.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4762
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/logstorage: clarify field description and comment

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-29 11:58:20 +02:00
Zakhar Bessarab
8a23d08c21 lib/logstorage: switch to read-only mode when running out of disk space (#4945)
* lib/logstorage: switch to read-only mode when running out of disk space

Added support of `--storage.minFreeDiskSpaceBytes` command-line flag to allow graceful handling of running out of disk space at `--storageDataPath`.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4737
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/logstorage: fix error handling logic during merge

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/logstorage: fix log level

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-09-29 11:55:38 +02:00
Zakhar Bessarab
9310e9f584 lib/logstorage/datadb: remove parts merge cond (#4828)
It was added in order to limit number of goroutines performing assisted merges during ingestion.
It turned out that blocking ingestion goroutines lower ingestion performance and limits overall ingestion around 40k items per seconds because of lock contention.
Removing parts merge sync.Cond allows to remove lock contention at write path and significantly improves write performance.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4775

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-29 11:50:14 +02:00
Dmytro Kozlov
f0e33700fc vmui: update information about tsdb usage in cluster version (#5004)
* vmui: update information about tsdb usage in cluster version

* vmui: cleanup

* vmui: add CHANGELOG.md

* vmui: cleanup

* vmui: update logic, move information to the visible place

* app/vmui: remove values fetch, update documentation for cardinality explorer

* app/vmui: update CHANGELOG.md
2023-09-29 11:47:45 +02:00
Zakhar Bessarab
0adec48182 doc: address review feedback
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-28 06:21:03 -07:00
Zakhar Bessarab
1a834b4210 doc: mention InfluxDB v2 HTTP API support
Address: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5076
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-28 06:21:03 -07:00
Github Actions
af53799332 Automatic update operator docs from VictoriaMetrics/operator@958ce2b (#5070) 2023-09-27 09:04:33 +08:00
hagen1778
c53b5788b4 dashboards: move Concurrent inserts panel to Troubleshooting section
Moved because this panel is related to both: scraped and ingested data.
Before, it could have give a misleading impression that it is related to ingested metrics only.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-26 14:26:40 +02:00
Alexander Marshalov
34a9d1f818 fixed ingestion via multitenant url for opentsdbhttp (#5061) (#5064) 2023-09-26 11:18:34 +02:00
Roman Khavronenko
4d1b572f46 Docker add vmauth (#5057)
* docker-compose: add vmauth to cluster env

vmauth acts as a balancer and used as an example of how to interconnect
VM components via vmauth.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* docker-compose: add vmauth to cluster env

vmauth acts as a balancer and used as an example of how to interconnect
VM components via vmauth.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-09-26 10:50:10 +02:00
Aliaksandr Valialkin
f897d5241d docs/vmagent.md: make VictoriaMetrics remove_write protocol more visible by mentioning it at the top of the page 2023-09-25 17:42:16 +02:00
Aliaksandr Valialkin
223ef96198 lib/storage: remove unused atomicSetBool function after 717c53af27 2023-09-25 17:37:24 +02:00
Aliaksandr Valialkin
89a5e27216 docs: run make docs-sync after 8e722e10ee 2023-09-25 17:35:29 +02:00
Aliaksandr Valialkin
8e722e10ee docs/CaseStudies.md: add Criteo case study
This is a follow-up for bdbe616408

See https://medium.com/criteo-engineering/victoriametrics-a-prometheus-remote-storage-solution-57081a3d8e61
2023-09-25 17:34:19 +02:00
Aliaksandr Valialkin
15dfd94f3b lib/storage: make it clear that the number of big merge workers always equals to 4
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4915#issuecomment-1733922830
2023-09-25 17:15:45 +02:00
Aliaksandr Valialkin
717c53af27 lib/storage: stop exposing vm_merge_need_free_disk_space metric
This metric confuses users and has no any useful information.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/686#issuecomment-1733844128
2023-09-25 16:52:39 +02:00
Aliaksandr Valialkin
e453069dcd app/vmselect/promql: run make fmt after 3b9605dba5 2023-09-25 16:16:14 +02:00
Aliaksandr Valialkin
3b9605dba5 app/vmselect/promql: do not sort q1 or q2 results
This makes sure that `q2` series are returned after `q1` series in the same way as Prometheus does

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4763
2023-09-25 16:14:16 +02:00
Aliaksandr Valialkin
a740159541 app/vmselect/promql: completely substitute median_over_time() WITH template with regular median_over_time() rollup function
This is a follow-up for 34d7a670d0

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5034
2023-09-25 15:28:12 +02:00
Zakhar Bessarab
34d7a670d0 app/vmselect/promql: add implementation of median_over_time for rollup functions list (#5042)
`median_over_time` is handled by predefined WITH template in MetricsQL library which translates it to `quantile_over_time(0.5)`
This makes it impossble to use `median_over_time` as a usual rollup function for `aggr_over_time`.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5034

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-25 14:01:00 +02:00
Roman Khavronenko
ec50375991 docs/changelog: add link to sandbox (#5050)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-25 14:00:41 +02:00
Github Actions
33c17a5a2b Automatic update operator docs from VictoriaMetrics/operator@587ea54 (#5054) 2023-09-25 12:42:05 +02:00
hagen1778
dd98385a10 docs/articles: add link to "How to reduce expenses on monitoring" slides
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-22 17:21:32 +02:00
Aliaksandr Valialkin
72960732c2 docs/Cluster-VictoriaMetrics.md: update -help output for enterprise components 2023-09-22 13:51:32 +02:00
Zakhar Bessarab
8d99c12a7d lib/promscrape/discovery/kubernetes: supress context.Cancelled error in logs (#5048)
lib/promscrape/discovery/kubernetes: supress context.Cancelled error in logs

It is possible that context.Cancelled will appear after k8s watcher was closed due to reload(see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4850).

Logging an error misinforms user and looks like vmagent discovery will stop working even though this does not affect discovery.

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-22 13:01:33 +02:00
Aliaksandr Valialkin
3140ef7261 lib/storage: log fatal error inside searchMetricName() instead of propagating it to the caller
This simplifies the code a bit at searchMetricName() and searchMetricNameWithCache() call sites

This is a result of investigating https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4972
2023-09-22 11:41:06 +02:00
Zakhar Bessarab
455077cd67 docs/vmbackup: update docs for different authentication options, add examples (#5046)
Updates: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5023

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-22 11:19:19 +02:00
Github Actions
71a17dfcbe Automatic update operator docs from VictoriaMetrics/operator@9d65e09 (#5040) 2023-09-21 13:20:55 +02:00
Zakhar Bessarab
760cdcec68 lib/backup: fix issue with inconsistent copying of appliedRetention.txt (#5027)
* lib/backup: fix issue with inconsistent copying of appliedRetention.txt

appliedRetention.txt can be modified in place, so it should be always copied just the same as parts.json

Updates: https://github.com/victoriaMetrics/victoriaMetrics/issues/5005
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs: add changelog entry for appliedRetention.txt copying fix

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-21 11:25:19 +02:00
Aliaksandr Valialkin
f16cbc726e app/vmauth: consistently use '%w' for formatting errors in fmt.Errorf() 2023-09-21 11:04:13 +02:00
Roman Khavronenko
462c918251 app/vmauth: update config reload routine (#5019)
* expose metrics `vmauth_config_last_reload_*` for tracking the state of config reloads, similarly to vmagent/vmalert components.
* do not print logs like `SIGHUP received...` once per configured `-configCheckInterval` cmd-line flag. This log will be printed only if config reload was invoked manually.
*  prevent configuration reloading if there were no changes in config. This improves memory usage when `-configCheckInterval` cmd-line flag is configured and config has extensive list of regexp expressions requiring additional memory on parsing.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-20 15:04:52 +02:00
hagen1778
0c60228fea dashboards/victoriametrics: account for instance filter in annotations
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-20 14:50:03 +02:00
Zakhar Bessarab
bea3431ed1 lib/storage/partition: add check to ensure parts exist on disk (#5017)
* lib/storage/partition: add check to ensure parts exist on disk

If part exists in parts.json but is missing on disk there will be a misleading error similar to "unexpected number of substrings in the part name".

This change forces verification of part existence and throws a correct error in case it is missing on disk.

Such issue can be result of https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5005 or disk corruption.

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/storage/partition: use filepath.Join instead of string concatenation

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/storage/partition: add action points for error message

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* all: add a check for missing part in lib/mergeset and lib/logstorage

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-09-19 11:17:41 +02:00
Aliaksandr Valialkin
28aed4d098 docs/CHANGELOG.md: publish changes for v1.93.5 2023-09-19 10:50:25 +02:00
hagen1778
bdbe616408 docs/articles: add https://medium.com/criteo-engineering/victoriametrics-a-prometheus-remote-storage-solution-57081a3d8e61
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-19 10:03:40 +02:00
hagen1778
7b99781b09 build(deps): revert version change for codecov/codecov-action from 4 to 3
https://github.com/codecov/codecov-action/issues/1089
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-19 09:19:42 +02:00
Aliaksandr Valialkin
582f1f8fda docs/CHANGELOG.md: clarify the description of bugfixes at f7dda12b4d and b6ad581b45
This is a follow-up for 8b01bc4a5c

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4999
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5009
2023-09-19 00:22:32 +02:00
Aliaksandr Valialkin
e9647bb669 app/vlinsert: follow-up for d570763c91
- Switch from summary to histogram for vl_http_request_duration_seconds metric.
  This allows calculating request duration quantiles across multiple hosts
  via histogram_quantile(0.99, sum(vl_http_request_duration_seconds_bucket) by (vmrange)).
- Take into account only successfully processed data ingestion requests
  when updating vl_http_request_duration_seconds histogram.
  Failed requests are ignored, since they may significantly skew measurements.
- Clarify the description of the change at docs/VictoriaLogs/CHANGELOG.md.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4934
2023-09-19 00:02:43 +02:00
Aliaksandr Valialkin
30a645cd82 lib/promscrape/discovery/kubernetes: follow-up after 03fece44e0
- Properly update vm_promscrape_discovery_kubernetes_url_watchers
  and vm_promscrape_discovery_kubernetes_group_watchers metrics after config changes

- Properly stop goroutine responsible for recreating scrapeWorks after the corresponding urlWatcher is stopped

- Log the event when urlWatcher is stopped in order to simplify debugging

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4850
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4861
2023-09-18 23:23:45 +02:00
Aliaksandr Valialkin
03fece44e0 lib/promscrape/discovery/kubernetes: wait for 10 seconds before checking whether the urlWatcher must be stopped
This should prevent from excess urlWatcher churn on config reload, since it leads to removal of all the apiWatchers
before creating new apiWatchers. So, every config reload would lead to stopping of all the previous urlWatchers
and starting new urlWatchers.

The new logic gives 10 seconds for config reload before stopping unused urlWatchers.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4850
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4861
2023-09-18 17:45:12 +02:00
Aliaksandr Valialkin
76af32d869 lib/promscrape/discovery/kubernetes: follow-up after eeb862f3ff
- Move the bugfix description to the correct place in docs/CHANGELOG.md
- Prevent from logging of 'context canceled' errors after the url watcher is stopped,
  since these errors are expected and may confuse users.
- Remove unused urlWatcher.refCount field.
- Remove unused urlWatcher.close() method.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4850
2023-09-18 17:06:39 +02:00
Aliaksandr Valialkin
4d01bc6d52 lib/backup: properly copy parts.json files inside indexdb directory additional to data directory
This is a follow-up for 264ffe3fa1

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5005
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5006
2023-09-18 16:16:50 +02:00
Aliaksandr Valialkin
f93a7b8457 lib/backup/common: consistently use canonical path with / directory separators at Part.Path
Previously Part.Path could contain `\` directory separators on Windows OS,
which could result in incorrect filepaths generation when making backups at object storage.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704

This is a follow-up for f2df8ad480
2023-09-18 16:15:34 +02:00
Aliaksandr Valialkin
f267733d9a app/vlinsert/insertutils: cosmetic changes after 8d3e574c31 2023-09-18 11:59:46 +02:00
dependabot[bot]
f32711e614 build(deps): bump codecov/codecov-action from 3 to 4 (#5011)
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3 to 4.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/codecov/codecov-action/compare/v3...v4)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-09-18 09:27:12 +02:00
Nikolay
8b01bc4a5c docs: reflect recent changes at change logs (#5015) 2023-09-18 08:22:10 +02:00
crossoverJie
d570763c91 app/vlinsert: Add vl_http_request_duration_seconds metrics (#4934) 2023-09-16 15:10:29 +02:00
Zakhar Bessarab
eeb862f3ff lib/promscrape/discovery/kubernetes: fix leaking api watcher (#4861)
* lib/promscrape/discovery/kubernetes: fix leaking api watcher

goroutine which was polling k8s API had no execution control. This leaded to leaking goroutines during config reload.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4850
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/promscrape/discovery/kubernetes: use reference counting for urlWatcher cleanup

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/promscrape/discovery/kubernetes: remove waitgroup sync for goroutines polling API server

This is unnecessary since context will is cancelled and new requests will not be sent. Also, using waitgroup will increase time required to perform reload which might result in missed scrapes.

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/promscrape/discovery/kubernetes: clarify comment

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* Apply suggestions from code review

* lib/promscrape/discovery/kubernetes: address review feedback

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-09-15 19:40:13 +02:00
Konstantin
f7dda12b4d app/vmselect: return +Inf as null in graphite render api (#5009)
Signed-off-by: Konstantin Kulikov <k.kulikov2@gmail.com>
2023-09-15 19:23:59 +02:00
faceair
b6ad581b45 lib/storage: remove ForceMergeAllParts internal loop (#4999)
Signed-off-by: faceair <git@faceair.me>
2023-09-15 19:04:54 +02:00
Zakhar Bessarab
264ffe3fa1 lib/backup: force copying of parts.json (#5006)
* lib/backup: force copying of parts.json

Copying of parts.json is required because `part.key()` comparison can create same key value for files with different contents. This will result in inconsistent backup being created or restored.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5005
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/backup: ensure parts.json is only copied once

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-09-15 19:04:38 +02:00
Zakhar Bessarab
8d3e574c31 app/vlinsert: add flag to limit amount of fields per line (#4976)
Adding limit on ingestion allows to avoid issues like this one https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4762
Such issues are often caused by misconfigurtion on log persing/ingestion side and preventing such rows from being ingested allows to avoid performance implications created by storing such log rows.

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-15 15:18:38 +02:00
Zakhar Bessarab
2a362e7397 docs: add changelog entry for downsampling.period and dedup.minScrapeInterval verification (#5000)
* docs: add changelog entry for downsampling.period and dedup.minScrapeInterval verification

- added changelog entry
- documented requirements for dedup.minScrapeInterval and downsampling.period being multiples of each other

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs: `make docs-sync`

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-15 15:14:16 +02:00
Artem Navoiev
f04eb762c1 add annotation to VictoriaLogs dashboards - restarts and version change (#5008)
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-09-15 15:12:23 +02:00
Dmytro Kozlov
d5f9619984 vmagent: add validation of MetricsQL functions (#4991)
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-09-15 13:15:23 +02:00
Aliaksandr Valialkin
a09c680170 lib/storage: handle fatal errors inside indexSearch.getTSIDByMetricID() instead of returning them to the caller
This simplifies the code a bit at caller side
2023-09-15 11:55:42 +02:00
Github Actions
9f3f085d7f Automatic update operator docs from VictoriaMetrics/operator@49d245d (#5002) 2023-09-13 22:33:00 +08:00
Github Actions
d6cdfd231a Automatic update operator docs from VictoriaMetrics/operator@a9c54f4 (#4998) 2023-09-13 06:03:46 +08:00
Aliaksandr Valialkin
cbcfbaf488 .github/workflow: remove automatic creation of pull request at github.com/VictoriaMetrics/ops repository on new tag
This automation doesn't work as intended on LTS releases, bugfix releases and custom releases,
since it assumes every new tag is related only to new release.

Also the github.com/VictoriaMetrics/ops repository may contain manually set custom tags
for VictoriaMetrics components (for example, for testing the latest bugfixes or features),
which are overwritten by the generated pull request.

The way to go is to manually update tags at github.com/VictoriaMetrics/ops repository when needed
instead of trying to automate this process.
2023-09-11 23:50:44 +02:00
Alexander Marshalov
eebef296d4 Updates in release guide (#4956)
new helm release flow and operator release flow

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-09-11 09:55:30 +02:00
Github Actions
9902418524 Automatic update operator docs from VictoriaMetrics/operator@21f7203 (#4988) 2023-09-11 09:40:46 +02:00
Aliaksandr Valialkin
151f363552 docs/CHANGELOG.md: document v1.87.9 2023-09-10 21:41:23 +02:00
Aliaksandr Valialkin
a7409500fc deployment/docker: update VictoriaMetrics from v1.93.3 to v1.93.4
See https://docs.victoriametrics.com/CHANGELOG.html#v1934
2023-09-10 19:49:59 +02:00
Aliaksandr Valialkin
bb8eda0b0f docs/CHANGELOG.md: document v1.93.4 2023-09-10 19:47:38 +02:00
Artem Navoiev
fef0c232e8 Update VL daashboard. Add Resource Section, add ds and job filters, a… (#4981)
* Update VL daashboard. Add Resource Section, add ds and job filters, add metric collection in docker compose from victorialogs, fix networkigs usage in docker compose

Signed-off-by: Artem Navoiev <tenmozes@gmail.com>

* add vl dashboard to docker compose

Signed-off-by: Artem Navoiev <tenmozes@gmail.com>

* add vl dashboard to docker compose

Signed-off-by: Artem Navoiev <tenmozes@gmail.com>

---------

Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-09-10 15:04:07 +02:00
Aliaksandr Valialkin
9bdcf483a7 docs/vmrestore.md: run make docs-sync after bf4936bbf0 2023-09-08 23:28:44 +02:00
Aliaksandr Valialkin
0bbc6a5b43 app/vmagent/remotewrite: fix data race when extra labels are added to samples before sending them to multiple remote storage systems
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4972
2023-09-08 23:24:00 +02:00
Aliaksandr Valialkin
a315694dd9 app/vmauth: add ability to specify response status codes for retrying requests during load-balancing
Response status codes for retrying can be specified via retry_status_codes list

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4893
2023-09-08 23:23:15 +02:00
Zakhar Bessarab
bf4936bbf0 docs: sync description for license flags (#4977)
- update eula flag to add deprecation notice
- add new license flags description

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-08 17:09:34 +04:00
Roman Khavronenko
6351d07da8 vmalert: correctly add duplicated params to the query (#4955)
Fix the bug when Group's `params` fields with multiple values were
overriding each other instead of adding up.
The bug was introduced in this commit eccecdf177
 starting from v1.91.1 https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.91.1

 https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4908

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-08 09:32:48 +02:00
Aliaksandr Valialkin
b80d338287 app/vmauth: retry requests at other backends on 5xx response status codes
This should allow implementing high availability scheme described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4792#issuecomment-1674338561

See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4893
2023-09-08 00:46:37 +02:00
Aliaksandr Valialkin
dd10f94951 app/vmselect: return 503 status code when partial responses are denied and some of vmstorage nodes are temporarily unavailable
This should help detecting this case and automatic retrying the query at healthy cluster replica
in another availability zone.

This commit is needed as a preparation for automatic query retry at another backend at vmauth on 5xx errors
as described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4792#issuecomment-1674338561
2023-09-07 16:11:39 +02:00
Aliaksandr Valialkin
9de440c803 lib/logger: increase the maximum log arg size from 200 to 500
The 200 chars limit has been appeared too small for typical log messages emitted by VictoriaMetrics components

This is a follow-up for 87fea7d8ac
2023-09-07 16:11:08 +02:00
Aliaksandr Valialkin
87fea7d8ac lib/logger: limit the maximum arg length, which can be emitted to log lines
This should prevent from emitting too long lines when too long args are passed to logger.* functions.
For example, too long MetricsQL queries or too long data samples.
2023-09-07 15:22:46 +02:00
Github Actions
78dfbe1622 Automatic update operator docs from VictoriaMetrics/operator@83c07ed (#4970) 2023-09-07 15:09:28 +02:00
dependabot[bot]
9ddb2d8010 build(deps): bump actions/checkout from 3 to 4 (#4950)
Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v3...v4)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-09-07 13:07:56 +02:00
Aliaksandr Valialkin
3151adda2a vendor: return back the latest version of golang.org/x/exp/slices, which works correctly with github.com/prometheus/prometheus/model/labels 2023-09-07 13:01:39 +02:00
Aliaksandr Valialkin
cf6fc2a6b7 vendor: run make vendor-update 2023-09-07 13:01:39 +02:00
Aliaksandr Valialkin
3da493ff62 go.mod: increase the minimum supported Go version from Go1.19 to Go1.20 2023-09-07 13:01:38 +02:00
Github Actions
72d3063bef Automatic update operator docs from VictoriaMetrics/operator@9c461d2 (#4968) 2023-09-07 12:26:02 +02:00
Aliaksandr Valialkin
9bccc5aab2 docs/CHANGELOG.md: return back accidentally deleted line at 45c0e4bb31 2023-09-07 12:03:04 +02:00
Aliaksandr Valialkin
2dc33e0ddc all: update Go builder from Go1.21.0 to Go1.21.1
See https://github.com/golang/go/issues?q=milestone%3AGo1.21.1+label%3ACherryPickApproved
2023-09-07 11:36:16 +02:00
Aliaksandr Valialkin
5f85dd7f80 docs/CHANGELOG.md: clarify the scope of recent bugfixes 2023-09-07 11:25:11 +02:00
Aliaksandr Valialkin
56a94d3b63 SECURITY.md: substitute v1.79.x LTS releases with v1.93.x LTS releases, because v1.79.x is deprecated 2023-09-07 11:18:54 +02:00
Aliaksandr Valialkin
448baf12a3 deployment/docker: properly build armv5 production builds for GOARCH=arm
Pass GOARM=5 when building GOARCH=arm production builds, since the default value for this env var
has been changed to GOARM=6 since Go1.21.0.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4965
and https://github.com/golang/go/issues/62475
2023-09-07 11:18:53 +02:00
hagen1778
40c94b26dd docs: mention that quantiles can't be used in sharded mode
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4942
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-07 10:06:09 +02:00
Haleygo
45c0e4bb31 vmalert: add eval_offset for group (#4693)
Adds `eval_offset` attribute for Groups. 
If specified, Group will be evaluated at the exact time offset on the range of [0...evaluationInterval]. 
The setting might be useful for cron-like rules which must be evaluated at specific moments of time. 

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3409

Signed-off-by: Haley Wang <pipilong.25@gmail.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
2023-09-06 16:29:59 +02:00
Aliaksandr Valialkin
138e02da37 docs/CHANGELOG.md: document the bugfix at 7db72dd7e6
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4947
2023-09-06 12:17:21 +02:00
Aliaksandr Valialkin
081476f3d6 app/vmselect: run make vmui-update 2023-09-06 10:29:11 +02:00
Yury Molodov
7b92f1d038 vmui: fix render heatmap (#4957) 2023-09-06 10:26:45 +02:00
Aliaksandr Valialkin
2e04ddbd32 docs/CaseStudies.md: update Grammarly case study with the newly published article https://www.grammarly.com/blog/engineering/monitoring-with-victoriametrics/
Follow-up c0246b2e17
2023-09-05 17:03:40 +02:00
Artem Navoiev
7be3848ee6 use correct abbriviation for ESA legal doc
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-09-05 11:01:45 +02:00
Artem Navoiev
42baf5fe3f change link to the enterprise legal doc
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-09-05 01:53:36 -07:00
Github Actions
e04c435875 Automatic update operator docs from VictoriaMetrics/operator@b63f6e9 (#4946) 2023-09-04 18:19:38 +02:00
hagen1778
f9e47a9abe docs: fix broken link in vmctl references
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-04 12:45:46 +02:00
Yury Molodov
d19072a2d9 feat: add the option to see the latest queries (#4718) (#4759)
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-09-04 11:29:11 +02:00
hagen1778
c0246b2e17 docs: add How Grammarly Improved Monitoring by Over 10x with VictoriaMetrics
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-09-04 10:18:35 +02:00
Aliaksandr Valialkin
24d61bf193 lib/flagutil: add Duration.Milliseconds() convenience function after 0c7d46d637
This function is a faster replacement for Duration.Duration().Milliseconds() call
2023-09-03 10:56:44 +02:00
Dima Lazerka
0c7d46d637 flagutil: Make .Msecs private (#4906)
* Introduce flagutil.Duration

To avoid conversion bugs

* Fix tests

* Clarify documentation re. month=31 days

* Add fasttime.UnixTime() to obtain time.Time

The goal is to refactor out the last usage of `.Msecs`.

* Use fasttime for time.Now()

* wip

- Remove fasttime.UnixTime(), since it doesn't improve code readability and maintainability
- Run `make docs-sync` for syncing changes from README.md to docs/ folder
- Make lib/flagutil.Duration.Msec private
- Rename msecsPerMonth const to msecsPer31Days in order to be consistent with retention31Days

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-09-03 10:33:37 +02:00
Github Actions
26cee36499 Automatic update operator docs from VictoriaMetrics/operator@add9045 (#4941) 2023-09-02 17:19:32 +02:00
Github Actions
ff4bc9d18e Automatic update operator docs from VictoriaMetrics/operator@c32931b (#4940) 2023-09-02 16:20:46 +02:00
Aliaksandr Valialkin
e009550312 deployment: update VictoriaMetrics tag from v1.93.2 to v1.93.3
See https://docs.victoriametrics.com/CHANGELOG.html#v1933
2023-09-02 11:16:23 +02:00
Aliaksandr Valialkin
7a716095dc docs/CHANGELOG.md: document 1.93.3 release 2023-09-02 10:21:20 +02:00
Aliaksandr Valialkin
82ccae1c02 docs/CHANGELOG.md: document v1.87.8 2023-09-02 01:54:07 +02:00
Nikolay
b9a5ea03fa lib/vmselectapi: do not send empty label names for labelNames request (#4936)
* lib/vmselectapi: do not send empty label names for labelNames request
it breaks cluster communication, since vmselect incorrectly reads request buffer, leaving unread data on it
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4932

* typo fix

* wip

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-09-01 23:26:43 +02:00
Aliaksandr Valialkin
8632683990 docs/CHANGELOG.md: document bugfix at 7c19d01e9a
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4870
2023-09-01 18:00:12 +02:00
Aliaksandr Valialkin
377627c4c9 deployment: update VictoriaMetrics tag from v1.93.1 to v1.93.2
See https://docs.victoriametrics.com/CHANGELOG.html#v1932
2023-09-01 17:35:48 +02:00
Aliaksandr Valialkin
8847fbd34f docs/CHANGELOG.md: document v1.93.2 2023-09-01 17:33:01 +02:00
Aliaksandr Valialkin
f075977045 app/vmselect: run make vmui-update after c112dd7367 2023-09-01 10:54:06 +02:00
Yury Molodov
c112dd7367 vmui: support for Prometheus data on the cardinality page (#4713)
* feat: add cardinality support for prometheus (#4320)

* docs/CHANGELOG.md: add cardinality support for prometheus

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-09-01 10:51:44 +02:00
Zakhar Bessarab
4e5a68ed08 deployment/docker: add VictoriaLogs (#4929)
* deployment/docker: add VictoriaLogs configuration

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* deployment/docker/victorialogs: remove outdated comment

It was added in order to indicate that it is required to build VictoriaLogs manually before starting it at the time there was no public release available.
Currently, there is a public tag and it is not required to build it from sources.

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* deployment/docker/victorialogs/fluentbit: include log path in stream configuration

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* deployment/docker: add reference to monitoring setup for VictoriaLogs

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-09-01 10:45:43 +02:00
Aliaksandr Valialkin
edee262ecc Makefile: update golangci-lint from v1.51.2 to v1.54.2
See https://github.com/golangci/golangci-lint/releases/tag/v1.54.2
2023-09-01 10:16:42 +02:00
Dima Lazerka
e0e856d2e7 Add flagutil.Duration to avoid conversion bugs (#4835)
* Introduce flagutil.Duration

To avoid conversion bugs

* Fix tests

* Comment why not .Seconds()
2023-09-01 09:27:51 +02:00
Aliaksandr Valialkin
4bcc086965 app/vmauth: add tests for ResponseHeaders
This is a follow-up for b18eed3427

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4825
2023-09-01 09:21:12 +02:00
Aliaksandr Valialkin
99384b96ba app/vmctl: remove superflouos whitespace after 2853fac3f5 2023-09-01 08:59:40 +02:00
Alexander Marshalov
b18eed3427 vmauth: added ability to set and remove response headers (#4825) (#4914)
* added ability to set and clear response headers (#4825)

Signed-off-by: Alexander Marshalov <_@marshalov.org>

* added ability to set and clear response headers (#4825)

Signed-off-by: Alexander Marshalov <_@marshalov.org>

* fix review comment

Signed-off-by: Alexander Marshalov <_@marshalov.org>

---------

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-08-31 14:26:51 +02:00
Dmytro Kozlov
2853fac3f5 docs: follow up after 939952068b (#4923) 2023-08-31 10:53:26 +02:00
Aliaksandr Valialkin
2a3fa14ad7 docs/VictoriaLogs/CHANGELOG.md: document 1c42154785
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4762
2023-08-30 16:37:36 +02:00
Zakhar Bessarab
1c42154785 app/vlinsert/loki: add handler for healthcheck endpoint (#4885)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-30 16:32:28 +02:00
Nikolay
dc4b974a48 app/vminsert: fixes readonly check (#4892)
* app/vminsert: fixes readonly check
previously vminsert doesn't check readOnly state for vmstorage, since check was never performed for nil buffer
In this case every 30 second storage node loss readonly state and received some data.
It caused re-routing and possible slow down for ingestion
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4870

* wip

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-08-30 16:25:20 +02:00
Nikolay
00685b627f lib/promscrape/k8s_sd: set resourceVersion to 0 by default for watch … (#4901)
* lib/promscrape/k8s_sd: set resourceVersion to 0 by default for watch requests
it must reduce load for kubernetes ETCD servers. Since requests without resourceVersion performs force cache sync at kubernetes API server with ETCD
more info at https://kubernetes.io/docs/reference/using-api/api-concepts/\#semantics-for-watch
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4855

* wip

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-08-30 16:03:41 +02:00
Github Actions
3222780707 Automatic update operator docs from VictoriaMetrics/operator@18e080e (#4921)
Co-authored-by: Alexander Marshalov <_@marshalov.org>
2023-08-30 15:09:18 +02:00
Aliaksandr Valialkin
1c0e065216 app/vmselect/promql: add support for _ delimiters in numeric values
For example, 1_234_567_890 is equivalent to 1234567890,
while 1.234_567_890 is equivalent to 1.234567890
2023-08-30 14:33:41 +02:00
Aliaksandr Valialkin
1bba4c5118 lib/auth: add NewTokenPossibleMultitenant() for parsing auth token, which can be multitenant
Disallow parsing multitenant token at auth.NewToken().

Use auth.NewTokenPossibleMultitenant() at vminsert only. All the other callers should call auth.NewToken(),
since they do not support multitenant token.

This is a follow-up for f0c06b428e

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4910
2023-08-30 14:17:55 +02:00
Zakhar Bessarab
137fa19d9c app/vmselect: fix panic when using /select/multitenant endpoint (#4912)
app/vmselect: fix panic when using `/select/multitenant` endpoint

Such requests must be rejected as not found since vmselect does not support multitenant endpoint.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4910

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-30 14:16:09 +02:00
Github Actions
3f7bde5bac Automatic update operator docs from VictoriaMetrics/operator@d0c3ec7 (#4909) 2023-08-29 22:20:31 +02:00
Nikolay
c37d7dd567 deployment/docker: disable provenance in buildx (#4911)
* deployment/docker: disable provenance in buildx
it must fix an issue with multi-platform manifest generation
at buildx >= 0.10 backward compatibility was broken and generated image cannot be used with docker systems that doesn't support oci.
disabling attestat temporary fixes it.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4907
https://docs.docker.com/build/attestations/slsa-provenance/

* Update docs/CHANGELOG.md

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-08-29 16:29:14 +02:00
Aliaksandr Valialkin
3453e0c455 docs/VictoriaLogs/Roadmap.md: add alerting to TODO list 2023-08-29 13:26:41 +02:00
Aliaksandr Valialkin
3e963debf8 vendor: run make vendor-update 2023-08-29 13:15:22 +02:00
Aliaksandr Valialkin
039b8667c4 lib/proxy: consistently use gopkg.in/yaml.v2 across all the code 2023-08-29 13:12:38 +02:00
Aliaksandr Valialkin
c7887296e5 Makefile: cleanup bin/ directory at the beginning of make publish-release command
This is needed in order to prevent from non-build artifacts to be uploaded to Github release page
2023-08-29 13:10:51 +02:00
Aliaksandr Valialkin
281a37f6f2 app/{vmselect,vlselect}: run make vmui-update vmui-logs-update after recent changes to app/vmui 2023-08-29 12:58:44 +02:00
Aliaksandr Valialkin
d087334049 app/{vminsert,vmselect}: follow-up after 2b7b3293c1
- Document the change at docs/CHANGELOG.md
- Set the default value for -vmstorageUserTimeout to 3 seconds. This is much better
  than the 0 value, which means that TCP connection to unreachable vmstorage could block
  for up to 16 minutes.
- Document -vmstorageUserTimeout at docs/Cluster-VictoriaMetrics.md
2023-08-29 12:18:53 +02:00
Aliaksandr Valialkin
5f182cc2c2 docs/VictoriaLogs/CHANGELOG.md: document the fix 8d50032dd6
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4895
2023-08-29 11:28:45 +02:00
Aliaksandr Valialkin
317a273c6d lib/logstorage: eliminate data race when clearing s.ptwHot after deleting the corresponding partition
The previous code could result in the following data race:
1. The s.ptwHot partition is marked to be deleted
2. ptw.decRef() is called on it
3. ptw.pt is set to nil
4. s.ptwHot.pt is accessed from concurrent goroutine, which leads to panic.

The change clears s.ptwHot under s.partitionsLock in order to prevent from the data race.

This is a follow-up for 8d50032dd6

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4895
2023-08-29 11:09:55 +02:00
Roman Khavronenko
e8db78eaa4 dashboards: provide copies of Grafana dashboards alternated with Vict… (#4905)
dashboards: provide copies of Grafana dashboards alternated with VictoriaMetrics datasource

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-29 11:06:55 +02:00
crossoverJie
8d50032dd6 lib/logstorage: Set ptwHot to nil when the partition pointed by ptwHot is dropped (#4902) 2023-08-29 11:01:19 +02:00
Aliaksandr Valialkin
379b92cc10 docs/CHANGELOG.md: add links to stream parsing mode in descriptions for 6e8611f301 and 6788704152 2023-08-29 10:47:32 +02:00
Aliaksandr Valialkin
154c691f47 docs/CHANGELOG.md: remove unneeded utm_source and utm_medium query args in the link to Google Lighthouse
Remove the line about consistent rounding of values in vmui, since it looks like it has been broken and needs to be returned back.
See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4872#issuecomment-1696981947 for details.

This is a follow-up for e865989fa9
2023-08-29 10:27:28 +02:00
hagen1778
5d848363f0 lib/promscrape: follow-up after eabcfc9bcd
`-promscrape.cluster.membersCount` by default should be `1`, like every
single vmagent is a cluster of one member on its own.
The change additionally validates that user can't set `-promscrape.cluster.membersCount`
to value lower than `1`.

eabcfc9bcd
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-29 10:04:57 +02:00
Haleygo
eabcfc9bcd fix clusterMembersCount check (#4900) 2023-08-29 15:58:24 +08:00
Aliaksandr Valialkin
9d2260ed3c app/vmagent/remotewrite: do not retry request immediately on io.ErrUnexpectedEOF, since this error isn't returned on stale connection
Also, mention the https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4139 in comments to the code
in order to simplify further maintenance of this code.

This is a follow-up for 992a1c0a3a
2023-08-29 09:48:28 +02:00
Aliaksandr Valialkin
0e31415b34 docs/CHANGELOG.md: remove another blank line in order to reduce the difference with lts-1.93 branch 2023-08-29 09:48:27 +02:00
hagen1778
a20234d2d0 docs: fix typo for deduplication description in k8s guide
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-28 14:45:00 +02:00
Aliaksandr Valialkin
58a6bb7bd1 docs/CHANGELOG.md: remove superflouos blank lines 2023-08-28 10:00:27 +02:00
Aliaksandr Valialkin
5b96a96535 docs/CHANGELOG.md: move the bugfix line into correct place after ddf87b32ed 2023-08-28 09:59:41 +02:00
Aliaksandr Valialkin
1b6d37b8e2 docs/CHANGELOG.md: explicitly mention that the bug in 1.93.0 may lead to data loss
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4873
2023-08-28 09:52:29 +02:00
Aliaksandr Valialkin
d8a4f01fe9 docs/CHANGELOG.md: return back the line accidentally deleted at 6abd575cbe
The line has been originally added in 481a2c70fd
2023-08-28 09:46:42 +02:00
Aliaksandr Valialkin
8aa29e9a84 docs/stream-aggregation.md: use 5m instead of 300 in the example query for rate() calculation from "increase" results
This makes the query easier to read and understand

Follow-up for 0df506de54
2023-08-28 09:36:19 +02:00
Aliaksandr Valialkin
8e5958448b docs/vmauth.md: consistently prepend command-line flags with a single - 2023-08-28 09:28:04 +02:00
Aliaksandr Valialkin
e4fff13697 docs/CHANGELOG.md: clarify the description of b7d07e5acf
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4817
2023-08-28 09:12:29 +02:00
crossoverJie
cde5029bce lib/logstorage: add nil check for ptwHot.pt (#4896) 2023-08-27 01:24:26 +02:00
Github Actions
501d8e1978 Automatic update operator docs from VictoriaMetrics/operator@d40882c (#4894)
Co-authored-by: Alexander Marshalov <_@marshalov.org>
2023-08-25 19:58:31 +02:00
Daria Karavaieva
278d3c20a4 vmanomaly guide docker compose run fix according licensing (#4876)
* docker-compose run fix according to licensing

* public v1.5.0 of vmanomaly

* new file list image

* 1.93.1 version of VM
2023-08-25 18:28:39 +02:00
Github Actions
ef6468584c Automatic update operator docs from VictoriaMetrics/operator@e00fe21 (#4891) 2023-08-25 15:49:00 +02:00
Zakhar Bessarab
6e8611f301 lib/promscrape/client: sync timeout for HostClient and http.Client (#4889)
Initially, stream parse mode was reading data from response and parsing it on flight. This was causing longer delay to read the whole response and required increasing timeout value to allow data processing while reading. So that 908e35affd increased timeout value to fix this.

But after 74c00a8762 response in stream parse mode is saved into memory and then parsed eliminating necessity of having timeout value higher that for usual scrape.

Updates: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4847
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-25 15:47:11 +02:00
hagen1778
e865989fa9 docs: follow-up after 72167a697e
72167a697e
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-25 15:43:23 +02:00
Yury Molodov
72167a697e vmui: ui improvements (#4872)
* vmui: chart refactoring to enhance code structure

* vmui: improve ui
2023-08-25 15:39:21 +02:00
hagen1778
65415b56af docs: make docs-sync
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-25 15:23:30 +02:00
hagen1778
f4577005be docs: update remote-read api docs
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-25 15:21:19 +02:00
Dmytro Kozlov
0b0e0bb50e docs: add migration guide from Promscale (#4865)
Signed-off-by: dmitryk-dk d.kozlov@victoriametrics.com
2023-08-25 13:40:31 +02:00
hagen1778
ffbebfdfe6 docs: typo fix for vmauth
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-25 10:30:11 +02:00
hagen1778
4d316a23ae docs: mention that vmauth doesn't follow redirects
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4868

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-25 10:23:25 +02:00
Aliaksandr Valialkin
f1c2508243 lib/promscrape: add -promscrape.cluster.memberLabel command-line flag
This flag allows specifying an additional label to add to all the scraped metrics.
The flag must contain label name to add. The label value will be equal to -promscrape.cluster.memberNum.

This functionality can help when there is a need to differentiate metrics scraped
by distinct vmagent instances in the cluster according to https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4247

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4247#issuecomment-1692279393
2023-08-24 22:03:54 +02:00
hagen1778
4ebe8bb1d5 app/vmagent: follow-up after 6788704152
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4884
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-24 11:36:42 +02:00
Zakhar Bessarab
6788704152 lib/promscrape/client: make User-Agent consistent between fasthttp and native client (#4886)
User agent was not set for native client which resulted in using one provided by Golang.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4884

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-24 11:31:13 +02:00
hagen1778
757ae4275b app/vmagent: fix comment typo after 992a1c0a3a
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-24 09:32:11 +02:00
Roman Khavronenko
992a1c0a3a vmagent: retry failed write request on the closed connection (#4857)
* vmagent: retry failed write request on the closed connection

 Retry failed write request on the closed connection immediately,
 without waiting for backoff. This should improve data delivery speed
 and reduce amount of error logs emitted by vmagent when using idle connections.

 https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4139

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmagent: retry failed write request on the closed connection

Re-instantinate request before retry as body could have been already spoiled.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-08-24 00:08:04 +02:00
Github Actions
6dc66ce35b Automatic update operator docs from VictoriaMetrics/operator@23a4dcc (#4879)
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-08-24 00:05:24 +02:00
Roman Khavronenko
ddf87b32ed vmalert: correctly re-instantinate HTTP req on retries (#4864)
* vmalert: correctly re-instantinate HTTP req on retries

Previosly, request retry to datasource re-used existing HTTP request.
But if request object was already partially processed (body was read),
then retry will be unsuccessful.

The change re-instantinates HTTP request object before retry.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: review fix

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-24 00:04:05 +02:00
Github Actions
2122eb18fe Automatic update operator docs from VictoriaMetrics/operator@8a25794 (#4883) 2023-08-23 19:29:46 +02:00
hagen1778
59dee2e714 docs: mention 1.93.0 contains a bug
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-23 15:55:31 +02:00
hagen1778
df6d27650e marketplace: bump DO to 1.93.1
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-23 15:54:15 +02:00
Nikolay
6abd575cbe docs: release docs 1.93.1 (#4875)
* docs: mention v1.93.1 release

* deployment/docker: bumps image for v1.93.1 release
2023-08-23 15:52:58 +02:00
hagen1778
946e370b26 docs: mention breaking change to indexdb intorduced in 1.92.0
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-23 14:27:55 +02:00
Nikolay
c5aac34b68 lib/storage: properly caclucate nextRotationTimestamp (#4874)
cause of typo unix millis was used instead of unix for current timestamp
calculation
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4873
2023-08-23 13:22:53 +02:00
hagen1778
0df506de54 docs: mention increase as alternative to rate
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-22 09:50:21 +02:00
Zakhar Bessarab
7961479900 docs/vmanomaly: update licensing flags description
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-21 08:11:31 -07:00
Yury Molodov
ca44b8da1f vmui: change warning display for text fields (#4848) (#4863)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4848
2023-08-21 15:42:55 +02:00
Yury Molodov
8287749c05 vmui: chart refactoring to enhance code structure (#4830) 2023-08-21 15:35:47 +02:00
hagen1778
ea2fbcf0e6 vmselect: follow-up after 7349f18c55
7349f18c55
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-21 15:34:21 +02:00
Lapo Luchini
297ef605ef Fix typo in vmauth docs. 2023-08-21 02:55:06 -07:00
Zakhar Bessarab
9169f65521 docs/vmanomaly: clarify offline verification flag description
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-21 02:53:29 -07:00
Zakhar Bessarab
8ea4ae7dbb docs: add vmanomaly docs about monitoring
- add monitoring section
- add reference to monitoring section from licensing monitoring

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-21 02:53:29 -07:00
Zakhar Bessarab
ee747f4f42 docs: add vmanomaly docs about licensing
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-21 02:53:29 -07:00
Tamara Vashchuk
7349f18c55 vmui: Add button to prettify query (#4694)
* Add button to prettify query

Just capitalizes query text for now

* Add /prettify-query API handler

* Replace UI pretiffier using prettifier API

* Add showing server errors

Had to pass setQueryErrors from useFetchQuery.ts

* Use serverUrl from global AppState

* Change icon to AutoAwsome icon + added style change color when button is active

* Add sync/await to prettifyQuery function

* Doc public function for lint

* Minor async fix

* Removed extra blank lines

* Extract usePrettifyQuery hook

* Made more generic style for :active button

* Refactor usePrettifyQuery

However, prettify errors don't clean up query errors, but should

* Add prettyQuery functionality to CHANGELOG.md

* Reuse queryErrors

* Unhide errors on start

---------

Co-authored-by: Tamara <toma.vashchuk@gmail.com>
2023-08-18 20:12:48 +03:00
hagen1778
e9d246f367 docs: exclude assets/README.md
exclude assets/README.md  from publishing on the docs website
as its purpose is different to other docs.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-18 13:24:51 +02:00
Github Actions
7d8b6fbe20 Automatic update operator docs from VictoriaMetrics/operator@76d5956 (#4854) 2023-08-18 09:34:37 +02:00
Dmytro Kozlov
b7d07e5acf lib/protoparser: handle unexpected EOF error when parsing lines in prometheus exposition format (#4851)
Previously only io.EOF was handled, and io.ErrUnexpectedEOF was ignored, but it may happen if the client interrupts the connection.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4817
2023-08-18 08:55:42 +02:00
Aliaksandr Valialkin
1d7eaf0f5b docs/stream-aggregation.md: typo fix after 54f522ac25 2023-08-17 15:24:44 +02:00
Aliaksandr Valialkin
54f522ac25 docs/stream-aggregation.md: clarify the usage of -remoteWrite.label after the fix at a27c2f3773
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4247
2023-08-17 15:18:37 +02:00
Aliaksandr Valialkin
9137703729 app/vmagent/remotewrite: follow-up after a27c2f3773
- Fix Prometheus-compatible naming after applying the relabeling if -usePromCompatibleNaming command-line flag is set.
  This should prevent from possible Prometheus-incompatible metric names and label names generated by the relabeling.
- Do not return anything from relabelCtx.appendExtraLabels() function, since it cannot change the number of time series
  passed to it. Append labels for the passed time series in-place.
- Remove promrelabel.FinalizeLabels() call after adding extra labels to time series, since this call has been already
  made at relabelCtx.applyRelabeling(). It is user's responsibility if he passes labels with double underscore prefixes
  to -remoteWrite.label.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4247
2023-08-17 14:43:50 +02:00
Aliaksandr Valialkin
f5685f1c54 Revert "try to remove assets folder"
This reverts commit 251d3d54a4.

Reason for revert: this commit removes important information, which was placed in the docs/assets folder
in order to reduce the probability of putting images related to particular docs here,
with the reasoning why this is a bad practice.

This information should remain in the docs/assets folder. Probably, the file should be renamed
from README.md to README or to some other visible name, which doesn't lead to generation of unneded
documentation for the assets folder.
2023-08-17 13:31:27 +02:00
Aliaksandr Valialkin
cd9f86afe1 lib/envflag: do not allow unsupported form for boolean command-line flags in the form -boolFlag value
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4845
2023-08-17 13:26:53 +02:00
Alexander Marshalov
1e1a30ed7f vmagent: fixed premature release of the context (after #4247 / #4824) (#4849)
Follow-up after a27c2f3773

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4247

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-08-17 12:15:03 +02:00
Dmytro Kozlov
39623ae428 app/vmctl: fix migration process if tenant have no data (#4799)
app/vmctl: don't interrupt migration process if tenant has no data

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Alexander Marshalov <_@marshalov.org>
2023-08-16 14:54:51 +02:00
Roman Khavronenko
6da32a27ac vmbackup: correctly check if specified -dst belongs to specified -storageDataPath (#4841)
See this issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4837

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-16 14:45:35 +02:00
Aliaksandr Valialkin
f3ef5d16eb Makefile: do not release VictoriaLogs together with VictoriaMetrics
VictoriaLogs has its own release schedule, so it must be released separately via:

  make publish-victoria-logs release-victoria-logs

TODO: sync VictoriaLogs and VictoriaMetrics releases after VictoriaLogs goes out of preview stage.
This will simplify release process and upgrades at user side.
2023-08-15 19:21:10 +02:00
Artem Navoiev
251d3d54a4 try to remove assets folder
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-08-15 15:54:17 +02:00
Alexander Marshalov
a27c2f3773 fixed applying remoteWrite.label for pushed metrics (#4247) (#4824)
vmagent: properly add extra labels before sending data to remote storage

labels from `remoteWrite.label` are now added to sent metrics just before they
 are pushed to `remoteWrite.url` after all relabelings, including stream aggregation relabelings (#4247)

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4247

Signed-off-by: Alexander Marshalov <_@marshalov.org>
Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
2023-08-15 13:47:48 +02:00
Github Actions
c884311cf5 Automatic update operator docs from VictoriaMetrics/operator@9f41076 (#4840) 2023-08-15 10:55:20 +02:00
Arseny
93109842c6 docs/Articles.md: add a link to https://rtfm.co.ua/en/victoriametrics-deploying-a-kubernetes-monitoring-stack/ (#4836) 2023-08-15 09:20:16 +02:00
hagen1778
481a2c70fd dashboard: fix display of ingested rows rate
Fix display of ingested rows rate for `Samples ingested/s`
and `Samples rate` panels for vmagent's dasbhoard.
Previously, not all ingested protocols were accounted in these panels.
An extra panel `Rows rate` was added to `Ingestion` section to display the split
for rows ingested rate by protocol.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-15 08:45:10 +02:00
Artem Navoiev
507879380b fix docs ordering
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-08-14 21:30:02 +02:00
Aliaksandr Valialkin
fdae53a75b lib/promrelabel: properly replace : char with _ in metric names when -usePromCompatibleNaming command-line flag is set
This addresses https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3113#issuecomment-1275077071 comment from @johnseekins
2023-08-14 16:14:42 +02:00
Aliaksandr Valialkin
63e3571e8c lib/promrelabel: stop emitting DEBUG log lines when parsing if expressions
These lines were accidentally left in the commit 62651570bb

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4635
2023-08-14 15:24:31 +02:00
Aliaksandr Valialkin
214be01dfa app/vmselect/netstorage: remove duplicate see word from the error message
This is a follow-up for ac6c40e896

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4827
2023-08-14 02:05:44 -07:00
Aliaksandr Valialkin
ac6c40e896 all: refer to https://docs.victoriametrics.com/#resource-usage-limits in the error message about -search.max* limit
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4827
2023-08-14 01:57:34 -07:00
Aliaksandr Valialkin
a0f695f5de app/vmbackup: add ability to make server-side copying of existing backups 2023-08-13 17:24:24 -07:00
Aliaksandr Valialkin
3d25a82372 docs/Articles.md: add a link to https://www.forbes.com/sites/adrianbridgwater/2023/07/05/the-agility-in-cloud-observability/ 2023-08-12 16:25:15 -07:00
Aliaksandr Valialkin
4fa0725dc9 docs/Articles.md: add a link to https://rtfm.co.ua/en/victoriametrics-an-overview-and-its-use-instead-of-prometheus/ 2023-08-12 16:23:57 -07:00
Aliaksandr Valialkin
88b620b8c8 docs/CHANGELOG.md: document that v1.93.x is a new line of LTS releases 2023-08-12 15:31:57 -07:00
Aliaksandr Valialkin
11329c3d16 docs/CHANGELOG.md: document changes in the v1.87.7 LTS release 2023-08-12 14:49:12 -07:00
Aliaksandr Valialkin
fae59146ad docs/CHANGELOG.md: document LTS release v1.79.14
See https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.79.14
2023-08-12 12:28:10 -07:00
Aliaksandr Valialkin
1a57815769 deployment: update Docker image tags for VictoriaMetrics components from v1.92.1 to v1.93.0
See https://docs.victoriametrics.com/CHANGELOG.html#v1930
2023-08-12 12:25:07 -07:00
Aliaksandr Valialkin
f96804658b app/vmctl: document that -vm-native-step-interval command-line option now supports week value
This is a follow-up for d322ee4b35

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4738
2023-08-12 07:32:58 -07:00
Aliaksandr Valialkin
59f7d810c9 docs/CHANGELOG.md: cut v1.93.0 2023-08-12 06:01:10 -07:00
Aliaksandr Valialkin
e1235267a0 deployment/docker/Makefile: upgrade base Docker image from alpine:3.18.2 to alpine:3.18.3
See https://alpinelinux.org/posts/Alpine-3.15.10-3.16.7-3.17.5-3.18.3-released.html
2023-08-12 05:59:48 -07:00
Aliaksandr Valialkin
05f109ad58 docs/CHANGELOG.md: split changelog into per-year pages in order to keep the size of CHANGELOG pages under control
Make sure that links to particular releases - https://docs.victoriametrics.com/CHANGELOG.html#vXXYY - continue working.
2023-08-12 05:48:43 -07:00
Nikolay
d144e39592 lib/protoparser/openetelemetry: fixes panic (#4821)
Opentelemetry format allows histograms with non-counter buckets. In this case it makes no sense to add buckets into database
and save only counter with _count suffix.
It could be used as gauge.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4814

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-08-12 05:09:18 -07:00
Nikolay
8faa17493b opentelemetry: return human readable error for json encoding. (#4822)
Opentelemetry parser supports only protobuf atm.

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-08-12 05:05:16 -07:00
Nikolay
f111ddb862 lib/promscrape: adds validation for proxy_url scheme (#4823)
* lib/promscrape: adds validation for proxy_url scheme
adds tests
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4811

* Update lib/proxy/proxy.go

* Update lib/proxy/proxy.go

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-08-12 05:03:08 -07:00
Aliaksandr Valialkin
072d891ed9 app/vmselect: prevent from panic when lookbehind window inside rollup function is parsed into negative value
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4795
2023-08-12 04:47:53 -07:00
Aliaksandr Valialkin
d7067c46d0 lib/flagutil: add defaultValue arg to NewArray{Int,Bytes,Duration} functions
The defaultValue is printed in the flag description when passing -help to the app.

This is a follow-up for aef31f201a

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4776
2023-08-12 04:19:05 -07:00
Aliaksandr Valialkin
e8bcb17c8a vendor: run make vendor-update 2023-08-11 07:16:25 -07:00
Aliaksandr Valialkin
acbe327fdf app/{vmselect,vlselect}: run make vmui-update vmui-logs-update after 86f1459ca6 2023-08-11 07:00:39 -07:00
Zakhar Bessarab
b007f78a2e app/vlinsert/elasticsearch: add a command-line flag to provide ES version (#4778)
* app/vlinsert/elasticsearch: add a command-line flag to provide ES version

Adds a flag which will allow to change version which will be reported by ES endpoint for compatibility checks performed by external logs shippers(such as filebeat).
See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4777

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* Document the -elasticsearch.version command-line flag

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4777

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-08-11 06:52:57 -07:00
Zakhar Bessarab
1bd7637fe1 lib/promrelabel: fix relabeling if clause (#4816)
* lib/promrelabel: fix relabeling if clause being applied to labels outside of current context

Relabeling is applied to each metric row separately, but in order to lower amount of memory allocations it is reusing labels.

Functions which are working on current metric row labels are supposed to use only current metric labels by using provided offset, but if clause matcher was using the whole labels set instead of local metrics.

This leaded to invalid relabeling results such as one described here: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4806

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* docs/CHANGELOG.md: document the bugfix

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1998
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4806

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-08-11 06:37:48 -07:00
Aliaksandr Valialkin
e0017b4d47 all: update Go builder from Go1.20.7 to Go1.21.0
See https://tip.golang.org/doc/go1.21
and https://go.dev/blog/go1.21
2023-08-11 06:25:54 -07:00
Aliaksandr Valialkin
a19a65b3a5 app/vmagent/remotewrite: go fmt 2023-08-11 06:23:00 -07:00
Aliaksandr Valialkin
4c4bcdf0b1 docs/CHANGELOG.md: add a link to stream aggregation for the description of the bugfix at a4a1884237
This makes the description more clear.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4804
2023-08-11 05:38:30 -07:00
Aliaksandr Valialkin
271743f892 docs/CHANGELOG.md: add missing context to the description of the fix at be5c4818f5 2023-08-11 05:26:16 -07:00
Aliaksandr Valialkin
be5c4818f5 lib/httpserver: properly quote the returned address from GetQuotedRemoteAddr() for requests with X-Forwarded-For header
Make sure that the quoted address can be used as JSON string.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4676#issuecomment-1663203424

This is a follow up for 252643d100 and ac0b7e0421

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4676
2023-08-11 05:19:50 -07:00
Aliaksandr Valialkin
ac0b7e0421 Revert "vmui: change the response for active queries (#4782)"
This reverts commit 252643d100.

Reason for revert: the commit incorrectly fixes the the issue.
The `remoteAddr` must be properly quoted inside lib/httpserver.GetQuotedRemoteAddr().
It isn't quoted properly if the request contains X-Forwarded-For header.

The proper fix will be included in the follow-up commit.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4676
2023-08-11 05:06:40 -07:00
Aliaksandr Valialkin
2328e4cabc app/vmagent/remotewrite: keep in sync the default value for -remoteWrite.sendTimeout option in the description with the actually used timeout
This is a follow-up for aef31f201a

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4776
2023-08-11 04:52:00 -07:00
Aliaksandr Valialkin
38e72ea344 docs/Release-Guide.md: document that changes must be synced between branches immediately after the commit in any branch 2023-08-11 04:29:04 -07:00
Aliaksandr Valialkin
7ec4ccc005 docs/CaseStudies.md: update Naver case study after 0c3d61b211
- Fix a link to case study at the top of docs/CaseStudies.md
- Remove non-essential text
- Add Naver video and slides about VictoriaMetrics to docs/Articles.md

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4755
2023-08-11 04:12:28 -07:00
Aliaksandr Valialkin
b50ed5ddd1 app/vmctl: follow-up after 5aed369132
- Fix default value for --remote-read-disable-path-append
- Clarify description for the change at docs/CHANGELOG.md

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4655

TODO: address the comment at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4744
2023-08-11 03:46:10 -07:00
Nikolay
6d0163cca9 app/vminsert: adds note for dropSamplesOnOverload flag (#4797)
Adds note for dropSamplesOnOverload flag that are samples dropped before replication
2023-08-11 03:14:45 -07:00
Aliaksandr Valialkin
efb81185a7 docs/CHANGELOG.md: remove superflouos information from the line, which describes the upgrade from Go1.20.6 to Go1.20.7 2023-08-11 03:10:10 -07:00
Aliaksandr Valialkin
e49e4f372b docs/CHANGELOG.md: clarify the change at e3ef3df938
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697
2023-08-11 03:06:41 -07:00
Zakhar Bessarab
f2df8ad480 vmbackupmanager: fixes for windows compatibility (#641)
* app/vmbackupmanager/storage: fix path join for windows

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/backup: fixes for windows support

- close dir before running os.RemoveAll. Windows FS does not allow to delete directory before all handles will be closed.

- add path "normalization" for local FS to use the same format of paths for both *unix and Windows

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-08-11 02:56:11 -07:00
Github Actions
7602b95bcb Automatic update operator docs from VictoriaMetrics/operator@9ea59e2 (#4813)
Co-authored-by: Alexander Marshalov <_@marshalov.org>
2023-08-10 16:36:10 +02:00
Github Actions
6411191158 Automatic update operator docs from VictoriaMetrics/operator@2a03bde (#4812) 2023-08-10 16:33:30 +02:00
hagen1778
c36259fca5 docs: mention honor_timestamps change in changelog
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-10 14:46:39 +02:00
Zakhar Bessarab
a4a1884237 {vmagent/remotewrite,vminsert/common}: fix dropInput and keepInput flags inconsistency (#4809)
{vmagent/remotewrite,vminsert/common}: fix dropInput and keepInput flags inconsistency

Sync behavior for dropInput and keepInput flags between single-node and vmagent.
Fix vmagent not respecting dropInput flag and reverse logic for keepInput.
2023-08-10 14:27:21 +02:00
Yury Molodov
252643d100 vmui: change the response for active queries (#4782)
* fix: change the response to a valid json (#4676)

* vmui/docs: fix response of active queries

https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4676
2023-08-10 12:27:28 +02:00
Yury Molodov
86f1459ca6 vmui: hide "Logs Explorer" for the base build (#4761)
LogsExplorer should be a part of VictoriaLogs binaries, as well as VMUI is now part of VictoriaMetrics binaries.
2023-08-10 11:42:13 +02:00
Yury Molodov
cc7bfaca6c vmui: allow displaying the full error message on click (#4760)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4719
2023-08-10 11:34:25 +02:00
Github Actions
0a93abfeed Automatic update operator docs from VictoriaMetrics/operator@ca27728 (#4802) 2023-08-10 10:38:19 +02:00
Haleygo
bd8ecfb551 docs: add changelog for 4c815ed59b (#4805) 2023-08-10 08:26:55 +02:00
Abirdcfly
835c03fb47 vmalert: fix vmalert_remotewrite_send_duration_seconds_total metric value (#4801)
The deferred call's arguments are evaluated immediately, but the function call is not executed until the surrounding function returns.

Signed-off-by: Abirdcfly <fp544037857@gmail.com>
2023-08-10 10:51:44 +08:00
hagen1778
bc5065fd14 vmalert: mention vmalert_iteration_duration_seconds metric in README
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-08-09 22:03:22 +02:00
Haleygo
262c888d4b vmalert: fix redundant clean up move (#4803)
Follow-up after 55ae2c2d57
2023-08-09 21:16:00 +02:00
2216 changed files with 159007 additions and 53403 deletions

View File

@@ -6,7 +6,7 @@ body:
attributes:
value: |
Before filling a bug report it would be great to [upgrade](https://docs.victoriametrics.com/#how-to-upgrade)
to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
and verify whether the bug is reproducible there.
It's also recommended to read the [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting.html) first.
- type: textarea

View File

@@ -17,7 +17,7 @@ jobs:
- name: Setup Go
uses: actions/setup-go@main
with:
go-version: 1.20.7
go-version: 1.21.4
id: go
- name: Code checkout
uses: actions/checkout@master

View File

@@ -33,7 +33,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Initialize CodeQL
uses: github/codeql-action/init@v2

View File

@@ -52,12 +52,12 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: 1.20.7
go-version: 1.21.4
check-latest: true
cache: true
if: ${{ matrix.language == 'go' }}

View File

@@ -27,12 +27,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Code checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Setup Go
uses: actions/setup-go@v4
with:
go-version: 1.20.7
go-version: 1.21.4
check-latest: true
cache: true
@@ -51,12 +51,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Code checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Setup Go
uses: actions/setup-go@v4
with:
go-version: 1.20.7
go-version: 1.21.4
check-latest: true
cache: true
@@ -75,13 +75,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Code checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Setup Go
id: go
uses: actions/setup-go@v4
with:
go-version: 1.20.7
go-version: 1.21.4
check-latest: true
cache: true

View File

@@ -6,6 +6,9 @@ on:
paths:
- 'docs/**'
workflow_dispatch: {}
env:
PAGEFIND_VERSION: "1.0.3"
HUGO_VERSION: "latest"
permissions:
contents: read # This is required for actions/checkout and to commit back image update
deployments: write
@@ -15,16 +18,25 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Code checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
path: main
- name: Checkout private code
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
repository: VictoriaMetrics/vmdocs
token: ${{ secrets.VM_BOT_GH_TOKEN }}
path: docs
- uses: peaceiris/actions-hugo@v2
with:
hugo-version: ${{env.HUGO_VERSION}}
extended: true
- name: Install PageFind #install the static search engine for index build
uses: supplypike/setup-bin@v3
with:
uri: "https://github.com/CloudCannon/pagefind/releases/download/v${{env.PAGEFIND_VERSION}}/pagefind-v${{env.PAGEFIND_VERSION}}-x86_64-unknown-linux-musl.tar.gz"
name: "pagefind"
version: ${{env.PAGEFIND_VERSION}}
- name: Import GPG key
uses: crazy-max/ghaction-import-gpg@v5
with:
@@ -45,6 +57,7 @@ jobs:
rm -rf content
cp -r ../main/docs content
make clean-after-copy
make build-search-index
git config --global user.name "${{ steps.import-gpg.outputs.email }}"
git config --global user.email "${{ steps.import-gpg.outputs.email }}"
git add .

View File

@@ -1,80 +0,0 @@
name: sandbox-release
on:
release:
types: [published]
permissions:
contents: write
jobs:
deploy-sandbox:
runs-on: ubuntu-latest
steps:
- name: check inputs
if: github.event.release.tag_name == ''
run: exit 1
- name: Check out code
uses: actions/checkout@v3
with:
repository: VictoriaMetrics/ops
token: ${{ secrets.VM_BOT_GH_TOKEN }}
- name: Import GPG key
id: import-gpg
uses: crazy-max/ghaction-import-gpg@v5
with:
gpg_private_key: ${{ secrets.VM_BOT_GPG_PRIVATE_KEY }}
passphrase: ${{ secrets.VM_BOT_PASSPHRASE }}
git_user_signingkey: true
git_commit_gpgsign: true
- name: update image tag
uses: fjogeleit/yaml-update-action@main
with:
valueFile: 'gcp-test/sandbox/manifests/benchmark-vm/vmcluster.yaml'
commitChange: false
createPR: false
changes: |
{
"gcp-test/sandbox/manifests/benchmark-vm/vmcluster.yaml": {
"spec.vminsert.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
"spec.vmselect.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
"spec.vmstorage.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster"
},
"gcp-test/sandbox/manifests/benchmark-vm/vmsingle.yaml": {
"spec.image.tag": "${{ github.event.release.tag_name }}-enterprise"
},
"gcp-test/sandbox/manifests/monitoring/monitoring-vmagent.yaml": {
"spec.image.tag": "${{ github.event.release.tag_name }}"
},
"gcp-test/sandbox/manifests/monitoring/monitoring-vmcluster.yaml": {
"spec.vminsert.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
"spec.vmselect.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
"spec.vmstorage.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster"
},
"gcp-test/sandbox/manifests/monitoring/vmalert.yaml": {
"spec.image.tag": "${{ github.event.release.tag_name }}-enterprise"
}
}
- name: commit changes
run: |
git config --global user.name "${{ steps.import-gpg.outputs.email }}"
git config --global user.email "${{ steps.import-gpg.outputs.email }}"
git add .
git commit -S -m "Deploy image tag ${RELEASE_TAG} to sandbox"
env:
RELEASE_TAG: ${{ github.event.release.tag_name }}
- name: Create Pull Request
uses: peter-evans/create-pull-request@v5
with:
author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>
branch: release-automation
token: ${{ secrets.VM_BOT_GH_TOKEN }}
delete-branch: true
title: "release ${{ github.event.release.tag_name }}"
body: |
Release [${{ github.event.release.tag_name }}](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/${{ github.event.release.tag_name }}) to sandbox
> Auto-generated by `Github Actions Bot`

View File

@@ -16,6 +16,7 @@ GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(DATEINFO_TA
include app/*/Makefile
include deployment/*/Makefile
include dashboards/Makefile
include snap/local/Makefile
include package/release/Makefile
@@ -24,6 +25,7 @@ all: \
victoria-logs-prod \
vmagent-prod \
vmalert-prod \
vmalert-tool-prod \
vmauth-prod \
vmbackup-prod \
vmrestore-prod \
@@ -32,11 +34,11 @@ all: \
clean:
rm -rf bin/*
publish: package-base \
publish: \
publish-victoria-metrics \
publish-victoria-logs \
publish-vmagent \
publish-vmalert \
publish-vmalert-tool \
publish-vmauth \
publish-vmbackup \
publish-vmrestore \
@@ -47,6 +49,7 @@ package: \
package-victoria-logs \
package-vmagent \
package-vmalert \
package-vmalert-tool \
package-vmauth \
package-vmbackup \
package-vmrestore \
@@ -55,6 +58,7 @@ package: \
vmutils: \
vmagent \
vmalert \
vmalert-tool \
vmauth \
vmbackup \
vmrestore \
@@ -63,6 +67,7 @@ vmutils: \
vmutils-pure: \
vmagent-pure \
vmalert-pure \
vmalert-tool-pure \
vmauth-pure \
vmbackup-pure \
vmrestore-pure \
@@ -71,6 +76,7 @@ vmutils-pure: \
vmutils-linux-amd64: \
vmagent-linux-amd64 \
vmalert-linux-amd64 \
vmalert-tool-linux-amd64 \
vmauth-linux-amd64 \
vmbackup-linux-amd64 \
vmrestore-linux-amd64 \
@@ -79,6 +85,7 @@ vmutils-linux-amd64: \
vmutils-linux-arm64: \
vmagent-linux-arm64 \
vmalert-linux-arm64 \
vmalert-tool-linux-arm64 \
vmauth-linux-arm64 \
vmbackup-linux-arm64 \
vmrestore-linux-arm64 \
@@ -87,6 +94,7 @@ vmutils-linux-arm64: \
vmutils-linux-arm: \
vmagent-linux-arm \
vmalert-linux-arm \
vmalert-tool-linux-arm \
vmauth-linux-arm \
vmbackup-linux-arm \
vmrestore-linux-arm \
@@ -95,6 +103,7 @@ vmutils-linux-arm: \
vmutils-linux-386: \
vmagent-linux-386 \
vmalert-linux-386 \
vmalert-tool-linux-386 \
vmauth-linux-386 \
vmbackup-linux-386 \
vmrestore-linux-386 \
@@ -103,6 +112,7 @@ vmutils-linux-386: \
vmutils-linux-ppc64le: \
vmagent-linux-ppc64le \
vmalert-linux-ppc64le \
vmalert-tool-linux-ppc64le \
vmauth-linux-ppc64le \
vmbackup-linux-ppc64le \
vmrestore-linux-ppc64le \
@@ -111,6 +121,7 @@ vmutils-linux-ppc64le: \
vmutils-darwin-amd64: \
vmagent-darwin-amd64 \
vmalert-darwin-amd64 \
vmalert-tool-darwin-amd64 \
vmauth-darwin-amd64 \
vmbackup-darwin-amd64 \
vmrestore-darwin-amd64 \
@@ -119,6 +130,7 @@ vmutils-darwin-amd64: \
vmutils-darwin-arm64: \
vmagent-darwin-arm64 \
vmalert-darwin-arm64 \
vmalert-tool-darwin-arm64 \
vmauth-darwin-arm64 \
vmbackup-darwin-arm64 \
vmrestore-darwin-arm64 \
@@ -127,6 +139,7 @@ vmutils-darwin-arm64: \
vmutils-freebsd-amd64: \
vmagent-freebsd-amd64 \
vmalert-freebsd-amd64 \
vmalert-tool-freebsd-amd64 \
vmauth-freebsd-amd64 \
vmbackup-freebsd-amd64 \
vmrestore-freebsd-amd64 \
@@ -135,6 +148,7 @@ vmutils-freebsd-amd64: \
vmutils-openbsd-amd64: \
vmagent-openbsd-amd64 \
vmalert-openbsd-amd64 \
vmalert-tool-openbsd-amd64 \
vmauth-openbsd-amd64 \
vmbackup-openbsd-amd64 \
vmrestore-openbsd-amd64 \
@@ -143,6 +157,7 @@ vmutils-openbsd-amd64: \
vmutils-windows-amd64: \
vmagent-windows-amd64 \
vmalert-windows-amd64 \
vmalert-tool-windows-amd64 \
vmauth-windows-amd64 \
vmbackup-windows-amd64 \
vmrestore-windows-amd64 \
@@ -174,6 +189,7 @@ vmutils-crossbuild: \
vmutils-windows-amd64
publish-release:
rm -rf bin/*
git checkout $(TAG) && LATEST_TAG=stable $(MAKE) release publish && \
git checkout $(TAG)-cluster && LATEST_TAG=cluster-stable $(MAKE) release publish && \
git checkout $(TAG)-enterprise && LATEST_TAG=enterprise-stable $(MAKE) release publish && \
@@ -181,7 +197,6 @@ publish-release:
release: \
release-victoria-metrics \
release-victoria-logs \
release-vmutils
release-victoria-metrics: \
@@ -339,6 +354,7 @@ release-vmutils-windows-amd64:
release-vmutils-goos-goarch: \
vmagent-$(GOOS)-$(GOARCH)-prod \
vmalert-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
@@ -347,6 +363,7 @@ release-vmutils-goos-goarch: \
tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOOS)-$(GOARCH)-prod \
vmalert-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
@@ -354,6 +371,7 @@ release-vmutils-goos-goarch: \
&& sha256sum vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOOS)-$(GOARCH)-prod \
vmalert-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
@@ -362,6 +380,7 @@ release-vmutils-goos-goarch: \
cd bin && rm -rf \
vmagent-$(GOOS)-$(GOARCH)-prod \
vmalert-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
@@ -370,6 +389,7 @@ release-vmutils-goos-goarch: \
release-vmutils-windows-goarch: \
vmagent-windows-$(GOARCH)-prod \
vmalert-windows-$(GOARCH)-prod \
vmalert-tool-windows-$(GOARCH)-prod \
vmauth-windows-$(GOARCH)-prod \
vmbackup-windows-$(GOARCH)-prod \
vmrestore-windows-$(GOARCH)-prod \
@@ -378,6 +398,7 @@ release-vmutils-windows-goarch: \
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
@@ -385,6 +406,7 @@ release-vmutils-windows-goarch: \
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
@@ -393,6 +415,7 @@ release-vmutils-windows-goarch: \
cd bin && rm -rf \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
@@ -437,7 +460,7 @@ benchmark-pure:
vendor-update:
go get -u -d ./lib/...
go get -u -d ./app/...
go mod tidy -compat=1.19
go mod tidy -compat=1.20
go mod vendor
app-local:
@@ -463,7 +486,7 @@ golangci-lint: install-golangci-lint
golangci-lint run
install-golangci-lint:
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.51.2
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.55.1
govulncheck: install-govulncheck
govulncheck ./...
@@ -514,3 +537,4 @@ docs-sync:
SRC=app/vmctl/README.md DST=docs/vmctl.md OLD_URL='/vmctl.html' ORDER=8 TITLE=vmctl $(MAKE) copy-docs
SRC=app/vmgateway/README.md DST=docs/vmgateway.md OLD_URL='/vmgateway.html' ORDER=9 TITLE=vmgateway $(MAKE) copy-docs
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md OLD_URL='/vmbackupmanager.html' ORDER=10 TITLE=vmbackupmanager $(MAKE) copy-docs
SRC=app/vmalert-tool/README.md DST=docs/vmalert-tool.md OLD_URL='' ORDER=12 TITLE=vmalert-tool $(MAKE) copy-docs

467
README.md
View File

@@ -13,7 +13,7 @@
VictoriaMetrics is a fast, cost-effective and scalable monitoring solution and time series database.
VictoriaMetrics is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
VictoriaMetrics is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest),
[Docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/), [Snap packages](https://snapcraft.io/victoriametrics)
and [source code](https://github.com/VictoriaMetrics/VictoriaMetrics).
@@ -29,7 +29,8 @@ If you have questions about VictoriaMetrics, then feel free asking them at [Vict
[Contact us](mailto:info@victoriametrics.com) if you need enterprise support for VictoriaMetrics.
See [features available in enterprise package](https://docs.victoriametrics.com/enterprise.html).
Enterprise binaries can be downloaded and evaluated for free
from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
VictoriaMetrics is developed at a fast pace, so it is recommended periodically checking the [CHANGELOG](https://docs.victoriametrics.com/CHANGELOG.html) and performing [regular upgrades](#how-to-upgrade-victoriametrics).
@@ -86,6 +87,7 @@ VictoriaMetrics has the following prominent features:
* [Arbitrary CSV data](#how-to-import-csv-data).
* [Native binary format](#how-to-import-data-in-native-format).
* [DataDog agent or DogStatsD](#how-to-send-data-from-datadog-agent).
* [NewRelic infrastructure agent](#how-to-send-data-from-newrelic-agent).
* [OpenTelemetry metrics format](#sending-data-via-opentelemetry).
* It supports powerful [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html), which can be used as a [statsd](https://github.com/statsd/statsd) alternative.
* It supports metrics [relabeling](#relabeling).
@@ -110,6 +112,7 @@ Case studies:
* [Brandwatch](https://docs.victoriametrics.com/CaseStudies.html#brandwatch)
* [CERN](https://docs.victoriametrics.com/CaseStudies.html#cern)
* [COLOPL](https://docs.victoriametrics.com/CaseStudies.html#colopl)
* [Criteo](https://docs.victoriametrics.com/CaseStudies.html#criteo)
* [Dig Security](https://docs.victoriametrics.com/CaseStudies.html#dig-security)
* [Fly.io](https://docs.victoriametrics.com/CaseStudies.html#flyio)
* [German Research Center for Artificial Intelligence](https://docs.victoriametrics.com/CaseStudies.html#german-research-center-for-artificial-intelligence)
@@ -135,7 +138,8 @@ See also [articles and slides about VictoriaMetrics from our users](https://docs
### Install
To quickly try VictoriaMetrics, just download [VictoriaMetrics executable](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or [Docker image](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and start it with the desired command-line flags.
To quickly try VictoriaMetrics, just download [VictoriaMetrics executable](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
or [Docker image](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and start it with the desired command-line flags.
See also [QuickStart guide](https://docs.victoriametrics.com/Quick-Start.html) for additional information.
VictoriaMetrics can also be installed via these installation methods:
@@ -152,7 +156,7 @@ VictoriaMetrics can also be installed via these installation methods:
The following command-line flags are used the most:
* `-storageDataPath` - VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in the current working directory.
* `-retentionPeriod` - retention for stored data. Older data is automatically deleted. Default retention is 1 month. The minimum retention period is 24h or 1d. See [the Retention section](#retention) for more details.
* `-retentionPeriod` - retention for stored data. Older data is automatically deleted. Default retention is 1 month (31 days). The minimum retention period is 24h or 1d. See [these docs](#retention) for more details.
Other flags have good enough default values, so set them only if you really need this. Pass `-help` to see [all the available flags with description and default values](#list-of-command-line-flags).
@@ -173,7 +177,8 @@ VictoriaMetrics is developed at a fast pace, so it is recommended periodically c
### Environment variables
All the VictoriaMetrics components allow referring environment variables in command-line flags via `%{ENV_VAR}` syntax.
All the VictoriaMetrics components allow referring environment variables in `yaml` configuration files (such as `-promscrape.config`)
and in command-line flags via `%{ENV_VAR}` syntax.
For example, `-metricsAuthKey=%{METRICS_AUTH_KEY}` is automatically expanded to `-metricsAuthKey=top-secret`
if `METRICS_AUTH_KEY=top-secret` environment variable exists at VictoriaMetrics startup.
This expansion is performed by VictoriaMetrics itself.
@@ -209,6 +214,61 @@ vi $SNAP_DATA/var/snap/victoriametrics/current/etc/victoriametrics-scrape-config
After changes were made, trigger config re-read with the command `curl 127.0.0.1:8428/-/reload`.
### Running as Windows service
In order to run VictoriaMetrics as a Windows service it is required to create a service configuration for [WinSW](https://github.com/winsw/winsw)
and then install it as a service according to the following guide:
1. Create a service configuration:
```xml
<service>
<id>VictoriaMetrics</id>
<name>VictoriaMetrics</name>
<description>VictoriaMetrics</description>
<executable>%BASE%\victoria-metrics-windows-amd64-prod.exe"</executable>
<onfailure action="restart" delay="10 sec"/>
<onfailure action="restart" delay="20 sec"/>
<resetfailure>1 hour</resetfailure>
<arguments>-envflag.enable</arguments>
<priority>Normal</priority>
<stoptimeout>15 sec</stoptimeout>
<stopparentprocessfirst>true</stopparentprocessfirst>
<startmode>Automatic</startmode>
<waithint>15 sec</waithint>
<sleeptime>1 sec</sleeptime>
<logpath>%BASE%\logs</logpath>
<log mode="roll">
<sizeThreshold>10240</sizeThreshold>
<keepFiles>8</keepFiles>
</log>
<env name="loggerFormat" value="json" />
<env name="loggerOutput" value="stderr" />
<env name="promscrape_config" value="C:\Program Files\victoria-metrics\promscrape.yml" />
</service>
```
1. Install WinSW by following this [documentation](https://github.com/winsw/winsw#download).
1. Install VictoriaMetrics as a service by running the following from elevated PowerShell:
```console
winsw install VictoriaMetrics.xml
Get-Service VictoriaMetrics | Start-Service
```
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3781) for more details.
## Prometheus setup
Add the following lines to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`) in order to send data to VictoriaMetrics:
@@ -270,7 +330,7 @@ too high memory consumption of Prometheus, then try to lower `max_samples_per_se
Keep in mind that these two params are tightly connected.
Read more about tuning remote write for Prometheus [here](https://prometheus.io/docs/practices/remote_write).
It is recommended upgrading Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases) or newer,
It is recommended upgrading Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases/latest) or newer,
since previous versions may have issues with `remote_write`.
Take a look also at [vmagent](https://docs.victoriametrics.com/vmagent.html)
@@ -279,7 +339,8 @@ which can be used as faster and less resource-hungry alternative to Prometheus.
## Grafana setup
Create [Prometheus datasource](http://docs.grafana.org/features/datasources/prometheus/) in Grafana with the following url:
Create [Prometheus datasource](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/)
in Grafana with the following url:
```url
http://<victoriametrics-addr>:8428
@@ -293,13 +354,18 @@ or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html).
Alternatively, use VictoriaMetrics [datasource plugin](https://github.com/VictoriaMetrics/grafana-datasource) with support of extra features.
See more in [description](https://github.com/VictoriaMetrics/grafana-datasource#victoriametrics-data-source-for-grafana).
Creating a datasource may require [specific permissions](https://grafana.com/docs/grafana/latest/administration/data-source-management/).
If you don't see an option to create a data source - try contacting system administrator.
## How to upgrade VictoriaMetrics
VictoriaMetrics is developed at a fast pace, so it is recommended periodically checking [the CHANGELOG page](https://docs.victoriametrics.com/CHANGELOG.html) and performing regular upgrades.
It is safe upgrading VictoriaMetrics to new versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise. It is safe skipping multiple versions during the upgrade unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise. It is recommended performing regular upgrades to the latest version, since it may contain important bug fixes, performance optimizations or new features.
It is safe upgrading VictoriaMetrics to new versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
It is safe skipping multiple versions during the upgrade unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
It is recommended performing regular upgrades to the latest version, since it may contain important bug fixes, performance optimizations or new features.
It is also safe downgrading to older versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise.
It is also safe downgrading to older versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
The following steps must be performed during the upgrade / downgrade procedure:
@@ -363,6 +429,8 @@ See the [example VMUI at VictoriaMetrics playground](https://play.victoriametric
* queries with the biggest average execution duration;
* queries that took the most summary time for execution.
This information is obtained from the `/api/v1/status/top_queries` HTTP endpoint.
## Active queries
[VMUI](#vmui) provides `active queries` tab, which shows currently execute queries.
@@ -372,12 +440,14 @@ It provides the following information per each query:
- The duration of the query execution.
- The client address, who initiated the query execution.
This information is obtained from the `/api/v1/status/active_queries` HTTP endpoint.
## Metrics explorer
[VMUI](#vmui) provides an ability to explore metrics exported by a particular `job` / `instance` in the following way:
1. Open the `vmui` at `http://victoriametrics:8428/vmui/`.
1. Click the `Explore metrics` tab.
1. Click the `Explore Prometheus metrics` tab.
1. Select the `job` you want to explore.
1. Optionally select the `instance` for the selected job to explore.
1. Select metrics you want to explore and compare.
@@ -403,14 +473,16 @@ matching the specified [series selector](https://prometheus.io/docs/prometheus/l
Cardinality explorer is built on top of [/api/v1/status/tsdb](#tsdb-stats).
See [cardinality explorer playground](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/#/cardinality).
See the example of using the cardinality explorer [here](https://victoriametrics.com/blog/cardinality-explorer/).
## Cardinality explorer statistic inaccuracy
In [cluster version of VictoriaMetrics](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) each vmstorage tracks the stored time series individually.
vmselect requests stats via [/api/v1/status/tsdb](#tsdb-stats) API from each vmstorage node and merges the results by summing per-series stats.
This may lead to inflated values when samples for the same time series are spread across multiple vmstorage nodes
due to [replication](#replication) or [rerouting](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html?highlight=re-routes#cluster-availability).
See [cardinality explorer playground](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/#/cardinality).
See the example of using the cardinality explorer [here](https://victoriametrics.com/blog/cardinality-explorer/).
## How to apply new config to VictoriaMetrics
VictoriaMetrics is configured via command-line flags, so it must be restarted when new command-line flags should be applied:
@@ -472,7 +544,7 @@ dd_url: http://victoriametrics:8428/datadog
</div>
vmagent also can accept Datadog metrics format. Depending on where vmagent will forward data,
pick [single-node or cluster URL]((https://docs.victoriametrics.com/url-examples.html#datadog)) formats.
pick [single-node or cluster URL](https://docs.victoriametrics.com/url-examples.html#datadog) formats.
### Sending metrics to Datadog and VictoriaMetrics
@@ -615,6 +687,28 @@ Some plugins for Telegraf such as [fluentd](https://github.com/fangli/fluent-plu
or [Juniper/jitmon](https://github.com/Juniper/jtimon) send `SHOW DATABASES` query to `/query` and expect a particular database name in the response.
Comma-separated list of expected databases can be passed to VictoriaMetrics via `-influx.databaseNames` command-line flag.
### How to send data in InfluxDB v2 format
VictoriaMetrics exposes endpoint for InfluxDB v2 HTTP API at `/influx/api/v2/write` and `/api/v2/write`.
In order to write data with InfluxDB line protocol to local VictoriaMetrics using `curl`:
<div class="with-copy" markdown="1">
```console
curl -d 'measurement,tag1=value1,tag2=value2 field1=123,field2=1.23' -X POST 'http://localhost:8428/api/v2/write'
```
</div>
The `/api/v1/export` endpoint should return the following response:
```json
{"metric":{"__name__":"measurement_field1","tag1":"value1","tag2":"value2"},"values":[123],"timestamps":[1695902762311]}
{"metric":{"__name__":"measurement_field2","tag1":"value1","tag2":"value2"},"values":[1.23],"timestamps":[1695902762311]}
```
## How to send data from Graphite-compatible agents such as [StatsD](https://github.com/etsy/statsd)
Enable Graphite receiver in VictoriaMetrics by setting `-graphiteListenAddr` command line flag. For instance,
@@ -765,6 +859,79 @@ The `/api/v1/export` endpoint should return the following response:
Extra labels may be added to all the imported time series by passing `extra_label=name=value` query args.
For example, `/api/put?extra_label=foo=bar` would add `{foo="bar"}` label to all the ingested metrics.
## How to send data from NewRelic agent
VictoriaMetrics accepts data from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent)
at `/newrelic/infra/v2/metrics/events/bulk` HTTP path.
VictoriaMetrics receives [Events](https://docs.newrelic.com/docs/infrastructure/manage-your-data/data-instrumentation/default-infrastructure-monitoring-data/#infrastructure-events)
from NewRelic agent at the given path, transforms them to [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples)
according to [these docs](#newrelic-agent-data-mapping) before storing the raw samples to the database.
You need passing `COLLECTOR_URL` and `NRIA_LICENSE_KEY` environment variables to NewRelic infrastructure agent in order to send the collected metrics to VictoriaMetrics.
The `COLLECTOR_URL` must point to `/newrelic` HTTP endpoint at VictoriaMetrics, while the `NRIA_LICENSE_KEY` must contain NewRelic license key,
which can be obtained [here](https://newrelic.com/signup).
For example, if VictoriaMetrics runs at `localhost:8428`, then the following command can be used for running NewRelic infrastructure agent:
```console
COLLECTOR_URL="http://localhost:8428/newrelic" NRIA_LICENSE_KEY="NEWRELIC_LICENSE_KEY" ./newrelic-infra
```
### NewRelic agent data mapping
VictoriaMetrics maps [NewRelic Events](https://docs.newrelic.com/docs/infrastructure/manage-your-data/data-instrumentation/default-infrastructure-monitoring-data/#infrastructure-events)
to [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) in the following way:
1. Every numeric field is converted into a raw sample with the corresponding name.
1. The `eventType` and all the other fields with `string` value type are attached to every raw sample as [metric labels](https://docs.victoriametrics.com/keyConcepts.html#labels).
1. The `timestamp` field is used as timestamp for the ingested [raw sample](https://docs.victoriametrics.com/keyConcepts.html#raw-samples).
The `timestamp` field may be specified either in seconds or in milliseconds since the [Unix Epoch](https://en.wikipedia.org/wiki/Unix_time).
If the `timestamp` field is missing, then the raw sample is stored with the current timestamp.
For example, let's import the following NewRelic Events request to VictoriaMetrics:
```json
[
{
"Events":[
{
"eventType":"SystemSample",
"entityKey":"macbook-pro.local",
"cpuPercent":25.056660790748904,
"cpuUserPercent":8.687987912389374,
"cpuSystemPercent":16.36867287835953,
"cpuIOWaitPercent":0,
"cpuIdlePercent":74.94333920925109,
"cpuStealPercent":0,
"loadAverageOneMinute":5.42333984375,
"loadAverageFiveMinute":4.099609375,
"loadAverageFifteenMinute":3.58203125
}
]
}
]
```
Save this JSON into `newrelic.json` file and then use the following command in order to import it into VictoriaMetrics:
```console
curl -X POST -H 'Content-Type: application/json' --data-binary @newrelic.json http://localhost:8428/newrelic/infra/v2/metrics/events/bulk
```
Let's fetch the ingested data via [data export API](#how-to-export-data-in-json-line-format):
```console
curl http://localhost:8428/api/v1/export -d 'match={eventType="SystemSample"}'
{"metric":{"__name__":"cpuStealPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[0],"timestamps":[1697407970000]}
{"metric":{"__name__":"loadAverageFiveMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[4.099609375],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuIOWaitPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[0],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuSystemPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[16.368672878359],"timestamps":[1697407970000]}
{"metric":{"__name__":"loadAverageOneMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[5.42333984375],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuUserPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[8.687987912389],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuIdlePercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[74.9433392092],"timestamps":[1697407970000]}
{"metric":{"__name__":"loadAverageFifteenMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[3.58203125],"timestamps":[1697407970000]}
{"metric":{"__name__":"cpuPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[25.056660790748],"timestamps":[1697407970000]}
```
## Prometheus querying API usage
VictoriaMetrics supports the following handlers from [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/):
@@ -829,7 +996,7 @@ Additionally, VictoriaMetrics provides the following handlers:
* `/api/v1/series/count` - returns the total number of time series in the database. Some notes:
* the handler scans all the inverted index, so it can be slow if the database contains tens of millions of time series;
* the handler may count [deleted time series](#how-to-delete-time-series) additionally to normal time series due to internal implementation restrictions;
* `/api/v1/status/active_queries` - returns a list of currently running queries.
* `/api/v1/status/active_queries` - returns the list of currently running queries. This list is also available at [`active queries` page at VMUI](#active-queries).
* `/api/v1/status/top_queries` - returns the following query lists:
* the most frequently executed queries - `topByCount`
* queries with the biggest average execution duration - `topByAvgDuration`
@@ -839,6 +1006,8 @@ Additionally, VictoriaMetrics provides the following handlers:
For example, request to `/api/v1/status/top_queries?topN=5&maxLifetime=30s` would return up to 5 queries per list, which were executed during the last 30 seconds.
VictoriaMetrics tracks the last `-search.queryStats.lastQueriesCount` queries with durations at least `-search.queryStats.minQueryDuration`.
See also [`top queries` page at VMUI](#top-queries).
### Timestamp formats
VictoriaMetrics accepts the following formats for `time`, `start` and `end` query args
@@ -907,14 +1076,14 @@ VictoriaMetrics supports the following handlers from [Graphite Tags API](https:/
## How to build from sources
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) or
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) instead of building VictoriaMetrics
from sources. Building from sources is reasonable when developing additional features specific
to your needs or when testing bugfixes.
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
1. Run `make victoria-metrics` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `victoria-metrics` binary and puts it into the `bin` folder.
@@ -930,7 +1099,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
1. Run `make victoria-metrics-linux-arm` or `make victoria-metrics-linux-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `victoria-metrics-linux-arm` or `victoria-metrics-linux-arm64` binary respectively and puts it into the `bin` folder.
@@ -944,7 +1113,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
`Pure Go` mode builds only Go code without [cgo](https://golang.org/cmd/cgo/) dependencies.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
1. Run `make victoria-metrics-pure` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `victoria-metrics-pure` binary and puts it into the `bin` folder.
@@ -962,6 +1131,18 @@ For example, the following command builds the image on top of [scratch](https://
ROOT_IMAGE=scratch make package-victoria-metrics
```
#### Building VictoriaMetrics with Podman
VictoriaMetrics can be built with Podman in either rootful or rootless mode.
When building via rootlful Podman, simply add `DOCKER=podman` to the relevant `make` commandline. To build
via rootless Podman, add `DOCKER=podman DOCKER_RUN="podman run --userns=keep-id"` to the `make`
commandline.
For example: `make victoria-metrics-pure DOCKER=podman DOCKER_RUN="podman run --userns=keep-id"`
Note that `production` builds are not supported via Podman becuase Podman does not support `buildx`.
## Start with docker-compose
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
@@ -988,6 +1169,15 @@ Snapshots are created under `<-storageDataPath>/snapshots` directory, where `<-s
is the command-line flag value. Snapshots can be archived to backup storage at any time
with [vmbackup](https://docs.victoriametrics.com/vmbackup.html).
Snapshots consist of a mix of hard-links and soft-links to various files and directories inside `-storageDataPath`.
See [this article](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282)
for more details. This adds some restrictions on what can be done with the contents of `<-storageDataPath>/snapshots` directory:
- Do not delete subdirectories inside `<-storageDataPath>/snapshots` with `rm` or similar commands, since this will leave some snapshot data undeleted.
Prefer using the `/snapshot/delete` API for deleting snapshot. See below for more details about this API.
- Do not copy subdirectories inside `<-storageDataPath>/snapshot` with `cp`, `rsync` or similar commands, since there are high chances
that these commands won't copy some data stored in the snapshot. Prefer using [vmbackup](https://docs.victoriametrics.com/vmbackup.html) for making copies of snapshot data.
The `http://<victoriametrics-addr>:8428/snapshot/list` page contains the list of available snapshots.
Navigate to `http://<victoriametrics-addr>:8428/snapshot/delete?snapshot=<snapshot-name>` in order
@@ -1065,7 +1255,9 @@ VictoriaMetrics provides the following handlers for exporting data:
Send a request to `http://<victoriametrics-addr>:8428/api/v1/export?match[]=<timeseries_selector_for_export>`,
where `<timeseries_selector_for_export>` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)
for metrics to export. Use `{__name__!=""}` selector for fetching all the time series.
The response would contain all the data for the selected time series in [JSON streaming format](http://ndjson.org/).
The response would contain all the data for the selected time series in JSON line format - see [these docs](#json-line-format) for details on this format.
Each JSON line contains samples for a single time series. An example output:
```json
@@ -1195,6 +1387,8 @@ check for changes in `vm_rows_invalid_total` (exported by server side) metric.
### How to import data in JSON line format
VictoriaMetrics accepts metrics data in JSON line format at `/api/v1/import` endpoint. See [these docs](#json-line-format) for details on this format.
Example for importing data obtained via [/api/v1/export](#how-to-export-data-in-json-line-format):
```console
@@ -1359,13 +1553,54 @@ Note that it could be required to flush response cache after importing historica
VictoriaMetrics also may scrape Prometheus targets - see [these docs](#how-to-scrape-prometheus-exporters-such-as-node-exporter).
## Sending data via OpenTelemetry
### Sending data via OpenTelemetry
VictoriaMetrics supports data ingestion via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md) at `/opentelemetry/api/v1/push` path.
VictoriaMetrics expects `protobuf`-encoded requests at `/opentelemetry/api/v1/push`.
Set HTTP request header `Content-Encoding: gzip` when sending gzip-compressed data to `/opentelemetry/api/v1/push`.
## JSON line format
VictoriaMetrics accepts data in JSON line format at [/api/v1/import](#how-to-import-data-in-json-line-format)
and exports data in this format at [/api/v1/export](#how-to-export-data-in-json-line-format).
The format follows [JSON streaming concept](http://ndjson.org/), e.g. each line contains JSON object with metrics data in the following format:
```
{
// metric contans metric name plus labels for a particular time series
"metric":{
"__name__": "metric_name", // <- this is metric name
// Other labels for the time series
"label1": "value1",
"label2": "value2",
...
"labelN": "valueN"
},
// values contains raw sample values for the given time series
"values": [1, 2.345, -678],
// timestamps contains raw sample UNIX timestamps in milliseconds for the given time series
// every timestamp is associated with the value at the corresponding position
"timestamps": [1549891472010,1549891487724,1549891503438]
}
```
Note that every JSON object must be written in a single line, e.g. all the newline chars must be removed from it.
Every line length is limited by the value passed to `-import.maxLineLen` command-line flag (by default this is 100MB).
It is recommended passing 1K-10K samples per line for achieving the maximum data ingestion performance at [/api/v1/import](#how-to-import-data-in-json-line-format).
Too long JSON lines may increase RAM usage at VictoriaMetrics side.
It is OK to split [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples)
for the same [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series) across multiple lines.
The number of lines in JSON line document can be arbitrary.
## Relabeling
VictoriaMetrics supports Prometheus-compatible relabeling for all the ingested metrics if `-relabelConfig` command-line flag points
@@ -1465,43 +1700,44 @@ See also [cardinality limiter](#cardinality-limiter) and [capacity planning docs
## High availability
* Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
* Pass addresses of these instances to [vmagent](https://docs.victoriametrics.com/vmagent.html) via `-remoteWrite.url` command-line flag:
The general approach for achieving high availability is the following:
- to run two identically configured VictoriaMetrics instances in distinct datacenters (availability zones)
- to store the collected data simultaneously into these instances via [vmagent](https://docs.victoriametrics.com/vmagent.html) or Prometheus
- to query the first VictoriaMetrics instance and to fail over to the second instance when the first instance becomes temporarily unavailable.
Such a setup guarantees that the collected data isn't lost when one of VictoriaMetrics instance becomes unavailable.
The collected data continues to be written to the available VictoriaMetrics instance, so it should be available for querying.
Both [vmagent](https://docs.victoriametrics.com/vmagent.html) and Prometheus buffer the collected data locally if they cannot send it
to the configured remote storage. So the collected data will be written to the temporarily unavailable VictoriaMetrics instance
after it becomes available.
If you use [vmagent](https://docs.victoriametrics.com/vmagent.html) for storing the data into VictoriaMetrics,
then it can be configured with multiple `-remoteWrite.url` command-line flags, where every flag points to the VictoriaMetrics
instance in a particular availability zone, in order to replicate the collected data to all the VictoriaMetrics instances.
For example, the following command instructs `vmagent` to replicate data to `vm-az1` and `vm-az2` instances of VictoriaMetrics:
```console
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
/path/to/vmagent \
-remoteWrite.url=http://<vm-az1>:8428/api/v1/write \
-remoteWrite.url=http://<vm-az2>:8428/api/v1/write
```
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
If you use Prometheus for collecting and writing the data to VictoriaMetrics,
then the following [`remote_write`](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section
in Prometheus config can be used for replicating the collected data to `vm-az1` and `vm-az2` VictoriaMetrics instances:
```yml
remote_write:
- url: http://<victoriametrics-addr-1>:8428/api/v1/write
queue_config:
max_samples_per_send: 10000
# ...
- url: http://<victoriametrics-addr-N>:8428/api/v1/write
queue_config:
max_samples_per_send: 10000
- url: http://<vm-az1>:8428/api/v1/write
- url: http://<vm-az2>:8428/api/v1/write
```
* Apply the updated config:
It is recommended to use [vmagent](https://docs.victoriametrics.com/vmagent.html) instead of Prometheus for highly loaded setups,
since it uses lower amounts of RAM, CPU and network bandwidth than Prometheus.
```console
kill -HUP `pidof prometheus`
```
It is recommended to use [vmagent](https://docs.victoriametrics.com/vmagent.html) instead of Prometheus for highly loaded setups.
* Now Prometheus should write data into all the configured `remote_write` urls in parallel.
* Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
* Set up Prometheus datasource in Grafana that points to Promxy.
If you have Prometheus HA pairs with replicas `r1` and `r2` in each pair, then configure each `r1`
to write data to `victoriametrics-addr-1`, while each `r2` should write data to `victoriametrics-addr-2`.
Another option is to write data simultaneously from Prometheus HA pair to a pair of VictoriaMetrics instances
with the enabled de-duplication. See [this section](#deduplication) for details.
If you use identically configured [vmagent](https://docs.victoriametrics.com/vmagent.html) instances for collecting the same data
and sending it to VictoriaMetrics, then do not forget enabling [deduplication](#deduplication) at VictoriaMetrics side.
## Deduplication
@@ -1609,8 +1845,8 @@ See also [how to work with snapshots](#how-to-work-with-snapshots).
## Retention
Retention is configured with the `-retentionPeriod` command-line flag, which takes a number followed by a time unit
character - `h(ours)`, `d(ays)`, `w(eeks)`, `y(ears)`. If the time unit is not specified, a month is assumed.
For instance, `-retentionPeriod=3` means that the data will be stored for 3 months and then deleted.
character - `h(ours)`, `d(ays)`, `w(eeks)`, `y(ears)`. If the time unit is not specified, a month (31 days) is assumed.
For instance, `-retentionPeriod=3` means that the data will be stored for 3 months (93 days) and then deleted.
The default retention period is one month. The **minimum retention** period is 24h or 1d.
Data is split in per-month partitions inside `<-storageDataPath>/data/{small,big}` folders.
@@ -1653,9 +1889,10 @@ See [these docs](https://docs.victoriametrics.com/guides/guide-vmcluster-multipl
which allow configuring multiple retentions for distinct sets of time series matching the configured [series filters](https://docs.victoriametrics.com/keyConcepts.html#filtering)
via `-retentionFilter` command-line flag. This flag accepts `filter:duration` options, where `filter` must be
a valid [series filter](https://docs.victoriametrics.com/keyConcepts.html#filtering), while the `duration`
must contain valid [retention](#retention) for time series matching the given `filter`. If series doesn't match
any configured `-retentionFilter`, then the retention configured via [-retentionPeriod](#retention) command-line flag is applied to it.
If series matches multiple configured retention filters, then the smallest retention is applied.
must contain valid [retention](#retention) for time series matching the given `filter`.
The `duration` of the `-retentionFilter` must be lower or equal to [-retentionPeriod](#retention) flag value.
If series doesn't match any configured `-retentionFilter`, then the retention configured via [-retentionPeriod](#retention)
command-line flag is applied to it. If series matches multiple configured retention filters, then the smallest retention is applied.
For example, the following config sets 3 days retention for time series with `team="juniors"` label,
30 days retention for time series with `env="dev"` or `env="staging"` label and 1 year retention for the remaining time series:
@@ -1676,7 +1913,8 @@ to historical data.
See [how to configure multiple retentions in VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#retention-filters).
Retention filters can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
Retention filters can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
## Downsampling
@@ -1686,14 +1924,34 @@ Retention filters can be evaluated for free by downloading and using enterprise
* `-downsampling.period=30d:5m,180d:1h` instructs VictoriaMetrics to deduplicate samples older than 30 days with 5 minutes interval and to deduplicate samples older than 180 days with 1 hour interval.
Downsampling is applied independently per each time series. It can reduce disk space usage and improve query performance if it is applied to time series with big number of samples per each series. The downsampling doesn't improve query performance if the database contains big number of time series with small number of samples per each series (aka [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate)), since downsampling doesn't reduce the number of time series. So the majority of time is spent on searching for the matching time series.
It is possible to use [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html) in vmagent or recording rules in [vmalert](https://docs.victoriametrics.com/vmalert.html) in order to [reduce the number of time series](https://docs.victoriametrics.com/vmalert.html#downsampling-and-aggregation-via-vmalert).
Downsampling is applied independently per each time series and leaves a single [raw sample](https://docs.victoriametrics.com/keyConcepts.html#raw-samples)
with the biggest [timestamp](https://en.wikipedia.org/wiki/Unix_time) on the configured interval, in the same way as [deduplication](#deduplication) does.
It works the best for [counters](https://docs.victoriametrics.com/keyConcepts.html#counter) and [histograms](https://docs.victoriametrics.com/keyConcepts.html#histogram),
as their values are always increasing. But downsampling [gauges](https://docs.victoriametrics.com/keyConcepts.html#gauge)
and [summaries](https://docs.victoriametrics.com/keyConcepts.html#summary)
would mean losing the changes within the downsampling interval. Please note, you can use [recording rules](https://docs.victoriametrics.com/vmalert.html#rules)
or [steaming aggregation](https://docs.victoriametrics.com/stream-aggregation.html)
to apply custom aggregation functions, like min/max/avg etc., in order to make gauges more resilient to downsampling.
Downsampling can reduce disk space usage and improve query performance if it is applied to time series with big number
of samples per each series. The downsampling doesn't improve query performance if the database contains big number
of time series with small number of samples per each series (aka [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate)),
since downsampling doesn't reduce the number of time series. In this case the majority of query time is spent on searching for the matching time series
instead of processing the found samples.
It is possible to use [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html) in [vmagent](https://docs.victoriametrics.com/vmagent.html)
or recording rules in [vmalert](https://docs.victoriametrics.com/vmalert.html) in order to
[reduce the number of time series](https://docs.victoriametrics.com/vmalert.html#downsampling-and-aggregation-via-vmalert).
Downsampling happens during [background merges](https://docs.victoriametrics.com/#storage)
and can't be performed if there is not enough of free disk space or if vmstorage
is in [read-only mode](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#readonly-mode).
The downsampling can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
Please, note that intervals of `-downsampling.period` must be multiples of each other.
In case [deduplication](https://docs.victoriametrics.com/#deduplication) is enabled value of `-dedup.minScrapeInterval` must also be multiple of `-downsampling.period` intervals.
This is required to ensure consistency of deduplication and downsampling results.
The downsampling can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
## Multi-tenancy
@@ -1740,6 +1998,8 @@ VictoriaMetrics provides the following security-related command-line flags:
* `-flagsAuthKey` for protecting `/flags` endpoint.
* `-pprofAuthKey` for protecting `/debug/pprof/*` endpoints, which can be used for [profiling](#profiling).
* `-denyQueryTracing` for disallowing [query tracing](#query-tracing).
* `-http.header.hsts`, `-http.header.csp`, and `-http.header.frameOptions` for serving `Strict-Transport-Security`, `Content-Security-Policy`
and `X-Frame-Options` HTTP response headers.
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`. This protects from unexpected requests from untrusted network interfaces.
@@ -1756,7 +2016,7 @@ and [the general security page at VictoriaMetrics website](https://victoriametri
The only option is increasing the limit on [the number of open files in the OS](https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a).
The recommendation is not specific for VictoriaMetrics only but also for any service which handles many HTTP connections and stores data on disk.
* VictoriaMetrics is a write-heavy application and its performance depends on disk performance. So be careful with other
applications or utilities (like [fstrim](http://manpages.ubuntu.com/manpages/bionic/man8/fstrim.8.html))
applications or utilities (like [fstrim](https://manpages.ubuntu.com/manpages/lunar/en/man8/fstrim.8.html))
which could [exhaust disk resources](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1521).
* The recommended filesystem is `ext4`, the recommended persistent storage is [persistent HDD-based disk on GCP](https://cloud.google.com/compute/docs/disks/#pdspecs),
since it is protected from hardware failures via internal replication and it can be [resized on the fly](https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd).
@@ -1785,9 +2045,9 @@ Graphs on the dashboards contain useful hints - hover the `i` icon in the top le
We recommend setting up [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts)
via [vmalert](https://docs.victoriametrics.com/vmalert.html) or via Prometheus.
VictoriaMetrics exposes currently running queries and their execution times at `/api/v1/status/active_queries` page.
VictoriaMetrics exposes currently running queries and their execution times at [`active queries` page](#active-queries).
VictoriaMetrics exposes queries, which take the most time to execute, at `/api/v1/status/top_queries` page.
VictoriaMetrics exposes queries, which take the most time to execute, at [`top queries` page](#top-queries).
See also [VictoriaMetrics Monitoring](https://victoriametrics.com/blog/victoriametrics-monitoring/)
and [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting.html).
@@ -1922,7 +2182,7 @@ and [cardinality explorer docs](#cardinality-explorer).
* It is recommended inspecting logs during troubleshooting, since they may contain useful information.
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest),
since the encountered issue could be already fixed there.
* It is recommended to have at least 50% of spare resources for CPU, disk IO and RAM, so VictoriaMetrics could handle short spikes in the workload without performance issues.
@@ -1932,9 +2192,11 @@ and [cardinality explorer docs](#cardinality-explorer).
has at least 20% of free space. The remaining amount of free space
can be [monitored](#monitoring) via `vm_free_disk_space_bytes` metric. The total size of data
stored on the disk can be monitored via sum of `vm_data_size_bytes` metrics.
See also `vm_merge_need_free_disk_space` metrics, which are set to values higher than 0
if background merge cannot be initiated due to free disk space shortage. The value shows the number of per-month partitions,
which would start background merge if they had more free disk space.
* If you run VictoriaMetrics on a host with 16 or more CPU cores, then it may be needed to tune the `-search.maxWorkersPerQuery` command-line flag
in order to improve query performance. If VictoriaMetrics serves big number of concurrent `select` queries, then try reducing the value for this flag.
If VcitoriaMetrics serves heavy queries, which select `>10K` of [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series) and/or process `>100M`
of [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) per query, then try setting the value for this flag to the number of available CPU cores.
* VictoriaMetrics buffers incoming data in memory for up to a few seconds before flushing it to persistent storage.
This may lead to the following "issues":
@@ -1972,15 +2234,19 @@ and [cardinality explorer docs](#cardinality-explorer).
This suppresses default gap filling algorithm used by VictoriaMetrics - by default it assumes
each time series is continuous instead of discrete, so it fills gaps between real samples with regular intervals.
* Metrics and labels leading to [high cardinality](https://docs.victoriametrics.com/FAQ.html#what-is-high-cardinality) or [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate) can be determined via [cardinality explorer](#cardinality-explorer) and via [/api/v1/status/tsdb](#tsdb-stats) endpoint.
* Metrics and labels leading to [high cardinality](https://docs.victoriametrics.com/FAQ.html#what-is-high-cardinality)
or [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate) can be determined
via [cardinality explorer](#cardinality-explorer) and via [/api/v1/status/tsdb](#tsdb-stats) endpoint.
* New time series can be logged if `-logNewSeries` command-line flag is passed to VictoriaMetrics.
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag
and drops superflouos labels. This prevents from ingesting metrics with too many labels.
It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
* If you store Graphite metrics like `foo.bar.baz` in VictoriaMetrics, then `{__graphite__="foo.*.baz"}` filter can be used for selecting such metrics. See [these docs](#selecting-graphite-metrics) for details.
* If you store Graphite metrics like `foo.bar.baz` in VictoriaMetrics, then `{__graphite__="foo.*.baz"}` filter can be used for selecting such metrics.
See [these docs](#selecting-graphite-metrics) for details. You can also query Graphite metrics with [Graphite querying API](#graphite-render-api-usage).
* VictoriaMetrics ignores `NaN` values during data ingestion.
@@ -2122,7 +2388,8 @@ See also [high availability docs](#high-availability) and [backup docs](#backups
VictoriaMetrics supports backups via [vmbackup](https://docs.victoriametrics.com/vmbackup.html)
and [vmrestore](https://docs.victoriametrics.com/vmrestore.html) tools.
We also provide [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html) tool for enterprise subscribers.
Enterprise binaries can be downloaded and evaluated for free from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
Enterprise binaries can be downloaded and evaluated for free from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
## vmalert
@@ -2197,12 +2464,14 @@ Contact us with any questions regarding VictoriaMetrics at [info@victoriametrics
Feel free asking any questions regarding VictoriaMetrics:
* [slack](https://slack.victoriametrics.com/)
* [linkedin](https://www.linkedin.com/company/victoriametrics/)
* [reddit](https://www.reddit.com/r/VictoriaMetrics/)
* [telegram-en](https://t.me/VictoriaMetrics_en)
* [telegram-ru](https://t.me/VictoriaMetrics_ru1)
* [google groups](https://groups.google.com/forum/#!forum/victorametrics-users)
* [Slack](https://slack.victoriametrics.com/)
* [Twitter](https://twitter.com/VictoriaMetrics/)
* [Linkedin](https://www.linkedin.com/company/victoriametrics/)
* [Reddit](https://www.reddit.com/r/VictoriaMetrics/)
* [Telegram-en](https://t.me/VictoriaMetrics_en)
* [Telegram-ru](https://t.me/VictoriaMetrics_ru1)
* [Google groups](https://groups.google.com/forum/#!forum/victorametrics-users)
* [Mastodon](https://mastodon.social/@victoriametrics/)
If you like VictoriaMetrics and want to contribute, then we need the following:
@@ -2262,6 +2531,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
```
-bigMergeConcurrency int
Deprecated: this flag does nothing. Please use -smallMergeConcurrency for controlling the concurrency of background merges. See https://docs.victoriametrics.com/#storage
-blockcache.missesBeforeCaching int
The number of cache misses before putting the block into cache. Higher values may reduce indexdb/dataBlocks cache size at the cost of higher CPU and disk read usage (default 2)
-cacheExpireDuration duration
Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s)
-configAuthKey string
@@ -2293,7 +2564,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-finalMergeDelay duration
The delay before starting final merge for per-month partition after no new data is ingested into it. Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. Zero value disables final merge
-flagsAuthKey string
@@ -2314,6 +2587,12 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.header.csp string
Value for 'Content-Security-Policy' header
-http.header.frameOptions string
Value for 'X-Frame-Options' header
-http.header.hsts string
Value for 'Strict-Transport-Security' header
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -2363,6 +2642,12 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Whether to disable caches for interned strings. This may reduce memory usage at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringCacheExpireDuration and -internStringMaxLen
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-license string
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-logNewSeries
Whether to log new series. This option is for debug purposes only. It can lead to performance issues when big number of new series are ingested into VictoriaMetrics
-loggerDisableTimestamps
@@ -2375,6 +2660,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Allows renaming fields in JSON formatted logs. Example: "ts:timestamp,msg:message" renames "ts" to "timestamp" and "msg" to "message". Supported fields: ts, level, caller, msg
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerMaxArgLen int
The maximum length of a single logged argument. Longer arguments are replaced with 'arg_start..arg_end', where 'arg_start' and 'arg_end' is prefix and suffix of the arg with the length not exceeding -loggerMaxArgLen / 2 (default 500)
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-loggerTimezone string
@@ -2382,7 +2669,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-maxConcurrentInserts int
The maximum number of concurrent insert requests. The default value should work for most cases, since it minimizes memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
The maximum number of concurrent insert requests. Default value should work for most cases, since it minimizes the memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
-maxInsertRequestSize size
The maximum size in bytes of a single Prometheus remote_write API request
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432)
@@ -2397,6 +2684,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from the OS page cache which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-newrelic.maxInsertRequestSize size
The maximum size in bytes of a single NewRelic request to /newrelic/infra/v2/metrics/events/bulk
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 67108864)
-opentsdbHTTPListenAddr string
TCP address to listen for OpenTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty. See also -opentsdbHTTPListenAddr.useProxyProtocol
-opentsdbHTTPListenAddr.useProxyProtocol
@@ -2420,14 +2710,16 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. See also -cacheExpireDuration (default 0.1)
-promscrape.azureSDCheckInterval duration
Interval for checking for changes in Azure. This works only if azure_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#azure_sd_configs for details (default 1m0s)
-promscrape.cluster.memberLabel string
If non-empty, then the label with this name and the -promscrape.cluster.memberNum value is added to all the scraped metrics. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
-promscrape.cluster.memberNum string
The number of number in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
The number of vmagent instance in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name. See also -promscrape.cluster.memberLabel . See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default "0")
-promscrape.cluster.membersCount int
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
-promscrape.cluster.name string
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
-promscrape.cluster.replicationFactor int
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
-promscrape.config string
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. The path can point to local file and to http url. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
-promscrape.config.dryRun
@@ -2435,7 +2727,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-promscrape.config.strictParse
Whether to deny unsupported fields in -promscrape.config . Set to false in order to silently skip unsupported fields (default true)
-promscrape.configCheckInterval duration
Interval for checking for changes in '-promscrape.config' file. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes
Interval for checking for changes in -promscrape.config file. By default, the checking is disabled. See how to reload -promscrape.config file at https://docs.victoriametrics.com/vmagent.html#configuration-update
-promscrape.consul.waitTime duration
Wait time used by Consul service discovery. Default value is used if not set
-promscrape.consulSDCheckInterval duration
@@ -2522,7 +2814,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Supports an array of values separated by comma or specified via multiple flags.
-retentionPeriod value
Data with timestamps outside the retentionPeriod is automatically deleted. The minimum retentionPeriod is 24h or 1d. See also -retentionFilter
The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 1)
The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 1)
-retentionTimezoneOffset duration
The offset for performing indexdb rotation. If set to 0, then the indexdb rotation is performed at 4am UTC time per each -retentionPeriod. If set to 2h, then the indexdb rotation is performed at 4am EET time (the timezone with +2h offset)
-search.cacheTimestampOffset duration
@@ -2538,7 +2830,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-search.latencyOffset duration
The time when data points become visible in query results after the collection. It can be overridden on per-query basis via latency_offset arg. Too small value can result in incomplete last points for query results (default 30s)
-search.logQueryMemoryUsage size
Log queries, which require more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
Log query and increment vm_memory_intensive_queries_total metric each time the query requires more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 0)
-search.logSlowQueryDuration duration
Log queries with execution time exceeding this value. Zero disables slow query logging. See also -search.logQueryMemoryUsage (default 5s)
@@ -2596,8 +2888,13 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
The maximum number of tag values returned from /api/v1/label/<label_name>/values (default 100000)
-search.maxUniqueTimeseries int
The maximum number of unique time series, which can be selected during /api/v1/query and /api/v1/query_range queries. This option allows limiting memory usage (default 300000)
-search.maxWorkersPerQuery int
The maximum number of CPU cores a single query can use. The default value should work good for most cases. The flag can be set to lower values for improving performance of big number of concurrently executed queries. The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). There is no sense in setting this flag to values bigger than the number of CPU cores available on the system (default 4)
-search.minStalenessInterval duration
The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval'
-search.minWindowForInstantRollupOptimization value
Enable cache-based optimization for repeated queries to /api/v1/query (aka instant queries), which contain rollup functions with lookbehind window exceeding the given value
The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 6h)
-search.noStaleMarkers
Set this flag to true if the database doesn't contain Prometheus stale markers, so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets
-search.queryStats.lastQueriesCount int
@@ -2624,7 +2921,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period
-snapshotsMaxAge value
Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted
The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
-sortLabels
Whether to sort labels for incoming samples before writing them to storage. This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit
-storage.cacheSizeIndexDBDataBlocks size

View File

@@ -5,8 +5,8 @@
| Version | Supported |
|---------|--------------------|
| [latest release](https://docs.victoriametrics.com/CHANGELOG.html) | :white_check_mark: |
| v1.93.x LTS release | :white_check_mark: |
| v1.87.x LTS release | :white_check_mark: |
| v1.79.x LTS release | :white_check_mark: |
| other releases | :x: |
## Reporting a Vulnerability

View File

@@ -3,6 +3,7 @@ package elasticsearch
import (
"bufio"
"errors"
"flag"
"fmt"
"io"
"math"
@@ -11,6 +12,8 @@ import (
"strings"
"time"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert/insertutils"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bufferedwriter"
@@ -21,7 +24,10 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
elasticsearchVersion = flag.String("elasticsearch.version", "8.9.0", "Elasticsearch version to report to client")
)
// RequestHandler processes Elasticsearch insert requests
@@ -60,9 +66,9 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
// See the latest available version for Elasticsearch at https://github.com/elastic/elasticsearch/releases
fmt.Fprintf(w, `{
"version": {
"number": "8.8.0"
"number": %q
}
}`)
}`, *elasticsearchVersion)
case http.MethodHead:
// Return empty response for Logstash ping request.
}
@@ -88,22 +94,32 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
httpserver.Errorf(w, r, "%s", err)
return true
}
if err := vlstorage.CanWriteData(); err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
lr := logstorage.GetLogRows(cp.StreamFields, cp.IgnoreFields)
processLogMessage := cp.GetProcessLogMessageFunc(lr)
isGzip := r.Header.Get("Content-Encoding") == "gzip"
n, err := readBulkRequest(r.Body, isGzip, cp.TimeField, cp.MsgField, processLogMessage)
vlstorage.MustAddRows(lr)
logstorage.PutLogRows(lr)
if err != nil {
logger.Warnf("cannot decode log message #%d in /_bulk request: %s", n, err)
return true
}
vlstorage.MustAddRows(lr)
logstorage.PutLogRows(lr)
tookMs := time.Since(startTime).Milliseconds()
bw := bufferedwriter.Get(w)
defer bufferedwriter.Put(bw)
WriteBulkResponse(bw, n, tookMs)
_ = bw.Flush()
// update bulkRequestDuration only for successfully parsed requests
// There is no need in updating bulkRequestDuration for request errors,
// since their timings are usually much smaller than the timing for successful request parsing.
bulkRequestDuration.UpdateDuration(startTime)
return true
default:
return false
@@ -111,7 +127,9 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
}
var (
bulkRequestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/elasticsearch/_bulk"}`)
bulkRequestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/elasticsearch/_bulk"}`)
rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="elasticsearch_bulk"}`)
bulkRequestDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/elasticsearch/_bulk"}`)
)
func readBulkRequest(r io.Reader, isGzip bool, timeField, msgField string,
@@ -157,8 +175,6 @@ func readBulkRequest(r io.Reader, isGzip bool, timeField, msgField string,
var lineBufferPool bytesutil.ByteBufferPool
var rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="elasticsearch_bulk"}`)
func readBulkLine(sc *bufio.Scanner, timeField, msgField string,
processLogMessage func(timestamp int64, fields []logstorage.Field),
) (bool, error) {
@@ -209,6 +225,7 @@ func readBulkLine(sc *bufio.Scanner, timeField, msgField string,
p.RenameField(msgField, "_msg")
processLogMessage(ts, p.Fields)
logjson.PutParser(p)
return true, nil
}

View File

@@ -120,10 +120,10 @@ func compressData(s string) string {
var bb bytes.Buffer
zw := gzip.NewWriter(&bb)
if _, err := zw.Write([]byte(s)); err != nil {
panic(fmt.Errorf("unexpected error when compressing data: %s", err))
panic(fmt.Errorf("unexpected error when compressing data: %w", err))
}
if err := zw.Close(); err != nil {
panic(fmt.Errorf("unexpected error when closing gzip writer: %s", err))
panic(fmt.Errorf("unexpected error when closing gzip writer: %w", err))
}
return bb.String()
}

View File

@@ -43,7 +43,7 @@ func benchmarkReadBulkRequest(b *testing.B, isGzip bool) {
r.Reset(dataBytes)
_, err := readBulkRequest(r, isGzip, timeField, msgField, processLogMessage)
if err != nil {
panic(fmt.Errorf("unexpected error: %s", err))
panic(fmt.Errorf("unexpected error: %w", err))
}
}
})

View File

@@ -3,12 +3,13 @@ package insertutils
import (
"net/http"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
"github.com/VictoriaMetrics/metrics"
)
// CommonParams contains common HTTP parameters used by log ingestion APIs.
@@ -73,12 +74,19 @@ func GetCommonParams(r *http.Request) (*CommonParams, error) {
// GetProcessLogMessageFunc returns a function, which adds parsed log messages to lr.
func (cp *CommonParams) GetProcessLogMessageFunc(lr *logstorage.LogRows) func(timestamp int64, fields []logstorage.Field) {
return func(timestamp int64, fields []logstorage.Field) {
if len(fields) > *MaxFieldsPerLine {
rf := logstorage.RowFormatter(fields)
logger.Warnf("dropping log line with %d fields; it exceeds -insert.maxFieldsPerLine=%d; %s", len(fields), *MaxFieldsPerLine, rf)
rowsDroppedTotalTooManyFields.Inc()
return
}
lr.MustAdd(cp.TenantID, timestamp, fields)
if cp.Debug {
s := lr.GetRowString(0)
lr.ResetKeepSettings()
logger.Infof("remoteAddr=%s; requestURI=%s; ignoring log entry because of `debug` query arg: %s", cp.DebugRemoteAddr, cp.DebugRequestURI, s)
rowsDroppedTotal.Inc()
rowsDroppedTotalDebug.Inc()
return
}
if lr.NeedFlush() {
@@ -88,4 +96,5 @@ func (cp *CommonParams) GetProcessLogMessageFunc(lr *logstorage.LogRows) func(ti
}
}
var rowsDroppedTotal = metrics.NewCounter(`vl_rows_dropped_total{reason="debug"}`)
var rowsDroppedTotalDebug = metrics.NewCounter(`vl_rows_dropped_total{reason="debug"}`)
var rowsDroppedTotalTooManyFields = metrics.NewCounter(`vl_rows_dropped_total{reason="too_many_fields"}`)

View File

@@ -1,10 +1,15 @@
package insertutils
import (
"flag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
)
var (
// MaxLineSizeBytes is the maximum length of a single line for /insert/* handlers
MaxLineSizeBytes = flagutil.NewBytes("insert.maxLineSizeBytes", 256*1024, "The maximum size of a single line, which can be read by /insert/* handlers")
// MaxFieldsPerLine is the maximum number of fields per line for /insert/* handlers
MaxFieldsPerLine = flag.Int("insert.maxFieldsPerLine", 1000, "The maximum number of log fields per line, which can be read by /insert/* handlers")
)

View File

@@ -21,6 +21,7 @@ import (
// RequestHandler processes jsonline insert requests
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
startTime := time.Now()
w.Header().Add("Content-Type", "application/json")
if r.Method != "POST" {
@@ -35,6 +36,10 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
httpserver.Errorf(w, r, "%s", err)
return true
}
if err := vlstorage.CanWriteData(); err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
lr := logstorage.GetLogRows(cp.StreamFields, cp.IgnoreFields)
processLogMessage := cp.GetProcessLogMessageFunc(lr)
@@ -77,6 +82,11 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
vlstorage.MustAddRows(lr)
logstorage.PutLogRows(lr)
// update jsonlineRequestDuration only for successfully parsed requests.
// There is no need in updating jsonlineRequestDuration for request errors,
// since their timings are usually much smaller than the timing for successful request parsing.
jsonlineRequestDuration.UpdateDuration(startTime)
return true
}
@@ -109,6 +119,7 @@ func readLine(sc *bufio.Scanner, timeField, msgField string, processLogMessage f
p.RenameField(msgField, "_msg")
processLogMessage(ts, p.Fields)
logjson.PutParser(p)
return true, nil
}
@@ -144,6 +155,7 @@ func parseISO8601Timestamp(s string) (int64, error) {
var lineBufferPool bytesutil.ByteBufferPool
var (
requestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/jsonline"}`)
rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="jsonline"}`)
requestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/jsonline"}`)
rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="jsonline"}`)
jsonlineRequestDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/jsonline"}`)
)

View File

@@ -5,29 +5,31 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert/insertutils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
"github.com/VictoriaMetrics/metrics"
)
var (
lokiRequestsJSONTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="json"}`)
lokiRequestsProtobufTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="protobuf"}`)
)
// RequestHandler processes Loki insert requests
//
// See https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
if path != "/api/v1/push" {
switch path {
case "/api/v1/push":
return handleInsert(r, w)
case "/ready":
// See https://grafana.com/docs/loki/latest/api/#identify-ready-loki-instance
w.WriteHeader(http.StatusOK)
w.Write([]byte("ready"))
return true
default:
return false
}
}
// See https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
func handleInsert(r *http.Request, w http.ResponseWriter) bool {
contentType := r.Header.Get("Content-Type")
switch contentType {
case "application/json":
lokiRequestsJSONTotal.Inc()
return handleJSON(r, w)
default:
// Protobuf request body should be handled by default accoring to https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
lokiRequestsProtobufTotal.Inc()
// Protobuf request body should be handled by default according to https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
return handleProtobuf(r, w)
}
}

View File

@@ -18,12 +18,11 @@ import (
"github.com/valyala/fastjson"
)
var (
rowsIngestedJSONTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="json"}`)
parserPool fastjson.ParserPool
)
var parserPool fastjson.ParserPool
func handleJSON(r *http.Request, w http.ResponseWriter) bool {
startTime := time.Now()
lokiRequestsJSONTotal.Inc()
reader := r.Body
if r.Header.Get("Content-Encoding") == "gzip" {
zr, err := common.GetGzipReader(reader)
@@ -48,19 +47,36 @@ func handleJSON(r *http.Request, w http.ResponseWriter) bool {
httpserver.Errorf(w, r, "cannot parse common params from request: %s", err)
return true
}
if err := vlstorage.CanWriteData(); err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
lr := logstorage.GetLogRows(cp.StreamFields, cp.IgnoreFields)
processLogMessage := cp.GetProcessLogMessageFunc(lr)
n, err := parseJSONRequest(data, processLogMessage)
vlstorage.MustAddRows(lr)
logstorage.PutLogRows(lr)
if err != nil {
httpserver.Errorf(w, r, "cannot parse Loki request: %s", err)
httpserver.Errorf(w, r, "cannot parse Loki json request: %s", err)
return true
}
rowsIngestedJSONTotal.Add(n)
// update lokiRequestJSONDuration only for successfully parsed requests
// There is no need in updating lokiRequestJSONDuration for request errors,
// since their timings are usually much smaller than the timing for successful request parsing.
lokiRequestJSONDuration.UpdateDuration(startTime)
return true
}
var (
lokiRequestsJSONTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="json"}`)
rowsIngestedJSONTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="json"}`)
lokiRequestJSONDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/loki/api/v1/push",format="json"}`)
)
func parseJSONRequest(data []byte, processLogMessage func(timestamp int64, fields []logstorage.Field)) (int, error) {
p := parserPool.Get()
defer parserPool.Put(p)
@@ -155,7 +171,6 @@ func parseJSONRequest(data []byte, processLogMessage func(timestamp int64, field
Value: bytesutil.ToUnsafeString(msg),
})
processLogMessage(ts, fields)
}
rowsIngested += len(lines)
}

View File

@@ -29,7 +29,7 @@ func benchmarkParseJSONRequest(b *testing.B, streams, rows, labels int) {
for pb.Next() {
_, err := parseJSONRequest(data, func(timestamp int64, fields []logstorage.Field) {})
if err != nil {
panic(fmt.Errorf("unexpected error: %s", err))
panic(fmt.Errorf("unexpected error: %w", err))
}
}
})

View File

@@ -19,12 +19,13 @@ import (
)
var (
rowsIngestedProtobufTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="protobuf"}`)
bytesBufPool bytesutil.ByteBufferPool
pushReqsPool sync.Pool
bytesBufPool bytesutil.ByteBufferPool
pushReqsPool sync.Pool
)
func handleProtobuf(r *http.Request, w http.ResponseWriter) bool {
startTime := time.Now()
lokiRequestsProtobufTotal.Inc()
wcr := writeconcurrencylimiter.GetReader(r.Body)
data, err := io.ReadAll(wcr)
writeconcurrencylimiter.PutReader(wcr)
@@ -38,19 +39,36 @@ func handleProtobuf(r *http.Request, w http.ResponseWriter) bool {
httpserver.Errorf(w, r, "cannot parse common params from request: %s", err)
return true
}
if err := vlstorage.CanWriteData(); err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
lr := logstorage.GetLogRows(cp.StreamFields, cp.IgnoreFields)
processLogMessage := cp.GetProcessLogMessageFunc(lr)
n, err := parseProtobufRequest(data, processLogMessage)
vlstorage.MustAddRows(lr)
logstorage.PutLogRows(lr)
if err != nil {
httpserver.Errorf(w, r, "cannot parse loki request: %s", err)
httpserver.Errorf(w, r, "cannot parse Loki protobuf request: %s", err)
return true
}
rowsIngestedProtobufTotal.Add(n)
// update lokiRequestProtobufDuration only for successfully parsed requests
// There is no need in updating lokiRequestProtobufDuration for request errors,
// since their timings are usually much smaller than the timing for successful request parsing.
lokiRequestProtobufDuration.UpdateDuration(startTime)
return true
}
var (
lokiRequestsProtobufTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="protobuf"}`)
rowsIngestedProtobufTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="protobuf"}`)
lokiRequestProtobufDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/loki/api/v1/push",format="protobuf"}`)
)
func parseProtobufRequest(data []byte, processLogMessage func(timestamp int64, fields []logstorage.Field)) (int, error) {
bb := bytesBufPool.Get()
defer bytesBufPool.Put(bb)
@@ -66,7 +84,7 @@ func parseProtobufRequest(data []byte, processLogMessage func(timestamp int64, f
err = req.Unmarshal(bb.B)
if err != nil {
return 0, fmt.Errorf("cannot parse request body: %s", err)
return 0, fmt.Errorf("cannot parse request body: %w", err)
}
var commonFields []logstorage.Field
@@ -79,7 +97,7 @@ func parseProtobufRequest(data []byte, processLogMessage func(timestamp int64, f
// Labels are same for all entries in the stream.
commonFields, err = parsePromLabels(commonFields[:0], stream.Labels)
if err != nil {
return rowsIngested, fmt.Errorf("cannot parse stream labels %q: %s", stream.Labels, err)
return rowsIngested, fmt.Errorf("cannot parse stream labels %q: %w", stream.Labels, err)
}
fields := commonFields

View File

@@ -6,8 +6,9 @@ import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)
func BenchmarkParseProtobufRequest(b *testing.B) {
@@ -30,7 +31,7 @@ func benchmarkParseProtobufRequest(b *testing.B, streams, rows, labels int) {
for pb.Next() {
_, err := parseProtobufRequest(body, func(timestamp int64, fields []logstorage.Field) {})
if err != nil {
panic(fmt.Errorf("unexpected error: %s", err))
panic(fmt.Errorf("unexpected error: %w", err))
}
}
})

View File

@@ -88,6 +88,12 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
return true
}
if strings.HasPrefix(path, "/vmui/") {
if strings.HasPrefix(path, "/vmui/static/") {
// Allow clients caching static contents for long period of time, since it shouldn't change over time.
// Path to static contents (such as js and css) must be changed whenever its contents is changed.
// See https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/
w.Header().Set("Cache-Control", "max-age=31536000")
}
r.URL.Path = path
vmuiFileServer.ServeHTTP(w, r)
return true

View File

@@ -1,14 +1,13 @@
{
"files": {
"main.css": "./static/css/main.5f461a27.css",
"main.js": "./static/js/main.7566144a.js",
"static/js/522.b5ae4365.chunk.js": "./static/js/522.b5ae4365.chunk.js",
"static/media/Lato-Regular.ttf": "./static/media/Lato-Regular.d714fec1633b69a9c2e9.ttf",
"static/media/Lato-Bold.ttf": "./static/media/Lato-Bold.32360ba4b57802daa4d6.ttf",
"main.css": "./static/css/main.d1313636.css",
"main.js": "./static/js/main.1919fefe.js",
"static/js/522.da77e7b3.chunk.js": "./static/js/522.da77e7b3.chunk.js",
"static/media/MetricsQL.md": "./static/media/MetricsQL.8644fd7c964802dd34a9.md",
"index.html": "./index.html"
},
"entrypoints": [
"static/css/main.5f461a27.css",
"static/js/main.7566144a.js"
"static/css/main.d1313636.css",
"static/js/main.1919fefe.js"
]
}

View File

@@ -1 +1 @@
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no"/><meta name="theme-color" content="#000000"/><meta name="description" content="UI for VictoriaMetrics"/><link rel="apple-touch-icon" href="./apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="./favicon-32x32.png"><link rel="manifest" href="./manifest.json"/><title>VM UI</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary_large_image"><meta name="twitter:image" content="./preview.jpg"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:site" content="@VictoriaMetrics"><meta property="og:title" content="Metric explorer for VictoriaMetrics"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta property="og:image" content="./preview.jpg"><meta property="og:type" content="website"><script defer="defer" src="./static/js/main.7566144a.js"></script><link href="./static/css/main.5f461a27.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5"/><meta name="theme-color" content="#000000"/><meta name="description" content="UI for VictoriaMetrics"/><link rel="apple-touch-icon" href="./apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="./favicon-32x32.png"><link rel="manifest" href="./manifest.json"/><title>VM UI</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary_large_image"><meta name="twitter:image" content="./preview.jpg"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:site" content="@VictoriaMetrics"><meta property="og:title" content="Metric explorer for VictoriaMetrics"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta property="og:image" content="./preview.jpg"><meta property="og:type" content="website"><script defer="defer" src="./static/js/main.1919fefe.js"></script><link href="./static/css/main.d1313636.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -7,7 +7,7 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
/**
* @remix-run/router v1.7.2
* @remix-run/router v1.10.0
*
* Copyright (c) Remix Software Inc.
*
@@ -18,7 +18,7 @@
*/
/**
* React Router DOM v6.14.2
* React Router DOM v6.17.0
*
* Copyright (c) Remix Software Inc.
*
@@ -29,7 +29,7 @@
*/
/**
* React Router v6.14.2
* React Router v6.17.0
*
* Copyright (c) Remix Software Inc.
*

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -3,14 +3,17 @@ package vlstorage
import (
"flag"
"fmt"
"net/http"
"sync"
"time"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
"github.com/VictoriaMetrics/metrics"
)
var (
@@ -29,6 +32,8 @@ var (
"see https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields ; see also -logIngestedRows")
logIngestedRows = flag.Bool("logIngestedRows", false, "Whether to log all the ingested log entries; this can be useful for debugging of data ingestion; "+
"see https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/ ; see also -logNewStreams")
minFreeDiskSpaceBytes = flagutil.NewBytes("storage.minFreeDiskSpaceBytes", 10e6, "The minimum free disk space at -storageDataPath after which "+
"the storage stops accepting new data")
)
// Init initializes vlstorage.
@@ -39,15 +44,16 @@ func Init() {
logger.Panicf("BUG: Init() has been already called")
}
if retentionPeriod.Msecs < 24*3600*1000 {
if retentionPeriod.Duration() < 24*time.Hour {
logger.Fatalf("-retentionPeriod cannot be smaller than a day; got %s", retentionPeriod)
}
cfg := &logstorage.StorageConfig{
Retention: time.Millisecond * time.Duration(retentionPeriod.Msecs),
FlushInterval: *inmemoryDataFlushInterval,
FutureRetention: time.Millisecond * time.Duration(futureRetention.Msecs),
LogNewStreams: *logNewStreams,
LogIngestedRows: *logIngestedRows,
Retention: retentionPeriod.Duration(),
FlushInterval: *inmemoryDataFlushInterval,
FutureRetention: futureRetention.Duration(),
LogNewStreams: *logNewStreams,
LogIngestedRows: *logIngestedRows,
MinFreeDiskSpaceBytes: minFreeDiskSpaceBytes.N,
}
logger.Infof("opening storage at -storageDataPath=%s", *storageDataPath)
startTime := time.Now()
@@ -74,7 +80,21 @@ func Stop() {
var strg *logstorage.Storage
var storageMetrics *metrics.Set
// CanWriteData returns non-nil error if it cannot write data to vlstorage.
func CanWriteData() error {
if strg.IsReadOnly() {
return &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf("cannot add rows into storage in read-only mode; the storage can be in read-only mode "+
"because of lack of free disk space at -storageDataPath=%s", *storageDataPath),
StatusCode: http.StatusTooManyRequests,
}
}
return nil
}
// MustAddRows adds lr to vlstorage
//
// It is advised to call CanWriteData() before calling MustAddRows()
func MustAddRows(lr *logstorage.LogRows) {
strg.MustAddRows(lr)
}
@@ -107,6 +127,12 @@ func initStorageMetrics(strg *logstorage.Storage) *metrics.Set {
ms.NewGauge(fmt.Sprintf(`vl_free_disk_space_bytes{path=%q}`, *storageDataPath), func() float64 {
return float64(fs.MustGetFreeSpace(*storageDataPath))
})
ms.NewGauge(fmt.Sprintf(`vl_storage_is_read_only{path=%q}`, *storageDataPath), func() float64 {
if m().IsReadOnly {
return 1
}
return 0
})
ms.NewGauge(`vl_active_merges{type="inmemory"}`, func() float64 {
return float64(m().InmemoryActiveMerges)

View File

@@ -3,7 +3,8 @@
`vmagent` is a tiny agent which helps you collect metrics from various sources,
[relabel and filter the collected metrics](#relabeling)
and store them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other storage systems via Prometheus `remote_write` protocol.
or any other storage systems via Prometheus `remote_write` protocol
or via [VictoriaMetrics `remote_write` protocol](#victoriametrics-remote-write-protocol).
See [Quick Start](#quick-start) for details.
@@ -45,7 +46,7 @@ additionally to [discovering Prometheus-compatible targets and scraping metrics
## Quick Start
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) (
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) (
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags)),
unpack it and pass the following flags to the `vmagent` binary in order to start scraping Prometheus-compatible targets
and sending the data to the Prometheus-compatible remote storage:
@@ -59,7 +60,7 @@ and sending the data to the Prometheus-compatible remote storage:
Example command for writing the data received via [supported push-based protocols](#how-to-push-data-to-vmagent)
to [single-node VictoriaMetrics](https://docs.victoriametrics.com/) located at `victoria-metrics-host:8428`:
```console
```bash
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
@@ -68,7 +69,7 @@ the data to [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-V
Example command for scraping Prometheus targets and writing the data to single-node VictoriaMetrics:
```console
```bash
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
@@ -94,6 +95,7 @@ additionally to pull-based Prometheus-compatible targets' scraping:
* InfluxDB line protocol via `http://<vmagent>:8429/write`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
* OpenTelemetry http API. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#sending-data-via-opentelemetry).
* NewRelic API. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-newrelic-agent).
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-opentsdb-compatible-agents).
* Prometheus remote write protocol via `http://<vmagent>:8429/api/v1/write`.
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-json-line-format).
@@ -109,7 +111,7 @@ additionally to pull-based Prometheus-compatible targets' scraping:
* Sending `SIGHUP` signal to `vmagent` process:
```console
```bash
kill -SIGHUP `pidof vmagent`
```
@@ -172,6 +174,13 @@ by routing outgoing samples for the same time series of [counter](https://docs.v
and [histogram](https://docs.victoriametrics.com/keyConcepts.html#histogram) types from top-level `vmagent` instances
to the same second-level `vmagent` instance, so they are aggregated properly.
If `-remoteWrite.shardByURL` command-line flag is set, then all the metric labels are used for even sharding
among remote storage systems specified in `-remoteWrite.url`. Sometimes it may be needed to use only a particular
set of labels for sharding. For example, it may be needed to route all the metrics with the same `instance` label
to the same `-remoteWrite.url`. In this case you can specify comma-separated list of these labels in the `-remoteWrite.shardByURL.labels`
command-line flag. For example, `-remoteWrite.shardByURL.labels=instance,__name__` would shard metrics with the same name and `instance`
label to the same `-remoteWrite.url`.
See also [how to scrape big number of targets](#scraping-big-number-of-targets).
### Relabeling and filtering
@@ -317,7 +326,7 @@ in the `scrape_config_files` section of `-promscrape.config` file. For example,
loading scrape configs from all the `*.yml` files under `configs` directory, from `single_scrape_config.yml` local file
and from `https://config-server/scrape_config.yml` url:
```yml
```yaml
scrape_config_files:
- configs/*.yml
- single_scrape_config.yml
@@ -327,7 +336,7 @@ scrape_config_files:
Every referred file can contain arbitrary number of [supported scrape configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs).
There is no need in specifying top-level `scrape_configs` section in these files. For example:
```yml
```yaml
- job_name: foo
static_configs:
- targets: ["vmagent:8429"]
@@ -367,7 +376,7 @@ Extra labels can be added to metrics collected by `vmagent` via the following me
For example, the following command starts `vmagent`, which adds `{datacenter="foobar"}` label to all the metrics pushed
to all the configured remote storage systems (all the `-remoteWrite.url` flag values):
```
```bash
/path/to/vmagent -remoteWrite.label=datacenter=foobar ...
```
@@ -495,13 +504,16 @@ with [additional enhancements](#relabeling-enhancements). The relabeling can be
This relabeling can be debugged via `http://vmagent:8429/metric-relabel-debug` page. See [these docs](#relabel-debug) for details.
* At the `-remoteWrite.urlRelabelConfig` files. This relabeling is used for modifying labels for metrics
and for dropping unneeded metrics before sending them to a particular `-remoteWrite.url`.
and for dropping unneeded metrics before sending them to the particular `-remoteWrite.url`.
This relabeling can be debugged via `http://vmagent:8429/metric-relabel-debug` page. See [these docs](#relabel-debug) for details.
All the files with relabeling configs can contain special placeholders in the form `%{ENV_VAR}`,
which are replaced by the corresponding environment variable values.
[Streaming aggregation](https://docs.victoriametrics.com/stream-aggregation.html), if configured,
is pefrormed after applying all the relabeling stages mentioned above.
The following articles contain useful information about Prometheus relabeling:
* [Cookbook for common relabeling tasks](https://docs.victoriametrics.com/relabeling.html)
@@ -732,7 +744,7 @@ stream parsing mode can be explicitly enabled in the following places:
Examples:
```yml
```yaml
scrape_configs:
- job_name: 'big-federate'
stream_parse: true
@@ -754,24 +766,24 @@ as soon as it is parsed in stream parsing mode.
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
In this case scrape targets can be split among multiple `vmagent` instances (aka `vmagent` horizontal scaling, sharding and clustering).
Each `vmagent` instance in the cluster must use identical `-promscrape.config` files with distinct `-promscrape.cluster.memberNum` values.
The flag value must be in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster.
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag. For example, the following commands
spread scrape targets among a cluster of two `vmagent` instances:
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag.
Each `vmagent` instance in the cluster must use identical `-promscrape.config` files with distinct `-promscrape.cluster.memberNum` values
in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster specified via `-promscrape.cluster.membersCount`.
For example, the following commands spread scrape targets among a cluster of two `vmagent` instances:
```
```text
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
```
The `-promscrape.cluster.memberNum` can be set to a StatefulSet pod name when `vmagent` runs in Kubernetes.
The pod name must end with a number in the range `0 ... promscrape.cluster.memberNum-1`. For example, `-promscrape.cluster.memberNum=vmagent-0`.
The pod name must end with a number in the range `0 ... promscrape.cluster.membersCount-1`. For example, `-promscrape.cluster.memberNum=vmagent-0`.
By default, each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances,
then `-promscrape.cluster.replicationFactor` command-line flag must be set to the desired number of replicas. For example, the following commands
start a cluster of three `vmagent` instances, where each target is scraped by two `vmagent` instances:
```
```text
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=2 -promscrape.config=/path/to/config.yml ...
@@ -781,6 +793,14 @@ If each target is scraped by multiple `vmagent` instances, then data deduplicati
The `-dedup.minScrapeInterval` must be set to the `scrape_interval` configured at `-promscrape.config`.
See [these docs](https://docs.victoriametrics.com/#deduplication) for details.
The `-promscrape.cluster.memberLabel` command-line flag allows specifying a name for `member num` label to add to all the scraped metrics.
The value of the `member num` label is set to `-promscrape.cluster.memberNum`. For example, the following config instructs adding `vmagent_instance="0"` label
to all the metrics scraped by the given `vmagent` instance:
```text
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0 -promscrape.cluster.memberLabel=vmagent_instance
```
See also [how to shard data among multiple remote storage systems](#sharding-among-remote-storages).
@@ -804,7 +824,7 @@ See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679)
`vmagent` supports scraping targets via http, https and socks5 proxies. Proxy address must be specified in `proxy_url` option. For example, the following scrape config instructs
target scraping via https proxy at `https://proxy-addr:1234`:
```yml
```yaml
scrape_configs:
- job_name: foo
proxy_url: https://proxy-addr:1234
@@ -821,7 +841,7 @@ Proxy can be configured with the following optional settings:
For example:
```yml
```yaml
scrape_configs:
- job_name: foo
proxy_url: https://proxy-addr:1234
@@ -971,7 +991,7 @@ If you have suggestions for improvements or have found a bug - please open an is
* By default `vmagent` evenly spreads scrape load in time. If a particular scrape target must be scraped at the beginning of some interval,
then `scrape_align_interval` option must be used. For example, the following config aligns hourly scrapes to the beginning of hour:
```yml
```yaml
scrape_configs:
- job_name: foo
scrape_interval: 1h
@@ -981,7 +1001,7 @@ If you have suggestions for improvements or have found a bug - please open an is
* By default `vmagent` evenly spreads scrape load in time. If a particular scrape target must be scraped at specific offset, then `scrape_offset` option must be used.
For example, the following config instructs `vmagent` to scrape the target at 10 seconds of every minute:
```yml
```yaml
scrape_configs:
- job_name: foo
scrape_interval: 1m
@@ -994,14 +1014,14 @@ If you have suggestions for improvements or have found a bug - please open an is
The following relabeling rule may be added to `relabel_configs` section in order to filter out pods with unneeded ports:
```yml
```yaml
- action: keep_if_equal
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
```
The following relabeling rule may be added to `relabel_configs` section in order to filter out init container pods:
```yml
```yaml
- action: drop
source_labels: [__meta_kubernetes_pod_container_init]
regex: true
@@ -1016,8 +1036,9 @@ See also [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting
* [Reading metrics from Kafka](#reading-metrics-from-kafka)
* [Writing metrics to Kafka](#writing-metrics-to-kafka)
The enterprise version of vmagent is available for evaluation at [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page
The enterprise version of vmagent is available for evaluation at [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) page
in `vmutils-...-enterprise.tar.gz` archives and in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
### Reading metrics from Kafka
@@ -1044,7 +1065,7 @@ For example, `-kafka.consumer.topic.brokers=host1:9092;host2:9092`.
The following command starts `vmagent`, which reads metrics in InfluxDB line protocol format from Kafka broker at `localhost:9092`
from the topic `metrics-by-telegraf` and sends them to remote storage at `http://localhost:8428/api/v1/write`:
```console
```bash
./bin/vmagent -remoteWrite.url=http://localhost:8428/api/v1/write \
-kafka.consumer.topic.brokers=localhost:9092 \
-kafka.consumer.topic.format=influx \
@@ -1064,10 +1085,10 @@ data_format = "influx"
#### Command-line flags for Kafka consumer
These command-line flags are available only in [enterprise](https://docs.victoriametrics.com/enterprise.html) version of `vmagent`,
which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page
which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) page
(see `vmutils-...-enterprise.tar.gz` archives) and from [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
```
```text
-kafka.consumer.topic array
Kafka topic names for data consumption.
Supports an array of values separated by comma or specified via multiple flags.
@@ -1112,25 +1133,25 @@ Two types of auth are supported:
* sasl with username and password:
```console
```bash
./bin/vmagent -remoteWrite.url=kafka://localhost:9092/?topic=prom-rw&security.protocol=SASL_SSL&sasl.mechanisms=PLAIN -remoteWrite.basicAuth.username=user -remoteWrite.basicAuth.password=password
```
* tls certificates:
```console
```bash
./bin/vmagent -remoteWrite.url=kafka://localhost:9092/?topic=prom-rw&security.protocol=SSL -remoteWrite.tlsCAFile=/opt/ca.pem -remoteWrite.tlsCertFile=/opt/cert.pem -remoteWrite.tlsKeyFile=/opt/key.pem
```
## How to build from sources
We recommend using [official binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in the `vmutils-...` archives.
We recommend using [official binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) - `vmagent` is located in the `vmutils-...` archives.
It may be needed to build `vmagent` from source code when developing or testing new feature or bugfix.
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
1. Run `make vmagent` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds the `vmagent` binary and puts it into the `bin` folder.
@@ -1149,7 +1170,7 @@ The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```console
```bash
ROOT_IMAGE=scratch make package-vmagent
```
@@ -1159,7 +1180,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
1. Run `make vmagent-linux-arm` or `make vmagent-linux-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics)
It builds `vmagent-linux-arm` or `vmagent-linux-arm64` binary respectively and puts it into the `bin` folder.
@@ -1177,7 +1198,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
<div class="with-copy" markdown="1">
```console
```bash
curl http://0.0.0.0:8429/debug/pprof/heap > mem.pprof
```
@@ -1187,7 +1208,7 @@ curl http://0.0.0.0:8429/debug/pprof/heap > mem.pprof
<div class="with-copy" markdown="1">
```console
```bash
curl http://0.0.0.0:8429/debug/pprof/profile > cpu.pprof
```
@@ -1203,7 +1224,7 @@ It is safe sharing the collected profiles from security point of view, since the
`vmagent` can be fine-tuned with various command-line flags. Run `./vmagent -help` in order to see the full list of these flags with their descriptions and default values:
```
```text
./vmagent -help
vmagent collects metrics data via popular data ingestion protocols and routes them to VictoriaMetrics.
@@ -1212,10 +1233,6 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-cacheExpireDuration duration
Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s)
-clients.docker
Decides whether a docker container be brought up automatically
-clients.semaphore
Tells if the job is running on Semaphore
-configAuthKey string
Authorization key for accessing /config page. It must be passed via authKey query arg
-csvTrimTimestamp duration
@@ -1236,7 +1253,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
@@ -1251,6 +1270,12 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.header.csp string
Value for 'Content-Security-Policy' header
-http.header.frameOptions string
Value for 'X-Frame-Options' header
-http.header.hsts string
Value for 'Strict-Transport-Security' header
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -1299,34 +1324,40 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-kafka.consumer.topic array
Kafka topic names for data consumption. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Kafka topic names for data consumption. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.basicAuth.password array
Optional basic auth password for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Optional basic auth password for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.basicAuth.username array
Optional basic auth username for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Optional basic auth username for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.brokers array
List of brokers to connect for given topic, e.g. -kafka.consumer.topic.broker=host-1:9092;host-2:9092 . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
List of brokers to connect for given topic, e.g. -kafka.consumer.topic.broker=host-1:9092;host-2:9092 . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.concurrency array
Configures consumer concurrency for topic specified via -kafka.consumer.topic flag.This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Configures consumer concurrency for topic specified via -kafka.consumer.topic flag.This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default 1)
Supports array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.defaultFormat string
Expected data format in the topic if -kafka.consumer.topic.format is skipped. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default "promremotewrite")
Expected data format in the topic if -kafka.consumer.topic.format is skipped. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default "promremotewrite")
-kafka.consumer.topic.format array
data format for corresponding kafka topic. Valid formats: influx, prometheus, promremotewrite, graphite, jsonline . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
data format for corresponding kafka topic. Valid formats: influx, prometheus, promremotewrite, graphite, jsonline . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.groupID array
Defines group.id for topic. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Defines group.id for topic. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.isGzipped array
Enables gzip setting for topic messages payload. Only prometheus, jsonline and influx formats accept gzipped messages.This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Enables gzip setting for topic messages payload. Only prometheus, jsonline and influx formats accept gzipped messages.This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports array of values separated by comma or specified via multiple flags.
-kafka.consumer.topic.options array
Optional key=value;key1=value2 settings for topic consumer. See full configuration options at https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Optional key=value;key1=value2 settings for topic consumer. See full configuration options at https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-license string
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
@@ -1344,7 +1375,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-maxConcurrentInserts int
The maximum number of concurrent insert requests. The default value should work for most cases, since it minimizes memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
The maximum number of concurrent insert requests. Default value should work for most cases, since it minimizes the memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
-maxInsertRequestSize size
The maximum size in bytes of a single Prometheus remote_write API request
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432)
@@ -1355,6 +1386,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from the OS page cache which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-newrelic.maxInsertRequestSize size
The maximum size in bytes of a single NewRelic request to /newrelic/infra/v2/metrics/events/bulk
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 67108864)
-opentsdbHTTPListenAddr string
TCP address to listen for OpenTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty. See also -opentsdbHTTPListenAddr.useProxyProtocol
-opentsdbHTTPListenAddr.useProxyProtocol
@@ -1376,14 +1410,16 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. See also -cacheExpireDuration (default 0.1)
-promscrape.azureSDCheckInterval duration
Interval for checking for changes in Azure. This works only if azure_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#azure_sd_configs for details (default 1m0s)
-promscrape.cluster.memberLabel string
If non-empty, then the label with this name and the -promscrape.cluster.memberNum value is added to all the scraped metrics. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
-promscrape.cluster.memberNum string
The number of number in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
The number of vmagent instance in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name. See also -promscrape.cluster.memberLabel . See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default "0")
-promscrape.cluster.membersCount int
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
-promscrape.cluster.name string
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
-promscrape.cluster.replicationFactor int
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
-promscrape.config string
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. The path can point to local file and to http url. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
-promscrape.config.dryRun
@@ -1391,7 +1427,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-promscrape.config.strictParse
Whether to deny unsupported fields in -promscrape.config . Set to false in order to silently skip unsupported fields (default true)
-promscrape.configCheckInterval duration
Interval for checking for changes in '-promscrape.config' file. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes
Interval for checking for changes in -promscrape.config file. By default, the checking is disabled. See how to reload -promscrape.config file at https://docs.victoriametrics.com/vmagent.html#configuration-update
-promscrape.consul.waitTime duration
Wait time used by Consul service discovery. Default value is used if not set
-promscrape.consulSDCheckInterval duration
@@ -1533,7 +1569,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
-remoteWrite.maxDiskUsagePerURL array
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. Disk usage is unlimited if the value is set to 0
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB.
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB. (default 0)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.maxHourlySeries int
The maximum number of unique series vmagent can send to remote storage systems during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
@@ -1563,28 +1599,31 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-remoteWrite.queues int
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs (default 8)
-remoteWrite.rateLimit array
Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage
Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage (default 0)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.relabelConfig string
Optional path to file with relabeling configs, which are applied to all the metrics before sending them to -remoteWrite.url. See also -remoteWrite.urlRelabelConfig. The path can point either to local file or to http url. See https://docs.victoriametrics.com/vmagent.html#relabeling
-remoteWrite.roundDigits array
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics (default 100)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.sendTimeout array
Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m)
Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m0s)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.shardByURL
Whether to shard outgoing series across all the remote storage systems enumerated via -remoteWrite.url . By default the data is replicated across all the -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages
-remoteWrite.shardByURL.labels array
Optional list of labels, which must be used for sharding outgoing samples among remote storage systems if -remoteWrite.shardByURL command-line flag is set. By default all the labels are used for sharding in order to gain even distribution of series over the specified -remoteWrite.url systems
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.showURL
Whether to show -remoteWrite.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
-remoteWrite.significantFigures array
The number of significant figures to leave in metric values before writing them to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits
The number of significant figures to leave in metric values before writing them to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits (default 0)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.streamAggr.config array
Optional path to file with stream aggregation config. See https://docs.victoriametrics.com/stream-aggregation.html . See also -remoteWrite.streamAggr.keepInput, -remoteWrite.streamAggr.dropInput and -remoteWrite.streamAggr.dedupInterval
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.streamAggr.dedupInterval array
Input samples are de-duplicated with this interval before being aggregated. Only the last sample per each time series per each interval is aggregated if the interval is greater than zero
Input samples are de-duplicated with this interval before being aggregated. Only the last sample per each time series per each interval is aggregated if the interval is greater than zero (default 0s)
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.streamAggr.dropInput array
Whether to drop all the input samples after the aggregation with -remoteWrite.streamAggr.config. By default, only aggregates samples are dropped, while the remaining samples are written to the corresponding -remoteWrite.url . See also -remoteWrite.streamAggr.keepInput and https://docs.victoriametrics.com/stream-aggregation.html

View File

@@ -8,7 +8,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog/stream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/metrics"
@@ -29,12 +29,12 @@ func InsertHandlerForHTTP(at *auth.Token, req *http.Request) error {
return err
}
ce := req.Header.Get("Content-Encoding")
return stream.Parse(req.Body, ce, func(series []parser.Series) error {
return stream.Parse(req.Body, ce, func(series []datadog.Series) error {
return insertRows(at, series, extraLabels)
})
}
func insertRows(at *auth.Token, series []parser.Series, extraLabels []prompbmarshal.Label) error {
func insertRows(at *auth.Token, series []datadog.Series, extraLabels []prompbmarshal.Label) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
@@ -63,7 +63,7 @@ func insertRows(at *auth.Token, series []parser.Series, extraLabels []prompbmars
})
}
for _, tag := range ss.Tags {
name, value := parser.SplitTag(tag)
name, value := datadog.SplitTag(tag)
if name == "host" {
name = "exported_host"
}

View File

@@ -11,11 +11,14 @@ import (
"sync/atomic"
"time"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/datadog"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/native"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/newrelic"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentelemetry"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdbhttp"
@@ -39,7 +42,6 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics"
"github.com/VictoriaMetrics/metrics"
)
var (
@@ -208,7 +210,7 @@ func getAuthTokenFromPath(path string) (*auth.Token, error) {
if p.Suffix != "opentsdb/api/put" {
return nil, fmt.Errorf("unsupported path requested: %q; expecting 'opentsdb/api/put'", p.Suffix)
}
return auth.NewToken(p.AuthToken)
return auth.NewTokenPossibleMultitenant(p.AuthToken)
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
@@ -251,7 +253,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
w.WriteHeader(statusCode)
return true
}
if strings.HasPrefix(path, "datadog/") {
if strings.HasPrefix(path, "/datadog/") {
// Trim suffix from paths starting from /datadog/ in order to support legacy DataDog agent.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2670
path = strings.TrimSuffix(path, "/")
@@ -318,6 +320,29 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
}
w.WriteHeader(http.StatusOK)
return true
case "/newrelic":
newrelicCheckRequest.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"status":"ok"}`)
return true
case "/newrelic/inventory/deltas":
newrelicInventoryRequests.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"payload":{"version": 1, "state": {}, "reset": "false"}}`)
return true
case "/newrelic/infra/v2/metrics/events/bulk":
newrelicWriteRequests.Inc()
if err := newrelic.InsertHandlerForHTTP(nil, r); err != nil {
newrelicWriteErrors.Inc()
httpserver.Errorf(w, r, "%s", err)
return true
}
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"status":"ok"}`)
return true
case "/datadog/api/v1/series":
datadogWriteRequests.Inc()
if err := datadog.InsertHandlerForHTTP(nil, r); err != nil {
@@ -518,6 +543,29 @@ func processMultitenantRequest(w http.ResponseWriter, r *http.Request, path stri
}
w.WriteHeader(http.StatusOK)
return true
case "newrelic":
newrelicCheckRequest.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"status":"ok"}`)
return true
case "newrelic/inventory/deltas":
newrelicInventoryRequests.Inc()
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"payload":{"version": 1, "state": {}, "reset": "false"}}`)
return true
case "newrelic/infra/v2/metrics/events/bulk":
newrelicWriteRequests.Inc()
if err := newrelic.InsertHandlerForHTTP(at, r); err != nil {
newrelicWriteErrors.Inc()
httpserver.Errorf(w, r, "%s", err)
return true
}
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(202)
fmt.Fprintf(w, `{"status":"ok"}`)
return true
case "datadog/api/v1/series":
datadogWriteRequests.Inc()
if err := datadog.InsertHandlerForHTTP(at, r); err != nil {
@@ -590,6 +638,12 @@ var (
opentelemetryPushRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/opentelemetry/api/v1/push", protocol="opentelemetry"}`)
opentelemetryPushErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/opentelemetry/api/v1/push", protocol="opentelemetry"}`)
newrelicWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/newrelic/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
newrelicInventoryRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/inventory/deltas", protocol="newrelic"}`)
newrelicCheckRequest = metrics.NewCounter(`vm_http_requests_total{path="/newrelic", protocol="newrelic"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
promscrapeServiceDiscoveryRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/service-discovery"}`)

View File

@@ -0,0 +1,86 @@
package newrelic
import (
"net/http"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/newrelic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/newrelic/stream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
)
var (
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="newrelic"}`)
rowsTenantInserted = tenantmetrics.NewCounterMap(`vmagent_tenant_inserted_rows_total{type="newrelic"}`)
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="newrelic"}`)
)
// InsertHandlerForHTTP processes remote write for NewRelic POST /infra/v2/metrics/events/bulk request.
func InsertHandlerForHTTP(at *auth.Token, req *http.Request) error {
extraLabels, err := parserCommon.GetExtraLabels(req)
if err != nil {
return err
}
ce := req.Header.Get("Content-Encoding")
isGzip := ce == "gzip"
return stream.Parse(req.Body, isGzip, func(rows []newrelic.Row) error {
return insertRows(at, rows, extraLabels)
})
}
func insertRows(at *auth.Token, rows []newrelic.Row, extraLabels []prompbmarshal.Label) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
samplesCount := 0
tssDst := ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels[:0]
samples := ctx.Samples[:0]
for i := range rows {
r := &rows[i]
tags := r.Tags
srcSamples := r.Samples
for j := range srcSamples {
s := &srcSamples[j]
labelsLen := len(labels)
labels = append(labels, prompbmarshal.Label{
Name: "__name__",
Value: bytesutil.ToUnsafeString(s.Name),
})
for k := range tags {
t := &tags[k]
labels = append(labels, prompbmarshal.Label{
Name: bytesutil.ToUnsafeString(t.Key),
Value: bytesutil.ToUnsafeString(t.Value),
})
}
samples = append(samples, prompbmarshal.Sample{
Value: s.Value,
Timestamp: r.Timestamp,
})
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[len(samples)-1:],
})
labels = append(labels, extraLabels...)
}
samplesCount += len(srcSamples)
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels
ctx.Samples = samples
remotewrite.Push(at, &ctx.WriteRequest)
rowsInserted.Add(len(rows))
if at != nil {
rowsTenantInserted.Get(at).Add(samplesCount)
}
rowsPerInsert.Update(float64(samplesCount))
return nil
}

View File

@@ -1,6 +1,7 @@
package opentelemetry
import (
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
@@ -26,6 +27,9 @@ func InsertHandler(at *auth.Token, req *http.Request) error {
return err
}
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
if req.Header.Get("Content-Type") == "application/json" {
return fmt.Errorf("json encoding isn't supported for opentelemetry format. Use protobuf encoding")
}
return stream.ParseStream(req.Body, isGzipped, func(tss []prompbmarshal.TimeSeries) error {
return insertRows(at, tss, extraLabels)
})

View File

@@ -32,7 +32,7 @@ func InsertHandler(at *auth.Token, req *http.Request) error {
return err
}
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
return stream.Parse(req.Body, defaultTimestamp, isGzipped, func(rows []parser.Row) error {
return stream.Parse(req.Body, defaultTimestamp, isGzipped, true, func(rows []parser.Row) error {
return insertRows(at, rows, extraLabels)
}, func(s string) {
httpserver.LogError(req, s)

View File

@@ -2,6 +2,7 @@ package remotewrite
import (
"bytes"
"errors"
"fmt"
"io"
"net/http"
@@ -26,10 +27,10 @@ var (
forceVMProto = flagutil.NewArrayBool("remoteWrite.forceVMProto", "Whether to force VictoriaMetrics remote write protocol for sending data "+
"to the corresponding -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#victoriametrics-remote-write-protocol")
rateLimit = flagutil.NewArrayInt("remoteWrite.rateLimit", "Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. "+
rateLimit = flagutil.NewArrayInt("remoteWrite.rateLimit", 0, "Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. "+
"By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data "+
"is sent after temporary unavailability of the remote storage")
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", "Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m)")
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", time.Minute, "Timeout for sending a single block of data to the corresponding -remoteWrite.url")
proxyURL = flagutil.NewArrayString("remoteWrite.proxyURL", "Optional proxy URL for writing data to the corresponding -remoteWrite.url. "+
"Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234")
@@ -105,12 +106,15 @@ type client struct {
func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqueue.FastQueue, concurrency int) *client {
authCfg, err := getAuthConfig(argIdx)
if err != nil {
logger.Panicf("FATAL: cannot initialize auth config for remoteWrite.url=%q: %s", remoteWriteURL, err)
logger.Fatalf("cannot initialize auth config for -remoteWrite.url=%q: %s", remoteWriteURL, err)
}
tlsCfg, err := authCfg.NewTLSConfig()
if err != nil {
logger.Fatalf("cannot initialize tls config for -remoteWrite.url=%q: %s", remoteWriteURL, err)
}
tlsCfg := authCfg.NewTLSConfig()
awsCfg, err := getAWSAPIConfig(argIdx)
if err != nil {
logger.Fatalf("FATAL: cannot initialize AWS Config for remoteWrite.url=%q: %s", remoteWriteURL, err)
logger.Fatalf("cannot initialize AWS Config for -remoteWrite.url=%q: %s", remoteWriteURL, err)
}
tr := &http.Transport{
DialContext: statDial,
@@ -134,7 +138,7 @@ func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persiste
}
hc := &http.Client{
Transport: tr,
Timeout: sendTimeout.GetOptionalArgOrDefault(argIdx, time.Minute),
Timeout: sendTimeout.GetOptionalArg(argIdx),
}
c := &client{
sanitizedURL: sanitizedURL,
@@ -169,7 +173,7 @@ func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persiste
}
func (c *client) init(argIdx, concurrency int, sanitizedURL string) {
if bytesPerSec := rateLimit.GetOptionalArgOrDefault(argIdx, 0); bytesPerSec > 0 {
if bytesPerSec := rateLimit.GetOptionalArg(argIdx); bytesPerSec > 0 {
logger.Infof("applying %d bytes per second rate limit for -remoteWrite.url=%q", bytesPerSec, sanitizedURL)
c.rl.perSecondLimit = int64(bytesPerSec)
}
@@ -178,7 +182,7 @@ func (c *client) init(argIdx, concurrency int, sanitizedURL string) {
c.bytesSent = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_bytes_sent_total{url=%q}`, c.sanitizedURL))
c.blocksSent = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_blocks_sent_total{url=%q}`, c.sanitizedURL))
c.rateLimit = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_rate_limit{url=%q}`, c.sanitizedURL), func() float64 {
return float64(rateLimit.GetOptionalArgOrDefault(argIdx, 0))
return float64(rateLimit.GetOptionalArg(argIdx))
})
c.requestDuration = metrics.GetOrCreateHistogram(fmt.Sprintf(`vmagent_remotewrite_duration_seconds{url=%q}`, c.sanitizedURL))
c.requestsOKCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.sanitizedURL))
@@ -322,12 +326,42 @@ func (c *client) runWorker() {
}
func (c *client) doRequest(url string, body []byte) (*http.Response, error) {
req, err := c.newRequest(url, body)
if err != nil {
return nil, err
}
resp, err := c.hc.Do(req)
if err == nil {
return resp, nil
}
if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
return nil, err
}
// It is likely connection became stale or timed out during the first request.
// Make another attempt in hope request will succeed.
// If not, the error should be handled by the caller as usual.
// This should help with https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4139
req, err = c.newRequest(url, body)
if err != nil {
return nil, fmt.Errorf("second attempt: %w", err)
}
resp, err = c.hc.Do(req)
if err != nil {
return nil, fmt.Errorf("second attempt: %w", err)
}
return resp, nil
}
func (c *client) newRequest(url string, body []byte) (*http.Request, error) {
reqBody := bytes.NewBuffer(body)
req, err := http.NewRequest(http.MethodPost, url, reqBody)
if err != nil {
logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", url, err)
}
c.authCfg.SetHeaders(req, true)
err = c.authCfg.SetHeaders(req, true)
if err != nil {
return nil, err
}
h := req.Header
h.Set("User-Agent", "vmagent")
h.Set("Content-Type", "application/x-protobuf")
@@ -341,11 +375,10 @@ func (c *client) doRequest(url string, body []byte) (*http.Response, error) {
if c.awsCfg != nil {
sigv4Hash := awsapi.HashHex(body)
if err := c.awsCfg.SignRequest(req, sigv4Hash); err != nil {
// there is no need in retry, request will be rejected by client.Do and retried by code below
logger.Warnf("cannot sign remoteWrite request with AWS sigv4: %s", err)
return nil, fmt.Errorf("cannot sign remoteWrite request with AWS sigv4: %w", err)
}
}
return c.hc.Do(req)
return req, nil
}
// sendBlockHTTP sends the given block to c.remoteWriteURL.

View File

@@ -87,8 +87,8 @@ func initLabelsGlobal() {
}
}
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label, pcs *promrelabel.ParsedConfigs) []prompbmarshal.TimeSeries {
if len(extraLabels) == 0 && pcs.Len() == 0 && !*usePromCompatibleNaming {
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, pcs *promrelabel.ParsedConfigs) []prompbmarshal.TimeSeries {
if pcs.Len() == 0 && !*usePromCompatibleNaming {
// Nothing to change.
return tss
}
@@ -98,34 +98,15 @@ func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLab
ts := &tss[i]
labelsLen := len(labels)
labels = append(labels, ts.Labels...)
// extraLabels must be added before applying relabeling according to https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
for j := range extraLabels {
extraLabel := &extraLabels[j]
tmp := promrelabel.GetLabelByName(labels[labelsLen:], extraLabel.Name)
if tmp != nil {
tmp.Value = extraLabel.Value
} else {
labels = append(labels, *extraLabel)
}
}
if *usePromCompatibleNaming {
// Replace unsupported Prometheus chars in label names and metric names with underscores.
tmpLabels := labels[labelsLen:]
for j := range tmpLabels {
label := &tmpLabels[j]
if label.Name == "__name__" {
label.Value = promrelabel.SanitizeName(label.Value)
} else {
label.Name = promrelabel.SanitizeName(label.Name)
}
}
}
labels = pcs.Apply(labels, labelsLen)
labels = promrelabel.FinalizeLabels(labels[:labelsLen], labels[labelsLen:])
if len(labels) == labelsLen {
// Drop the current time series, since relabeling removed all the labels.
continue
}
if *usePromCompatibleNaming {
fixPromCompatibleNaming(labels[labelsLen:])
}
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: ts.Samples,
@@ -135,6 +116,29 @@ func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLab
return tssDst
}
func (rctx *relabelCtx) appendExtraLabels(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label) {
if len(extraLabels) == 0 {
return
}
labels := rctx.labels[:0]
for i := range tss {
ts := &tss[i]
labelsLen := len(labels)
labels = append(labels, ts.Labels...)
for j := range extraLabels {
extraLabel := extraLabels[j]
tmp := promrelabel.GetLabelByName(labels[labelsLen:], extraLabel.Name)
if tmp != nil {
tmp.Value = extraLabel.Value
} else {
labels = append(labels, extraLabel)
}
}
ts.Labels = labels[labelsLen:]
}
rctx.labels = labels
}
type relabelCtx struct {
// pool for labels, which are used during the relabeling.
labels []prompbmarshal.Label
@@ -159,3 +163,15 @@ func putRelabelCtx(rctx *relabelCtx) {
rctx.labels = rctx.labels[:0]
relabelCtxPool.Put(rctx)
}
func fixPromCompatibleNaming(labels []prompbmarshal.Label) {
// Replace unsupported Prometheus chars in label names and metric names with underscores.
for i := range labels {
label := &labels[i]
if label.Name == "__name__" {
label.Value = promrelabel.SanitizeMetricName(label.Value)
} else {
label.Name = promrelabel.SanitizeLabelName(label.Name)
}
}
}

View File

@@ -10,18 +10,16 @@ import (
)
func TestApplyRelabeling(t *testing.T) {
f := func(extraLabels []prompbmarshal.Label, pcs *promrelabel.ParsedConfigs, sTss, sExpTss string) {
f := func(pcs *promrelabel.ParsedConfigs, sTss, sExpTss string) {
rctx := &relabelCtx{}
tss, expTss := parseSeries(sTss), parseSeries(sExpTss)
gotTss := rctx.applyRelabeling(tss, extraLabels, pcs)
gotTss := rctx.applyRelabeling(tss, pcs)
if !reflect.DeepEqual(gotTss, expTss) {
t.Fatalf("expected to have: \n%v;\ngot: \n%v", expTss, gotTss)
}
}
f(nil, nil, "up", "up")
f([]prompbmarshal.Label{{Name: "foo", Value: "bar"}}, nil, "up", `up{foo="bar"}`)
f([]prompbmarshal.Label{{Name: "foo", Value: "bar"}}, nil, `up{foo="baz"}`, `up{foo="bar"}`)
f(nil, "up", "up")
pcs, err := promrelabel.ParseRelabelConfigsData([]byte(`
- target_label: "foo"
@@ -32,11 +30,33 @@ func TestApplyRelabeling(t *testing.T) {
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
f(nil, pcs, `up{foo="baz", env="prod"}`, `up{foo="aaa"}`)
f(pcs, `up{foo="baz", env="prod"}`, `up{foo="aaa"}`)
oldVal := *usePromCompatibleNaming
*usePromCompatibleNaming = true
f(nil, nil, `foo.bar`, `foo_bar`)
f(nil, `foo.bar`, `foo_bar`)
*usePromCompatibleNaming = oldVal
}
func TestAppendExtraLabels(t *testing.T) {
f := func(extraLabels []prompbmarshal.Label, sTss, sExpTss string) {
t.Helper()
rctx := &relabelCtx{}
tss, expTss := parseSeries(sTss), parseSeries(sExpTss)
rctx.appendExtraLabels(tss, extraLabels)
if !reflect.DeepEqual(tss, expTss) {
t.Fatalf("expected to have: \n%v;\ngot: \n%v", expTss, tss)
}
}
f(nil, "up", "up")
f([]prompbmarshal.Label{{Name: "foo", Value: "bar"}}, "up", `up{foo="bar"}`)
f([]prompbmarshal.Label{{Name: "foo", Value: "bar"}}, `up{foo="baz"}`, `up{foo="bar"}`)
f([]prompbmarshal.Label{{Name: "baz", Value: "qux"}}, `up{foo="baz"}`, `up{foo="baz",baz="qux"}`)
oldVal := *usePromCompatibleNaming
*usePromCompatibleNaming = true
f([]prompbmarshal.Label{{Name: "foo.bar", Value: "baz"}}, "up", `up{foo.bar="baz"}`)
*usePromCompatibleNaming = oldVal
}

View File

@@ -10,8 +10,6 @@ import (
"sync/atomic"
"time"
"github.com/cespare/xxhash/v2"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
@@ -25,9 +23,11 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/streamaggr"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/metrics"
"github.com/cespare/xxhash/v2"
)
var (
@@ -41,6 +41,9 @@ var (
"Pass multiple -remoteWrite.multitenantURL flags in order to replicate data to multiple remote storage systems. See also -remoteWrite.url")
shardByURL = flag.Bool("remoteWrite.shardByURL", false, "Whether to shard outgoing series across all the remote storage systems enumerated via -remoteWrite.url . "+
"By default the data is replicated across all the -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages")
shardByURLLabels = flagutil.NewArrayString("remoteWrite.shardByURL.labels", "Optional list of labels, which must be used for sharding outgoing samples "+
"among remote storage systems if -remoteWrite.shardByURL command-line flag is set. By default all the labels are used for sharding in order to gain "+
"even distribution of series over the specified -remoteWrite.url systems")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored. "+
"See also -remoteWrite.maxDiskUsagePerURL")
keepDanglingQueues = flag.Bool("remoteWrite.keepDanglingQueues", false, "Keep persistent queues contents at -remoteWrite.tmpDataPath in case there are no matching -remoteWrite.url. "+
@@ -49,14 +52,15 @@ var (
"isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs")
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
"It is hidden by default, since it can contain sensitive info such as auth key")
maxPendingBytesPerURL = flagutil.NewArrayBytes("remoteWrite.maxDiskUsagePerURL", "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
maxPendingBytesPerURL = flagutil.NewArrayBytes("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
"for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. "+
"Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. "+
"Disk usage is unlimited if the value is set to 0")
significantFigures = flagutil.NewArrayInt("remoteWrite.significantFigures", "The number of significant figures to leave in metric values before writing them "+
significantFigures = flagutil.NewArrayInt("remoteWrite.significantFigures", 0, "The number of significant figures to leave in metric values before writing them "+
"to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. "+
"This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits")
roundDigits = flagutil.NewArrayInt("remoteWrite.roundDigits", "Round metric values to this number of decimal digits after the point before writing them to remote storage. "+
roundDigits = flagutil.NewArrayInt("remoteWrite.roundDigits", 100, "Round metric values to this number of decimal digits after the point before "+
"writing them to remote storage. "+
"Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. "+
"By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. "+
"This option may be used for improving data compression for the stored metrics")
@@ -78,7 +82,7 @@ var (
streamAggrDropInput = flagutil.NewArrayBool("remoteWrite.streamAggr.dropInput", "Whether to drop all the input samples after the aggregation "+
"with -remoteWrite.streamAggr.config. By default, only aggregates samples are dropped, while the remaining samples "+
"are written to the corresponding -remoteWrite.url . See also -remoteWrite.streamAggr.keepInput and https://docs.victoriametrics.com/stream-aggregation.html")
streamAggrDedupInterval = flagutil.NewArrayDuration("remoteWrite.streamAggr.dedupInterval", "Input samples are de-duplicated with this interval before being aggregated. "+
streamAggrDedupInterval = flagutil.NewArrayDuration("remoteWrite.streamAggr.dedupInterval", 0, "Input samples are de-duplicated with this interval before being aggregated. "+
"Only the last sample per each time series per each interval is aggregated if the interval is greater than zero")
)
@@ -116,6 +120,8 @@ func InitSecretFlags() {
}
}
var shardByURLLabelsMap map[string]struct{}
// Init initializes remotewrite.
//
// It must be called after flag.Parse().
@@ -152,6 +158,13 @@ func Init() {
if *queues <= 0 {
*queues = 1
}
if len(*shardByURLLabels) > 0 {
m := make(map[string]struct{}, len(*shardByURLLabels))
for _, label := range *shardByURLLabels {
m[label] = struct{}{}
}
shardByURLLabelsMap = m
}
initLabelsGlobal()
// Register SIGHUP handler for config reload before loadRelabelConfigs.
@@ -258,7 +271,7 @@ func newRemoteWriteCtxs(at *auth.Token, urls []string) []*remoteWriteCtx {
if *showRemoteWriteURL {
sanitizedURL = fmt.Sprintf("%d:%s", i+1, remoteWriteURL)
}
rwctxs[i] = newRemoteWriteCtx(i, at, remoteWriteURL, maxInmemoryBlocks, sanitizedURL)
rwctxs[i] = newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, sanitizedURL)
}
if !*keepDanglingQueues {
@@ -355,7 +368,7 @@ func Push(at *auth.Token, wr *prompbmarshal.WriteRequest) {
var rctx *relabelCtx
rcs := allRelabelConfigs.Load()
pcsGlobal := rcs.global
if pcsGlobal.Len() > 0 || len(labelsGlobal) > 0 {
if pcsGlobal.Len() > 0 {
rctx = getRelabelCtx()
}
tss := wr.Timeseries
@@ -386,7 +399,7 @@ func Push(at *auth.Token, wr *prompbmarshal.WriteRequest) {
}
if rctx != nil {
rowsCountBeforeRelabel := getRowsCount(tssBlock)
tssBlock = rctx.applyRelabeling(tssBlock, labelsGlobal, pcsGlobal)
tssBlock = rctx.applyRelabeling(tssBlock, pcsGlobal)
rowsCountAfterRelabel := getRowsCount(tssBlock)
rowsDroppedByGlobalRelabel.Add(rowsCountBeforeRelabel - rowsCountAfterRelabel)
}
@@ -418,11 +431,23 @@ func pushBlockToRemoteStorages(rwctxs []*remoteWriteCtx, tssBlock []prompbmarsha
if *shardByURL {
// Shard the data among rwctxs
tssByURL := make([][]prompbmarshal.TimeSeries, len(rwctxs))
tmpLabels := promutils.GetLabels()
for _, ts := range tssBlock {
h := getLabelsHash(ts.Labels)
hashLabels := ts.Labels
if len(shardByURLLabelsMap) > 0 {
hashLabels = tmpLabels.Labels[:0]
for _, label := range ts.Labels {
if _, ok := shardByURLLabelsMap[label.Name]; ok {
hashLabels = append(hashLabels, label)
}
}
}
h := getLabelsHash(hashLabels)
idx := h % uint64(len(tssByURL))
tssByURL[idx] = append(tssByURL[idx], ts)
}
promutils.PutLabels(tmpLabels)
// Push sharded data to remote storages in parallel in order to reduce
// the time needed for sending the data to multiple remote storage systems.
var wg sync.WaitGroup
@@ -559,14 +584,14 @@ type remoteWriteCtx struct {
rowsDroppedByRelabel *metrics.Counter
}
func newRemoteWriteCtx(argIdx int, at *auth.Token, remoteWriteURL *url.URL, maxInmemoryBlocks int, sanitizedURL string) *remoteWriteCtx {
func newRemoteWriteCtx(argIdx int, remoteWriteURL *url.URL, maxInmemoryBlocks int, sanitizedURL string) *remoteWriteCtx {
// strip query params, otherwise changing params resets pq
pqURL := *remoteWriteURL
pqURL.RawQuery = ""
pqURL.Fragment = ""
h := xxhash.Sum64([]byte(pqURL.String()))
queuePath := filepath.Join(*tmpDataPath, persistentQueueDirname, fmt.Sprintf("%d_%016X", argIdx+1, h))
maxPendingBytes := maxPendingBytesPerURL.GetOptionalArgOrDefault(argIdx, 0)
maxPendingBytes := maxPendingBytesPerURL.GetOptionalArg(argIdx)
if maxPendingBytes != 0 && maxPendingBytes < persistentqueue.DefaultChunkFileSize {
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4195
logger.Warnf("rounding the -remoteWrite.maxDiskUsagePerURL=%d to the minimum supported value: %d", maxPendingBytes, persistentqueue.DefaultChunkFileSize)
@@ -590,8 +615,8 @@ func newRemoteWriteCtx(argIdx int, at *auth.Token, remoteWriteURL *url.URL, maxI
c.init(argIdx, *queues, sanitizedURL)
// Initialize pss
sf := significantFigures.GetOptionalArgOrDefault(argIdx, 0)
rd := roundDigits.GetOptionalArgOrDefault(argIdx, 100)
sf := significantFigures.GetOptionalArg(argIdx)
rd := roundDigits.GetOptionalArg(argIdx)
pssLen := *queues
if n := cgroup.AvailableCPUs(); pssLen > n {
// There is no sense in running more than availableCPUs concurrent pendingSeries,
@@ -616,7 +641,7 @@ func newRemoteWriteCtx(argIdx int, at *auth.Token, remoteWriteURL *url.URL, maxI
// Initialize sas
sasFile := streamAggrConfig.GetOptionalArg(argIdx)
if sasFile != "" {
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(argIdx, 0)
dedupInterval := streamAggrDedupInterval.GetOptionalArg(argIdx)
sas, err := streamaggr.LoadFromFile(sasFile, rwctx.pushInternal, dedupInterval)
if err != nil {
logger.Fatalf("cannot initialize stream aggregators from -remoteWrite.streamAggr.config=%q: %s", sasFile, err)
@@ -668,7 +693,7 @@ func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
v = tssPool.Get().(*[]prompbmarshal.TimeSeries)
tss = append(*v, tss...)
rowsCountBeforeRelabel := getRowsCount(tss)
tss = rctx.applyRelabeling(tss, nil, pcs)
tss = rctx.applyRelabeling(tss, pcs)
rowsCountAfterRelabel := getRowsCount(tss)
rwctx.rowsDroppedByRelabel.Add(rowsCountBeforeRelabel - rowsCountAfterRelabel)
}
@@ -705,11 +730,13 @@ var matchIdxsPool bytesutil.ByteBufferPool
func dropAggregatedSeries(src []prompbmarshal.TimeSeries, matchIdxs []byte, dropInput bool) []prompbmarshal.TimeSeries {
dst := src[:0]
for i, match := range matchIdxs {
if match == 0 {
continue
if !dropInput {
for i, match := range matchIdxs {
if match == 1 {
continue
}
dst = append(dst, src[i])
}
dst = append(dst, src[i])
}
tail := src[len(dst):]
_ = prompbmarshal.ResetTimeSeries(tail)
@@ -717,22 +744,38 @@ func dropAggregatedSeries(src []prompbmarshal.TimeSeries, matchIdxs []byte, drop
}
func (rwctx *remoteWriteCtx) pushInternal(tss []prompbmarshal.TimeSeries) {
var rctx *relabelCtx
var v *[]prompbmarshal.TimeSeries
if len(labelsGlobal) > 0 {
// Make a copy of tss before adding extra labels in order to prevent
// from affecting time series for other remoteWrite.url configs.
rctx = getRelabelCtx()
v = tssPool.Get().(*[]prompbmarshal.TimeSeries)
tss = append(*v, tss...)
rctx.appendExtraLabels(tss, labelsGlobal)
}
pss := rwctx.pss
idx := atomic.AddUint64(&rwctx.pssNextIdx, 1) % uint64(len(pss))
pss[idx].Push(tss)
if rctx != nil {
*v = prompbmarshal.ResetTimeSeries(tss)
tssPool.Put(v)
putRelabelCtx(rctx)
}
}
func (rwctx *remoteWriteCtx) reinitStreamAggr() {
sas := rwctx.sas.Load()
if sas == nil {
sasFile := streamAggrConfig.GetOptionalArg(rwctx.idx)
if sasFile == "" {
// There is no stream aggregation for rwctx
return
}
sasFile := streamAggrConfig.GetOptionalArg(rwctx.idx)
logger.Infof("reloading stream aggregation configs pointed by -remoteWrite.streamAggr.config=%q", sasFile)
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_total{path=%q}`, sasFile)).Inc()
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(rwctx.idx, 0)
dedupInterval := streamAggrDedupInterval.GetOptionalArg(rwctx.idx)
sasNew, err := streamaggr.LoadFromFile(sasFile, rwctx.pushInternal, dedupInterval)
if err != nil {
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_errors_total{path=%q}`, sasFile)).Inc()
@@ -740,6 +783,7 @@ func (rwctx *remoteWriteCtx) reinitStreamAggr() {
logger.Errorf("cannot reload stream aggregation config from -remoteWrite.streamAggr.config=%q; continue using the previously loaded config; error: %s", sasFile, err)
return
}
sas := rwctx.sas.Load()
if !sasNew.Equal(sas) {
sasOld := rwctx.sas.Swap(sasNew)
sasOld.MustStop()
@@ -774,7 +818,7 @@ func CheckStreamAggrConfigs() error {
if sasFile == "" {
continue
}
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(idx, 0)
dedupInterval := streamAggrDedupInterval.GetOptionalArg(idx)
sas, err := streamaggr.LoadFromFile(sasFile, pushNoop, dedupInterval)
if err != nil {
return fmt.Errorf("cannot load -remoteWrite.streamAggr.config=%q: %w", sasFile, err)

View File

@@ -27,7 +27,7 @@ var (
stdDialerOnce sync.Once
)
func statDial(ctx context.Context, networkUnused, addr string) (conn net.Conn, err error) {
func statDial(ctx context.Context, _, addr string) (conn net.Conn, err error) {
network := netutil.GetTCPNetwork()
d := getStdDialer()
conn, err = d.DialContext(ctx, network, addr)

103
app/vmalert-tool/Makefile Normal file
View File

@@ -0,0 +1,103 @@
# All these commands must run from repository root.
vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) app-local
vmalert-tool-race:
APP_NAME=vmalert-tool RACE=-race $(MAKE) app-local
vmalert-tool-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker
vmalert-tool-pure-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-pure
vmalert-tool-linux-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-amd64
vmalert-tool-linux-arm-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm
vmalert-tool-linux-arm64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm64
vmalert-tool-linux-ppc64le-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-ppc64le
vmalert-tool-linux-386-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-386
vmalert-tool-darwin-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-amd64
vmalert-tool-darwin-arm64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-arm64
vmalert-tool-freebsd-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-freebsd-amd64
vmalert-tool-openbsd-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-openbsd-amd64
vmalert-tool-windows-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-windows-amd64
package-vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) package-via-docker
package-vmalert-tool-pure:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-pure
package-vmalert-tool-amd64:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-amd64
package-vmalert-tool-arm:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm
package-vmalert-tool-arm64:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm64
package-vmalert-tool-ppc64le:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-ppc64le
package-vmalert-tool-386:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-386
publish-vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) publish-via-docker
vmalert-tool-linux-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-linux-arm:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch
vmalert-tool-linux-arm64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch
vmalert-tool-linux-ppc64le:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch
vmalert-tool-linux-s390x:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch
vmalert-tool-linux-386:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch
vmalert-tool-darwin-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-darwin-arm64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch
vmalert-tool-freebsd-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-openbsd-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-windows-amd64:
GOARCH=amd64 APP_NAME=vmalert-tool $(MAKE) app-local-windows-goarch
vmalert-tool-pure:
APP_NAME=vmalert-tool $(MAKE) app-local-pure

246
app/vmalert-tool/README.md Normal file
View File

@@ -0,0 +1,246 @@
# vmalert-tool
VMAlert command-line tool
## Unit testing for rules
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
It will perform the following actions:
* sets up an isolated VictoriaMetrics instance;
* simulates the periodic ingestion of time series;
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
* checks whether the firing alerts or resulting recording rules match the expected results.
See how to run vmalert-tool for unit test below:
```
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
./vmalert-tool unittest --files test1.yaml --files test2.yaml
```
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
### Test file format
The configuration format for files specified in `--files` cmd-line flag is the following:
```yaml
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
rule_files:
[ - <string> ]
# The evaluation interval for rules specified in `rule_files`
[ evaluation_interval: <duration> | default = 1m ]
# Groups listed below will be evaluated by order.
# Not All the groups need not be mentioned, if not, they will be evaluated by define order in rule_files.
group_eval_order:
[ - <string> ]
# The list of unit test files to be checked during evaluation.
tests:
[ - <test_group> ]
```
#### `<test_group>`
```yaml
# Interval between samples for input series
interval: <duration>
# Time series to persist into the database according to configured <interval> before running tests.
input_series:
[ - <series> ]
# Name of the test group, optional
[ name: <string> ]
# Unit tests for alerting rules
alert_rule_test:
[ - <alert_test_case> ]
# Unit tests for Metricsql expressions.
metricsql_expr_test:
[ - <metricsql_expr_test> ]
# External labels accessible for templating.
external_labels:
[ <labelname>: <string> ... ]
```
#### `<series>`
```yaml
# series in the following format '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
series: <string>
# values support several special equations:
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
# Read this as series starts at a, then c further samples incrementing by b.
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
# '_' represents a missing sample from scrape
# 'stale' indicates a stale sample
# Examples:
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
values: <string>
```
#### `<alert_test_case>`
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
but no need to add them under `exp_alerts`.
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
```yaml
# The time elapsed from time=0s when this alerting rule should be checked.
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
eval_time: <duration>
# Name of the group name to be tested.
groupname: <string>
# Name of the alert to be tested.
alertname: <string>
# List of the expected alerts that are firing under the given alertname at
# the given evaluation time. If you want to test if an alerting rule should
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
exp_alerts:
[ - <alert> ]
```
#### `<alert>`
```yaml
# These are the expanded labels and annotations of the expected alert.
# Note: labels also include the labels of the sample associated with the alert
exp_labels:
[ <labelname>: <string> ]
exp_annotations:
[ <labelname>: <string> ]
```
#### `<metricsql_expr_test>`
```yaml
# Expression to evaluate
expr: <string>
# The time elapsed from time=0s when this expression be evaluated.
eval_time: <duration>
# Expected samples at the given evaluation time.
exp_samples:
[ - <sample> ]
```
#### `<sample>`
```yaml
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
labels: <string>
# The expected value of the Metricsql expression.
value: <number>
```
### Example
This is an example input file for unit testing which will pass.
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
#### `test.yaml`
```yaml
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="prometheus", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: prometheus
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123
```
#### `alerts.yaml`
```yaml
# This is the rules file.
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- name: group2
rules:
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
```

54
app/vmalert-tool/main.go Normal file
View File

@@ -0,0 +1,54 @@
package main
import (
"fmt"
"log"
"os"
"time"
"github.com/urfave/cli/v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert-tool/unittest"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
)
func main() {
start := time.Now()
app := &cli.App{
Name: "vmalert-tool",
Usage: "VMAlert command-line tool",
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html",
Version: buildinfo.Version,
Commands: []*cli.Command{
{
Name: "unittest",
Usage: "Run unittest for alerting and recording rules.",
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules",
Flags: []cli.Flag{
&cli.StringSliceFlag{
Name: "files",
Usage: "files to run unittest with. Supports an array of values separated by comma or specified via multiple flags.",
Required: true,
},
&cli.BoolFlag{
Name: "disableAlertgroupLabel",
Usage: "disable adding group's Name as label to generated alerts and time series.",
Required: false,
},
},
Action: func(c *cli.Context) error {
if failed := unittest.UnitTest(c.StringSlice("files"), c.Bool("disableAlertgroupLabel")); failed {
return fmt.Errorf("unittest failed")
}
return nil
},
},
},
}
err := app.Run(os.Args)
if err != nil {
log.Fatalln(err)
}
log.Printf("Total time: %v", time.Since(start))
}

View File

@@ -0,0 +1,12 @@
# See https://medium.com/on-docker/use-multi-stage-builds-to-inject-ca-certs-ad1e8f01de1b
ARG certs_image
ARG root_image
FROM $certs_image as certs
RUN apk update && apk upgrade && apk --update --no-cache add ca-certificates
FROM $root_image
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
EXPOSE 8429
ENTRYPOINT ["/vmalert-tool-prod"]
ARG TARGETARCH
COPY vmalert-tool-linux-${TARGETARCH}-prod ./vmalert-tool-prod

View File

@@ -0,0 +1,19 @@
package unittest
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
// alertTestCase holds alert_rule_test cases defined in test file
type alertTestCase struct {
EvalTime *promutils.Duration `yaml:"eval_time"`
GroupName string `yaml:"groupname"`
Alertname string `yaml:"alertname"`
ExpAlerts []expAlert `yaml:"exp_alerts"`
}
// expAlert holds exp_alerts defined in test file
type expAlert struct {
ExpLabels map[string]string `yaml:"exp_labels"`
ExpAnnotations map[string]string `yaml:"exp_annotations"`
}

View File

@@ -0,0 +1,182 @@
package unittest
import (
"bytes"
"fmt"
"io"
"net/http"
"regexp"
"strconv"
"strings"
"time"
testutil "github.com/VictoriaMetrics/VictoriaMetrics/app/victoria-metrics/test"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metricsql"
)
// series holds input_series defined in the test file
type series struct {
Series string `yaml:"series"`
Values string `yaml:"values"`
}
// sequenceValue is an omittable value in a sequence of time series values.
type sequenceValue struct {
Value float64
Omitted bool
}
func httpWrite(address string, r io.Reader) {
resp, err := http.Post(address, "", r)
if err != nil {
logger.Fatalf("failed to send to storage: %v", err)
}
resp.Body.Close()
}
// writeInputSeries send input series to vmstorage and flush them
func writeInputSeries(input []series, interval *promutils.Duration, startStamp time.Time, dst string) error {
r := testutil.WriteRequest{}
for _, data := range input {
expr, err := metricsql.Parse(data.Series)
if err != nil {
return fmt.Errorf("failed to parse series %s: %v", data.Series, err)
}
promvals, err := parseInputValue(data.Values, true)
if err != nil {
return fmt.Errorf("failed to parse input series value %s: %v", data.Values, err)
}
metricExpr, ok := expr.(*metricsql.MetricExpr)
if !ok {
return fmt.Errorf("failed to parse series %s to metric expr: %v", data.Series, err)
}
samples := make([]testutil.Sample, 0, len(promvals))
ts := startStamp
for _, v := range promvals {
if !v.Omitted {
samples = append(samples, testutil.Sample{
Timestamp: ts.UnixMilli(),
Value: v.Value,
})
}
ts = ts.Add(interval.Duration())
}
var ls []testutil.Label
for _, filter := range metricExpr.LabelFilterss[0] {
ls = append(ls, testutil.Label{Name: filter.Label, Value: filter.Value})
}
r.Timeseries = append(r.Timeseries, testutil.TimeSeries{Labels: ls, Samples: samples})
}
data, err := testutil.Compress(r)
if err != nil {
return fmt.Errorf("failed to compress data: %v", err)
}
// write input series to vm
httpWrite(dst, bytes.NewBuffer(data))
vmstorage.Storage.DebugFlush()
return nil
}
// parseInputValue support input like "1", "1+1x1 _ -4 3+20x1", see more examples in test.
func parseInputValue(input string, origin bool) ([]sequenceValue, error) {
var res []sequenceValue
items := strings.Split(input, " ")
reg := regexp.MustCompile(`\D?\d*\D?`)
for _, item := range items {
if item == "stale" {
res = append(res, sequenceValue{Value: decimal.StaleNaN})
continue
}
vals := reg.FindAllString(item, -1)
switch len(vals) {
case 1:
if vals[0] == "_" {
res = append(res, sequenceValue{Omitted: true})
continue
}
v, err := strconv.ParseFloat(vals[0], 64)
if err != nil {
return nil, err
}
res = append(res, sequenceValue{Value: v})
continue
case 2:
p1 := vals[0][:len(vals[0])-1]
v2, err := strconv.ParseInt(vals[1], 10, 64)
if err != nil {
return nil, err
}
option := vals[0][len(vals[0])-1]
switch option {
case '+':
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
res = append(res, sequenceValue{Value: v1 + float64(v2)})
case 'x':
for i := int64(0); i <= v2; i++ {
if p1 == "_" {
if i == 0 {
i = 1
}
res = append(res, sequenceValue{Omitted: true})
continue
}
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
if !origin || v1 == 0 {
res = append(res, sequenceValue{Value: v1 * float64(i)})
continue
}
newVal := fmt.Sprintf("%s+0x%s", p1, vals[1])
newRes, err := parseInputValue(newVal, false)
if err != nil {
return nil, err
}
res = append(res, newRes...)
break
}
default:
return nil, fmt.Errorf("got invalid operation %b", option)
}
case 3:
r1, err := parseInputValue(fmt.Sprintf("%s%s", vals[1], vals[2]), false)
if err != nil {
return nil, err
}
p1 := vals[0][:len(vals[0])-1]
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
option := vals[0][len(vals[0])-1]
var isAdd bool
if option == '+' {
isAdd = true
}
for _, r := range r1 {
if isAdd {
res = append(res, sequenceValue{
Value: r.Value + v1,
})
} else {
res = append(res, sequenceValue{
Value: v1 - r.Value,
})
}
}
default:
return nil, fmt.Errorf("unsupported input %s", input)
}
}
return res, nil
}

View File

@@ -0,0 +1,93 @@
package unittest
import (
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
)
func TestParseInputValue(t *testing.T) {
testCases := []struct {
input string
exp []sequenceValue
failed bool
}{
{
"",
nil,
true,
},
{
"testfailed",
nil,
true,
},
// stale doesn't support operations
{
"stalex3",
nil,
true,
},
{
"-4",
[]sequenceValue{{Value: -4}},
false,
},
{
"_",
[]sequenceValue{{Omitted: true}},
false,
},
{
"stale",
[]sequenceValue{{Value: decimal.StaleNaN}},
false,
},
{
"-4x1",
[]sequenceValue{{Value: -4}, {Value: -4}},
false,
},
{
"_x1",
[]sequenceValue{{Omitted: true}},
false,
},
{
"1+1x4",
[]sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 4}, {Value: 5}},
false,
},
{
"2-1x4",
[]sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}},
false,
},
{
"1+1x1 _ -4 stale 3+20x1",
[]sequenceValue{{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4}, {Value: decimal.StaleNaN}, {Value: 3}, {Value: 23}},
false,
},
}
for _, tc := range testCases {
output, err := parseInputValue(tc.input, true)
if err != nil != tc.failed {
t.Fatalf("failed to parse %s, expect %t, got %t", tc.input, tc.failed, err != nil)
}
if len(tc.exp) != len(output) {
t.Fatalf("expect %v, got %v", tc.exp, output)
}
for i := 0; i < len(tc.exp); i++ {
if tc.exp[i].Omitted != output[i].Omitted {
t.Fatalf("expect %v, got %v", tc.exp, output)
}
if tc.exp[i].Value != output[i].Value {
if decimal.IsStaleNaN(tc.exp[i].Value) && decimal.IsStaleNaN(output[i].Value) {
continue
}
t.Fatalf("expect %v, got %v", tc.exp, output)
}
}
}
}

View File

@@ -0,0 +1,100 @@
package unittest
import (
"context"
"fmt"
"net/url"
"reflect"
"sort"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metricsql"
)
// metricsqlTestCase holds metricsql_expr_test cases defined in test file
type metricsqlTestCase struct {
Expr string `yaml:"expr"`
EvalTime *promutils.Duration `yaml:"eval_time"`
ExpSamples []expSample `yaml:"exp_samples"`
}
type expSample struct {
Labels string `yaml:"labels"`
Value float64 `yaml:"value"`
}
// checkMetricsqlCase will check metricsql_expr_test cases
func checkMetricsqlCase(cases []metricsqlTestCase, q datasource.QuerierBuilder) (checkErrs []error) {
queries := q.BuildWithParams(datasource.QuerierParams{QueryParams: url.Values{"nocache": {"1"}, "latency_offset": {"1ms"}}, DataSourceType: "prometheus"})
Outer:
for _, mt := range cases {
result, _, err := queries.Query(context.Background(), mt.Expr, durationToTime(mt.EvalTime))
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf(" expr: %q, time: %s, err: %w", mt.Expr,
mt.EvalTime.Duration().String(), err))
continue
}
var gotSamples []parsedSample
for _, s := range result.Data {
sort.Slice(s.Labels, func(i, j int) bool {
return s.Labels[i].Name < s.Labels[j].Name
})
gotSamples = append(gotSamples, parsedSample{
Labels: s.Labels,
Value: s.Values[0],
})
}
var expSamples []parsedSample
for _, s := range mt.ExpSamples {
expLb := datasource.Labels{}
if s.Labels != "" {
metricsqlExpr, err := metricsql.Parse(s.Labels)
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
mt.EvalTime.Duration().String(), fmt.Errorf("failed to parse labels %q: %w", s.Labels, err)))
continue Outer
}
metricsqlMetricExpr, ok := metricsqlExpr.(*metricsql.MetricExpr)
if !ok {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
mt.EvalTime.Duration().String(), fmt.Errorf("got unsupported metricsql type")))
continue Outer
}
for _, l := range metricsqlMetricExpr.LabelFilterss[0] {
expLb = append(expLb, datasource.Label{
Name: l.Label,
Value: l.Value,
})
}
}
sort.Slice(expLb, func(i, j int) bool {
return expLb[i].Name < expLb[j].Name
})
expSamples = append(expSamples, parsedSample{
Labels: expLb,
Value: s.Value,
})
}
sort.Slice(expSamples, func(i, j int) bool {
return datasource.LabelCompare(expSamples[i].Labels, expSamples[j].Labels) <= 0
})
sort.Slice(gotSamples, func(i, j int) bool {
return datasource.LabelCompare(gotSamples[i].Labels, gotSamples[j].Labels) <= 0
})
if !reflect.DeepEqual(expSamples, gotSamples) {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s,\n exp: %v\n got: %v", mt.Expr,
mt.EvalTime.Duration().String(), parsedSamplesString(expSamples), parsedSamplesString(gotSamples)))
}
}
return
}
func durationToTime(pd *promutils.Duration) time.Time {
if pd == nil {
return time.Time{}
}
return time.UnixMilli(pd.Duration().Milliseconds())
}

View File

@@ -0,0 +1,43 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="vmagent2", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
- eval_time: 2h
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
- eval_time: 0
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View File

@@ -0,0 +1,49 @@
rule_files:
- rules.yaml
tests:
- interval: 1m
name: "Failing test"
input_series:
- series: test
values: "0"
metricsql_expr_test:
- expr: test
eval_time: 0m
exp_samples:
- value: 0
labels: test
# will failed cause there is no "Test" group and rule defined
alert_rule_test:
- eval_time: 0m
groupname: Test
alertname: Test
exp_alerts:
- exp_labels: {}
- interval: 1m
name: Failing alert test
input_series:
- series: 'up{job="test"}'
values: 0x10
alert_rule_test:
# will failed cause rule is firing
- eval_time: 5m
groupname: group1
alertname: InstanceDown
exp_alerts: []
- interval: 1m
name: Failing alert test with missing groupname
input_series:
- series: 'up{job="test"}'
values: 0x10
alert_rule_test:
# will failed cause missing groupname
- eval_time: 5m
alertname: AlwaysFiring
exp_alerts: []

View File

@@ -0,0 +1,30 @@
# can be executed successfully but will take more than 1 minute
# not included in unit test now
evaluation_interval: 100d
rule_files:
- rules.yaml
tests:
- interval: 1d
input_series:
- series: test
# Max time in time.Duration is 106751d from 1970 (2^63/10^9), i.e. 2262.
# But VictoriaMetrics supports maxTimestamp value +2 days from now. see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/827.
# We input series to 2024-01-01T00:00:00 here.
values: "0+1x19723"
metricsql_expr_test:
- expr: timestamp(test)
eval_time: 0m
exp_samples:
- value: 0
- expr: test
eval_time: 100d
exp_samples:
- labels: test
value: 100
- expr: timestamp(test)
eval_time: 19000d
exp_samples:
- value: 1641600000 # 19000d -> seconds.

View File

@@ -0,0 +1,39 @@
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- alert: SameAlertNameWithDifferentGroup
expr: absent(test)
for: 1m
- name: group2
rules:
- record: t1
expr: test
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
- alert: SameAlertNameWithDifferentGroup
expr: absent(test)
for: 5m
- name: group3
rules:
- record: t2
expr: t1
- name: group4
rules:
- record: t3
expr: t1

View File

@@ -0,0 +1,99 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
group_eval_order: ["group4", "group2", "group3"]
tests:
- interval: 1m
name: "basic test"
input_series:
- series: "test"
values: "_x5 1x5 _ stale"
alert_rule_test:
- eval_time: 1m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts:
- {}
- eval_time: 1m
groupname: group2
alertname: SameAlertNameWithDifferentGroup
exp_alerts: []
- eval_time: 6m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts: []
metricsql_expr_test:
- expr: test
eval_time: 11m
exp_samples:
- labels: '{__name__="test"}'
value: 1
- expr: test
eval_time: 12m
exp_samples: []
- interval: 1m
name: "basic test2"
input_series:
- series: 'up{job="vmagent1", instance="localhost:9090"}'
values: "0+0x1440"
- series: "test"
values: "0+1x1440"
metricsql_expr_test:
- expr: count(ALERTS) by (alertgroup, alertname, alertstate)
eval_time: 4m
exp_samples:
- labels: '{alertgroup="group1", alertname="AlwaysFiring", alertstate="firing"}'
value: 1
- labels: '{alertgroup="group1", alertname="InstanceDown", alertstate="pending"}'
value: 1
- expr: t1
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t1", datacenter="dc-123"}'
- expr: t2
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t2", datacenter="dc-123"}'
- expr: t3
eval_time: 4m
exp_samples:
# t3 is 3 instead of 4 cause it's rules3 is evaluated before rules1
- value: 3
labels: '{__name__="t3", datacenter="dc-123"}'
alert_rule_test:
- eval_time: 10m
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent1
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent1 has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: alerts
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View File

@@ -0,0 +1,46 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="vmagent2", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View File

@@ -0,0 +1,83 @@
package unittest
import (
"fmt"
"strconv"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
// parsedSample is a sample with parsed Labels
type parsedSample struct {
Labels datasource.Labels
Value float64
}
func (ps *parsedSample) String() string {
return ps.Labels.String() + " " + strconv.FormatFloat(ps.Value, 'E', -1, 64)
}
func parsedSamplesString(pss []parsedSample) string {
if len(pss) == 0 {
return "nil"
}
s := pss[0].String()
for _, ps := range pss[1:] {
s += ", " + ps.String()
}
return s
}
// labelAndAnnotation holds labels and annotations
type labelAndAnnotation struct {
Labels datasource.Labels
Annotations datasource.Labels
}
func (la *labelAndAnnotation) String() string {
return "Labels:" + la.Labels.String() + "\nAnnotations:" + la.Annotations.String()
}
// labelsAndAnnotations is collection of LabelAndAnnotation
type labelsAndAnnotations []labelAndAnnotation
func (la labelsAndAnnotations) Len() int { return len(la) }
func (la labelsAndAnnotations) Swap(i, j int) { la[i], la[j] = la[j], la[i] }
func (la labelsAndAnnotations) Less(i, j int) bool {
diff := datasource.LabelCompare(la[i].Labels, la[j].Labels)
if diff != 0 {
return diff < 0
}
return datasource.LabelCompare(la[i].Annotations, la[j].Annotations) < 0
}
func (la labelsAndAnnotations) String() string {
if len(la) == 0 {
return "[]"
}
s := "[\n0:" + indentLines("\n"+la[0].String(), " ")
for i, l := range la[1:] {
s += ",\n" + fmt.Sprintf("%d", i+1) + ":" + indentLines("\n"+l.String(), " ")
}
s += "\n]"
return s
}
// indentLines prefixes each line in the supplied string with the given "indent" string.
func indentLines(lines, indent string) string {
sb := strings.Builder{}
n := strings.Split(lines, "\n")
for i, l := range n {
if i > 0 {
sb.WriteString(indent)
}
sb.WriteString(l)
if i != len(n)-1 {
sb.WriteRune('\n')
}
}
return sb.String()
}

View File

@@ -0,0 +1,443 @@
package unittest
import (
"context"
"flag"
"fmt"
"net/http"
"os"
"path/filepath"
"reflect"
"sort"
"time"
"gopkg.in/yaml.v2"
vmalertconfig "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metrics"
)
var (
storagePath string
httpListenAddr = ":8880"
// insert series from 1970-01-01T00:00:00
testStartTime = time.Unix(0, 0).UTC()
testPromWriteHTTPPath = "http://127.0.0.1" + httpListenAddr + "/api/v1/write"
testDataSourcePath = "http://127.0.0.1" + httpListenAddr + "/prometheus"
testRemoteWritePath = "http://127.0.0.1" + httpListenAddr
testHealthHTTPPath = "http://127.0.0.1" + httpListenAddr + "/health"
disableAlertgroupLabel bool
)
const (
testStoragePath = "vmalert-unittest"
testLogLevel = "ERROR"
)
// UnitTest runs unittest for files
func UnitTest(files []string, disableGroupLabel bool) bool {
if err := templates.Load([]string{}, true); err != nil {
logger.Fatalf("failed to load template: %v", err)
}
storagePath = filepath.Join(os.TempDir(), testStoragePath)
processFlags()
vminsert.Init()
vmselect.Init()
// storagePath will be created again when closing vmselect, so remove it again.
defer fs.MustRemoveAll(storagePath)
defer vminsert.Stop()
defer vmselect.Stop()
disableAlertgroupLabel = disableGroupLabel
return rulesUnitTest(files)
}
func rulesUnitTest(files []string) bool {
var failed bool
for _, f := range files {
if err := ruleUnitTest(f); err != nil {
fmt.Println(" FAILED")
fmt.Printf("\nfailed to run unit test for file %q: \n%v", f, err)
failed = true
} else {
fmt.Println(" SUCCESS")
}
}
return failed
}
func ruleUnitTest(filename string) []error {
fmt.Println("\nUnit Testing: ", filename)
b, err := os.ReadFile(filename)
if err != nil {
return []error{fmt.Errorf("failed to read file: %w", err)}
}
var unitTestInp unitTestFile
if err := yaml.UnmarshalStrict(b, &unitTestInp); err != nil {
return []error{fmt.Errorf("failed to unmarshal file: %w", err)}
}
if err := resolveAndGlobFilepaths(filepath.Dir(filename), &unitTestInp); err != nil {
return []error{fmt.Errorf("failed to resolve path for `rule_files`: %w", err)}
}
if unitTestInp.EvaluationInterval.Duration() == 0 {
fmt.Println("evaluation_interval set to 1m by default")
unitTestInp.EvaluationInterval = &promutils.Duration{D: 1 * time.Minute}
}
groupOrderMap := make(map[string]int)
for i, gn := range unitTestInp.GroupEvalOrder {
if _, ok := groupOrderMap[gn]; ok {
return []error{fmt.Errorf("group name repeated in `group_eval_order`: %s", gn)}
}
groupOrderMap[gn] = i
}
testGroups, err := vmalertconfig.Parse(unitTestInp.RuleFiles, nil, true)
if err != nil {
return []error{fmt.Errorf("failed to parse `rule_files`: %w", err)}
}
var errs []error
for _, t := range unitTestInp.Tests {
if err := verifyTestGroup(t); err != nil {
errs = append(errs, err)
continue
}
testErrs := t.test(unitTestInp.EvaluationInterval.Duration(), groupOrderMap, testGroups)
errs = append(errs, testErrs...)
}
if len(errs) > 0 {
return errs
}
return nil
}
func verifyTestGroup(group testGroup) error {
var testGroupName string
if group.TestGroupName != "" {
testGroupName = fmt.Sprintf("testGroupName: %s\n", group.TestGroupName)
}
for _, at := range group.AlertRuleTests {
if at.Alertname == "" {
return fmt.Errorf("\n%s missing required filed \"alertname\"", testGroupName)
}
if !disableAlertgroupLabel && at.GroupName == "" {
return fmt.Errorf("\n%s missing required filed \"groupname\" when flag \"disableAlertGroupLabel\" is false", testGroupName)
}
if disableAlertgroupLabel && at.GroupName != "" {
return fmt.Errorf("\n%s shouldn't set filed \"groupname\" when flag \"disableAlertGroupLabel\" is true", testGroupName)
}
if at.EvalTime == nil {
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
}
}
for _, et := range group.MetricsqlExprTests {
if et.Expr == "" {
return fmt.Errorf("\n%s missing required filed \"expr\"", testGroupName)
}
if et.EvalTime == nil {
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
}
}
return nil
}
func processFlags() {
flag.Parse()
for _, fv := range []struct {
flag string
value string
}{
{flag: "storageDataPath", value: storagePath},
{flag: "loggerLevel", value: testLogLevel},
{flag: "search.disableCache", value: "true"},
// set storage retention time to 100 years, allow to store series from 1970-01-01T00:00:00.
{flag: "retentionPeriod", value: "100y"},
{flag: "datasource.url", value: testDataSourcePath},
{flag: "remoteWrite.url", value: testRemoteWritePath},
} {
// panics if flag doesn't exist
if err := flag.Lookup(fv.flag).Value.Set(fv.value); err != nil {
logger.Fatalf("unable to set %q with value %q, err: %v", fv.flag, fv.value, err)
}
}
}
func setUp() {
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
go httpserver.Serve(httpListenAddr, false, func(w http.ResponseWriter, r *http.Request) bool {
switch r.URL.Path {
case "/prometheus/api/v1/query":
if err := prometheus.QueryHandler(nil, time.Now(), w, r); err != nil {
httpserver.Errorf(w, r, "%s", err)
}
return true
case "/prometheus/api/v1/write", "/api/v1/write":
if err := promremotewrite.InsertHandler(r); err != nil {
httpserver.Errorf(w, r, "%s", err)
}
return true
default:
}
return false
})
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
readyCheckFunc := func() bool {
resp, err := http.Get(testHealthHTTPPath)
if err != nil {
return false
}
_ = resp.Body.Close()
return resp.StatusCode == 200
}
checkCheck:
for {
select {
case <-ctx.Done():
logger.Fatalf("http server can't be ready in 30s")
default:
if readyCheckFunc() {
break checkCheck
}
time.Sleep(3 * time.Second)
}
}
}
func tearDown() {
if err := httpserver.Stop(httpListenAddr); err != nil {
logger.Errorf("cannot stop the webservice: %s", err)
}
vmstorage.Stop()
metrics.UnregisterAllMetrics()
fs.MustRemoveAll(storagePath)
}
// resolveAndGlobFilepaths joins all relative paths in a configuration
// with a given base directory and replaces all globs with matching files.
func resolveAndGlobFilepaths(baseDir string, utf *unitTestFile) error {
for i, rf := range utf.RuleFiles {
if rf != "" && !filepath.IsAbs(rf) {
utf.RuleFiles[i] = filepath.Join(baseDir, rf)
}
}
var globbedFiles []string
for _, rf := range utf.RuleFiles {
m, err := filepath.Glob(rf)
if err != nil {
return err
}
if len(m) == 0 {
fmt.Fprintln(os.Stderr, " WARNING: no file match pattern", rf)
}
globbedFiles = append(globbedFiles, m...)
}
utf.RuleFiles = globbedFiles
return nil
}
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group) (checkErrs []error) {
// set up vmstorage and http server for ingest and read queries
setUp()
// tear down vmstorage and clean the data dir
defer tearDown()
err := writeInputSeries(tg.InputSeries, tg.Interval, testStartTime, testPromWriteHTTPPath)
if err != nil {
return []error{err}
}
q, err := datasource.Init(nil)
if err != nil {
return []error{fmt.Errorf("failed to init datasource: %v", err)}
}
rw, err := remotewrite.NewDebugClient()
if err != nil {
return []error{fmt.Errorf("failed to init wr: %v", err)}
}
alertEvalTimesMap := map[time.Duration]struct{}{}
alertExpResultMap := map[time.Duration]map[string]map[string][]expAlert{}
for _, at := range tg.AlertRuleTests {
et := at.EvalTime.Duration()
alertEvalTimesMap[et] = struct{}{}
if _, ok := alertExpResultMap[et]; !ok {
alertExpResultMap[et] = make(map[string]map[string][]expAlert)
}
if _, ok := alertExpResultMap[et][at.GroupName]; !ok {
alertExpResultMap[et][at.GroupName] = make(map[string][]expAlert)
}
alertExpResultMap[et][at.GroupName][at.Alertname] = at.ExpAlerts
}
alertEvalTimes := make([]time.Duration, 0, len(alertEvalTimesMap))
for k := range alertEvalTimesMap {
alertEvalTimes = append(alertEvalTimes, k)
}
sort.Slice(alertEvalTimes, func(i, j int) bool {
return alertEvalTimes[i] < alertEvalTimes[j]
})
// sort group eval order according to the given "group_eval_order".
sort.Slice(testGroups, func(i, j int) bool {
return groupOrderMap[testGroups[i].Name] < groupOrderMap[testGroups[j].Name]
})
// create groups with given rule
var groups []*rule.Group
for _, group := range testGroups {
ng := rule.NewGroup(group, q, time.Minute, tg.ExternalLabels)
groups = append(groups, ng)
}
evalIndex := 0
maxEvalTime := testStartTime.Add(tg.maxEvalTime())
for ts := testStartTime; ts.Before(maxEvalTime) || ts.Equal(maxEvalTime); ts = ts.Add(evalInterval) {
for _, g := range groups {
errs := g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, rw, ts)
for err := range errs {
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf("\nfailed to exec group: %q, time: %s, err: %w", g.Name,
ts, err))
}
}
// flush series after each group evaluation
vmstorage.Storage.DebugFlush()
}
// check alert_rule_test case at every eval time
for evalIndex < len(alertEvalTimes) {
if ts.Sub(testStartTime) > alertEvalTimes[evalIndex] ||
alertEvalTimes[evalIndex] >= ts.Add(evalInterval).Sub(testStartTime) {
break
}
gotAlertsMap := map[string]map[string]labelsAndAnnotations{}
for _, g := range groups {
if disableAlertgroupLabel {
g.Name = ""
}
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name]; !ok {
continue
}
if _, ok := gotAlertsMap[g.Name]; !ok {
gotAlertsMap[g.Name] = make(map[string]labelsAndAnnotations)
}
for _, r := range g.Rules {
ar, isAlertRule := r.(*rule.AlertingRule)
if !isAlertRule {
continue
}
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name][ar.Name]; ok {
for _, got := range ar.GetAlerts() {
if got.State != notifier.StateFiring {
continue
}
if disableAlertgroupLabel {
delete(got.Labels, "alertgroup")
}
laa := labelAndAnnotation{
Labels: datasource.ConvertToLabels(got.Labels),
Annotations: datasource.ConvertToLabels(got.Annotations),
}
gotAlertsMap[g.Name][ar.Name] = append(gotAlertsMap[g.Name][ar.Name], laa)
}
}
}
}
for groupname, gres := range alertExpResultMap[alertEvalTimes[evalIndex]] {
for alertname, res := range gres {
var expAlerts labelsAndAnnotations
for _, expAlert := range res {
if expAlert.ExpLabels == nil {
expAlert.ExpLabels = make(map[string]string)
}
// alertGroupNameLabel is added as additional labels when `disableAlertGroupLabel` is false
if !disableAlertgroupLabel {
expAlert.ExpLabels["alertgroup"] = groupname
}
// alertNameLabel is added as additional labels in vmalert.
expAlert.ExpLabels["alertname"] = alertname
expAlerts = append(expAlerts, labelAndAnnotation{
Labels: datasource.ConvertToLabels(expAlert.ExpLabels),
Annotations: datasource.ConvertToLabels(expAlert.ExpAnnotations),
})
}
sort.Sort(expAlerts)
gotAlerts := gotAlertsMap[groupname][alertname]
sort.Sort(gotAlerts)
if !reflect.DeepEqual(expAlerts, gotAlerts) {
var testGroupName string
if tg.TestGroupName != "" {
testGroupName = fmt.Sprintf("testGroupName: %s,\n", tg.TestGroupName)
}
expString := indentLines(expAlerts.String(), " ")
gotString := indentLines(gotAlerts.String(), " ")
checkErrs = append(checkErrs, fmt.Errorf("\n%s groupname: %s, alertname: %s, time: %s, \n exp:%v, \n got:%v ",
testGroupName, groupname, alertname, alertEvalTimes[evalIndex].String(), expString, gotString))
}
}
}
evalIndex++
}
}
checkErrs = append(checkErrs, checkMetricsqlCase(tg.MetricsqlExprTests, q)...)
return checkErrs
}
// unitTestFile holds the contents of a single unit test file
type unitTestFile struct {
RuleFiles []string `yaml:"rule_files"`
EvaluationInterval *promutils.Duration `yaml:"evaluation_interval"`
GroupEvalOrder []string `yaml:"group_eval_order"`
Tests []testGroup `yaml:"tests"`
}
// testGroup is a group of input series and test cases associated with it
type testGroup struct {
Interval *promutils.Duration `yaml:"interval"`
InputSeries []series `yaml:"input_series"`
AlertRuleTests []alertTestCase `yaml:"alert_rule_test"`
MetricsqlExprTests []metricsqlTestCase `yaml:"metricsql_expr_test"`
ExternalLabels map[string]string `yaml:"external_labels"`
TestGroupName string `yaml:"name"`
}
// maxEvalTime returns the max eval time among all alert_rule_test and metricsql_expr_test
func (tg *testGroup) maxEvalTime() time.Duration {
var maxd time.Duration
for _, alert := range tg.AlertRuleTests {
if alert.EvalTime.Duration() > maxd {
maxd = alert.EvalTime.Duration()
}
}
for _, met := range tg.MetricsqlExprTests {
if met.EvalTime.Duration() > maxd {
maxd = met.EvalTime.Duration()
}
}
return maxd
}

View File

@@ -0,0 +1,47 @@
package unittest
import (
"os"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
)
func TestMain(m *testing.M) {
if err := templates.Load([]string{}, true); err != nil {
os.Exit(1)
}
os.Exit(m.Run())
}
func TestUnitRule(t *testing.T) {
testCases := []struct {
name string
disableGroupLabel bool
files []string
failed bool
}{
{
name: "run multi files",
files: []string{"./testdata/test1.yaml", "./testdata/test2.yaml"},
failed: false,
},
{
name: "disable group label",
disableGroupLabel: true,
files: []string{"./testdata/disable-group-label.yaml"},
failed: false,
},
{
name: "failing test",
files: []string{"./testdata/failed-test.yaml"},
failed: true,
},
}
for _, tc := range testCases {
fail := UnitTest(tc.files, tc.disableGroupLabel)
if fail != tc.failed {
t.Fatalf("failed to test %s, expect %t, got %t", tc.name, tc.failed, fail)
}
}
}

View File

@@ -112,18 +112,41 @@ name: <string>
# How often rules in the group are evaluated.
[ interval: <duration> | default = -evaluationInterval flag ]
# Optional
# Group will be evaluated at the exact offset in the range of [0...interval].
# E.g. for Group with `interval: 1h` and `eval_offset: 5m` the evaluation will
# start at 5th minute of the hour. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3409
# `eval_offset` can't be bigger than `interval`.
[ eval_offset: <duration> ]
# Optional
# Adjust the `time` parameter of group evaluation requests to compensate intentional query delay from the datasource.
# By default, the value is inherited from the `-rule.evalDelay` cmd-line flag - see its description for details.
# If group has `latency_offset` set in `params`, then it is recommended to set `eval_delay` equal to `latency_offset`.
# See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155 and https://docs.victoriametrics.com/keyConcepts.html#query-latency.
[ eval_delay: <duration> ]
# Limit the number of alerts an alerting rule and series a recording
# rule can produce. 0 is no limit.
[ limit: <int> | default = 0 ]
# How many rules execute at once within a group. Increasing concurrency may speed
# up round execution speed.
# up group's evaluation duration (exposed via `vmalert_iteration_duration_seconds` metric).
[ concurrency: <integer> | default = 1 ]
# Optional type for expressions inside the rules. Supported values: "graphite" and "prometheus".
# By default "prometheus" type is used.
# By default, "prometheus" type is used.
[ type: <string> ]
# Optional
# The evaluation timestamp will be aligned with group's interval,
# instead of using the actual timestamp that evaluation happens at.
# By default, it's enabled to get more predictable results
# and to visually align with results plotted via Grafana or vmui.
# See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049
# Available starting from v1.95
[ eval_alignment: <bool> | default true]
# Optional list of HTTP URL parameters
# applied for all rules requests within a group
# For example:
@@ -423,7 +446,7 @@ If `-clusterMode` is enabled and the `tenant` in a particular group is missing,
is obtained from `-defaultTenant.prometheus` or `-defaultTenant.graphite` depending on the `type` of the group.
The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files
at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise`
at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) and in `*-enterprise`
tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).
### Reading rules from object storage
@@ -519,14 +542,20 @@ Alertmanagers.
To avoid recording rules results and alerts state duplication in VictoriaMetrics server
don't forget to configure [deduplication](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#deduplication).
The recommended value for `-dedup.minScrapeInterval` must be multiple of vmalert's `evaluation_interval`.
If you observe inconsistent or "jumping" values in series produced by vmalert, try disabling `-datasource.queryTimeAlignment`
command line flag. Because of alignment, two or more vmalert HA pairs will produce results with the same timestamps.
But due of backfilling (data delivered to the datasource with some delay) values of such results may differ,
which would affect deduplication logic and result into "jumping" datapoints.
Multiple equally configured vmalerts should evaluate rules at the same timestamps, so it is recommended
to set `-dedup.minScrapeInterval` as equal to vmalert's `-evaluationInterval`.
If you have multiple different `interval` params for distinct rule groups, then set `-dedup.minScrapeInterval` to
the biggest `interval` value, or value which will be a multiple for all `interval` values. For example, if you have
two groups with `interval: 10s` and `interval: 15s`, then set `-dedup.minScrapeInterval=30s`. This would consistently
keep only a single data point on 30s time interval for all rules. However, try to avoid having inconsistent `interval`
values.
It is not recommended having `-dedup.minScrapeInterval` smaller than `-evaluationInterval`, as it may produce
results with inconsistent intervals between data points.
Alertmanager will automatically deduplicate alerts with identical labels, so ensure that
all `vmalert`s are having the same config.
all `vmalert`s are having identical config.
Don't forget to configure [cluster mode](https://prometheus.io/docs/alerting/latest/alertmanager/)
for Alertmanagers for better reliability. List all Alertmanager URLs in vmalert `-notifier.url`
@@ -742,6 +771,11 @@ See full description for these flags in `./vmalert -help`.
* `limit` group's param has no effect during replay (might be changed in future);
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
## Unit Testing for Rules
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
## Monitoring
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
@@ -771,15 +805,13 @@ may get empty response from the datasource and produce empty recording rules or
Try the following recommendations to reduce the chance of hitting the data delay issue:
* Always configure group's `evaluationInterval` to be bigger or at least equal to
* Always configure group's `-evaluationInterval` to be bigger or at least equal to
[time series resolution](https://docs.victoriametrics.com/keyConcepts.html#time-series-resolution);
* Ensure that `[duration]` value is at least twice bigger than
[time series resolution](https://docs.victoriametrics.com/keyConcepts.html#time-series-resolution). For example,
if expression is `rate(my_metric[2m]) > 0` then ensure that `my_metric` resolution is at least `1m` or better `30s`.
If you use VictoriaMetrics as datasource, `[duration]` can be omitted and VictoriaMetrics will adjust it automatically.
* If you know in advance, that data in datasource is delayed - try changing vmalerts `-datasource.lookback`
command-line flag to add a time shift for evaluations. Or extend `[duration]` to tolerate the delay.
For example, `max_over_time(errors_total[10m]) > 0` will be active even if there is no data in datasource for last `9m`.
* Extend `[duration]` in expr to help tolerate the delay. For example, `max_over_time(errors_total[10m]) > 0` will be active even if there is no data in datasource for last `9m`.
* If [time series resolution](https://docs.victoriametrics.com/keyConcepts.html#time-series-resolution)
in datasource is inconsistent or `>=5min` - try changing vmalerts `-datasource.queryStep` command-line flag to specify
how far search query can lookback for the recent datapoint. The recommendation is to have the step
@@ -787,9 +819,12 @@ at least two times bigger than the resolution.
> Please note, data delay is inevitable in distributed systems. And it is better to account for it instead of ignoring.
By default, recently written samples to VictoriaMetrics aren't visible for queries for up to 30s
(see `-search.latencyOffset` command-line flag at vmselect). Such delay is needed to eliminate risk of incomplete
data on the moment of querying, since metrics collectors won't be able to deliver the data in time.
By default, recently written samples to VictoriaMetrics [aren't visible for queries](https://docs.victoriametrics.com/keyConcepts.html#query-latency)
for up to 30s (see `-search.latencyOffset` command-line flag at vmselect or VictoriaMetrics single-node). Such delay is needed to eliminate risk of
incomplete data on the moment of querying, due to chance that metrics collectors won't be able to deliver that data in time.
To compensate the latency in timestamps for produced evaluation results, `-rule.evalDelay` is also set to `30s` by default.
If you expect data to be delayed for longer intervals (it gets buffered, queued, or just network is slow sometimes)
- consider increasing the `-rule.evalDelay` value accordingly.
### Alerts state
@@ -813,7 +848,8 @@ and check the `Last updates` section:
Rows in the section represent ordered rule evaluations and their results. The column `curl` contains an example of
HTTP request sent by vmalert to the `-datasource.url` during evaluation. If specific state shows that there were
no samples returned and curl command returns data - then it is very likely there was no data in datasource on the
moment when rule was evaluated.
moment when rule was evaluated. Sensitive info is stripped from the `curl` examples - see [security](#security) section
for more details.
### Debug mode
@@ -829,6 +865,8 @@ Just set `debug: true` in rule's configuration and vmalert will start printing a
2022-09-15T13:36:56.153Z DEBUG rule "TestGroup":"Conns" (2601299393013563564) at 2022-09-15T15:36:56+02:00: alert 10705778000901301787 {alertgroup="TestGroup",alertname="Conns",cluster="east-1",instance="localhost:8429",replica="a"} PENDING => FIRING: 1m0s since becoming active at 2022-09-15 15:35:56.126006 +0200 CEST m=+39.384575417
```
Sensitive info is stripped from the `curl` examples - see [security](#security) section for more details.
### Never-firing alerts
vmalert can detect if alert's expression doesn't match any time series in runtime
@@ -873,6 +911,20 @@ The same issue can be caused by collision of configured `labels` on [Group](#gro
To fix it one should avoid collisions by carefully picking label overrides in configuration.
## Security
See general recommendations regarding security [here](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#security).
vmalert [web UI](#web) exposes configuration details such as list of [Groups](#groups), active alerts,
[alerts state](#alerts-state), [notifiers](#notifier-configuration-file). Notifier addresses (sanitized) are attached
as labels to metrics `vmalert_alerts_sent_.*` on `http://<vmalert>/metrics` page. Consider limiting user's access
to the web UI or `/metrics` page if this information is sensitive.
[Alerts state](#alerts-state) page or [debug mode](#debug-mode) could emit additional information about configured
datasource URL, GET params and headers. Sensitive information such as passwords or auth tokens is stripped by default.
To disable stripping of such info pass `-datasource.showURL` cmd-line flag to vmalert.
## Profiling
`vmalert` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
@@ -913,7 +965,7 @@ The shortlist of configuration flags is the following:
{% raw %}
```console
-clusterMode
If clusterMode is enabled, then vmalert automatically adds the tenant specified in config groups to -datasource.url, -remoteWrite.url and -remoteRead.url. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
If clusterMode is enabled, then vmalert automatically adds the tenant specified in config groups to -datasource.url, -remoteWrite.url and -remoteRead.url. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-configCheckInterval duration
Interval for checking for changes in '-rule' or '-notifier.config' files. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes.
-datasource.appendTypePrefix
@@ -935,7 +987,7 @@ The shortlist of configuration flags is the following:
-datasource.headers string
Optional HTTP extraHeaders to send with each request to the corresponding -datasource.url. For example, -datasource.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -datasource.url. Multiple headers must be delimited by '^^': -datasource.headers='header1:value1^^header2:value2'
-datasource.lookback duration
Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
Will be deprecated soon, please adjust "-search.latencyOffset" at datasource side or specify "latency_offset" in rule group's params. Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
-datasource.maxIdleConnections int
Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
-datasource.oauth2.clientID string
@@ -951,11 +1003,11 @@ The shortlist of configuration flags is the following:
-datasource.queryStep duration
How far a value can fallback to when evaluating queries. For example, if -datasource.queryStep=15s then param "step" with value "15s" will be added to every query. If set to 0, rule's evaluation interval will be used instead. (default 5m0s)
-datasource.queryTimeAlignment
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
Deprecated: please use "eval_alignment" in rule group instead. Whether to align "time" parameter with evaluation interval. Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
-datasource.roundDigits int
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
-datasource.showURL
Whether to show -datasource.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
Whether to avoid stripping sensitive information such as auth headers or passwords from URLs in log messages or UI and exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
-datasource.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
-datasource.tlsCertFile string
@@ -969,9 +1021,9 @@ The shortlist of configuration flags is the following:
-datasource.url string
Datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect URL. Required parameter. E.g. http://127.0.0.1:8428 . See also -remoteRead.disablePathAppend and -datasource.showURL
-defaultTenant.graphite string
Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy .This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy .This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-defaultTenant.prometheus string
Default tenant for Prometheus alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Default tenant for Prometheus alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-disableAlertgroupLabel
Whether to disable adding group's Name as label to generated alerts and time series.
-dryRun
@@ -983,7 +1035,7 @@ The shortlist of configuration flags is the following:
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-evaluationInterval duration
How often to evaluate the rules (default 1m0s)
-external.alert.source string
@@ -993,6 +1045,8 @@ The shortlist of configuration flags is the following:
Supports an array of values separated by comma or specified via multiple flags.
-external.url string
External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.
-filestream.disableFadvise
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
@@ -1001,6 +1055,12 @@ The shortlist of configuration flags is the following:
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.header.csp string
Value for 'Content-Security-Policy' header
-http.header.frameOptions string
Value for 'X-Frame-Options' header
-http.header.hsts string
Value for 'Strict-Transport-Security' header
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -1023,6 +1083,12 @@ The shortlist of configuration flags is the following:
Whether to disable caches for interned strings. This may reduce memory usage at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringCacheExpireDuration and -internStringMaxLen
-internStringMaxLen int
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
-license string
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
-license.forceOffline
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
-licenseFile string
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
@@ -1061,6 +1127,8 @@ The shortlist of configuration flags is the following:
-notifier.bearerTokenFile array
Optional path to bearer token file for -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.blackhole -notifier.url
Whether to blackhole alerting notifications. Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (eg. alertmanager). -notifier.url, `-notifier.config` and `-notifier.blackhole` are mutually exclusive.
-notifier.config string
Path to configuration file for notifiers
-notifier.oauth2.clientID array
@@ -1078,6 +1146,8 @@ The shortlist of configuration flags is the following:
-notifier.oauth2.tokenUrl array
Optional OAuth2 tokenURL to use for -notifier.url. If multiple args are set, then they are applied independently for the corresponding -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.showURL
Whether to avoid stripping sensitive information such as passwords from URL in log messages or UI for -notifier.url. It is hidden by default, since it can contain sensitive info such as auth key
-notifier.suppressDuplicateTargetErrors
Whether to suppress 'duplicate target' errors during discovery
-notifier.tlsCAFile array
@@ -1098,8 +1168,6 @@ The shortlist of configuration flags is the following:
-notifier.url array
Prometheus Alertmanager URL, e.g. http://127.0.0.1:9093. List all Alertmanager URLs if it runs in the cluster mode to ensure high availability.
Supports an array of values separated by comma or specified via multiple flags.
-notifier.blackhole bool
Whether to blackhole alerting notifications. Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (eg. alertmanager). `-notifier.url`, `-notifier.config` and `-notifier.blackhole` are mutually exclusive.
-pprofAuthKey string
Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings
-promscrape.consul.waitTime duration
@@ -1173,7 +1241,7 @@ The shortlist of configuration flags is the following:
-remoteWrite.bearerTokenFile string
Optional path to bearer token file to use for -remoteWrite.url.
-remoteWrite.concurrency int
Defines number of writers for concurrent writing into remote querier (default 1)
Defines number of writers for concurrent writing into remote write endpoint (default 1)
-remoteWrite.disablePathAppend
Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.
-remoteWrite.flushInterval duration
@@ -1243,13 +1311,14 @@ The shortlist of configuration flags is the following:
See https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage
Supports an array of values separated by comma or specified via multiple flags.
-rule.evalDelay time
Adjustment of the time parameter for rule evaluation requests to compensate intentional data delay from the datasource.Normally, should be equal to `-search.latencyOffset` (cmd-line flag configured for VictoriaMetrics single-node or vmselect). (default 30s)
-rule.maxResolveDuration duration
Limits the maximum duration for automatic alert expiration, which by default is 4 times evaluationInterval of the parent group.
Limits the maxiMum duration for automatic alert expiration, which by default is 4 times evaluationInterval of the parent group
-rule.resendDelay duration
Minimum amount of time to wait before resending an alert to notifier
MiniMum amount of time to wait before resending an alert to notifier
-rule.templates array
Path or glob pattern to location with go template definitions
for rules annotations templating. Flag can be specified multiple times.
Path or glob pattern to location with go template definitions for rules annotations templating. Flag can be specified multiple times.
Examples:
-rule.templates="/path/to/file". Path to a single file with go templates
-rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder,
@@ -1263,22 +1332,18 @@ The shortlist of configuration flags is the following:
Whether to validate rules expressions via MetricsQL engine (default true)
-rule.validateTemplates
Whether to validate annotation and label templates (default true)
-s2a_enable_appengine_dialer
If true, opportunistically use AppEngine-specific dialer to call S2A.
-s2a_timeout duration
Timeout enforced on the connection to the S2A service for handshake. (default 3s)
-s3.configFilePath string
Path to file with S3 configs. Configs are loaded from default location if not set.
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-s3.configProfile string
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-s3.credsFilePath string
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-s3.customEndpoint string
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
-s3.forcePathStyle
Prefixing endpoint with bucket name when set false, true by default. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default true)
Prefixing endpoint with bucket name when set false, true by default. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default true)
-tls
Whether to enable TLS for incoming HTTP requests at -httpListenAddr (aka https). -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
@@ -1342,8 +1407,11 @@ For example:
```yaml
static_configs:
- targets:
# support using full url
- 'http://alertmanager:9093/test/api/v2/alerts'
- 'https://alertmanager:9093/api/v2/alerts'
# the following target with only host:port will be used as <scheme>://localhost:9093/<path_prefix>/api/v2/alerts
- localhost:9093
- localhost:9095
consul_sd_configs:
- server: localhost:8500
@@ -1468,7 +1536,7 @@ software. Please keep simplicity as the main priority.
## How to build from sources
It is recommended using
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
* `vmalert` is located in `vmutils-*` archives there.
@@ -1494,7 +1562,7 @@ spec:
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
1. Run `make vmalert` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmalert` binary and puts it into the `bin` folder.
@@ -1510,7 +1578,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
1. Run `make vmalert-linux-arm` or `make vmalert-linux-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmalert-linux-arm` or `vmalert-linux-arm64` binary respectively and puts it into the `bin` folder.

View File

@@ -19,10 +19,14 @@ import (
// Group contains list of Rules grouped into
// entity with one name and evaluation interval
type Group struct {
Type Type `yaml:"type,omitempty"`
File string
Name string `yaml:"name"`
Interval *promutils.Duration `yaml:"interval,omitempty"`
Type Type `yaml:"type,omitempty"`
File string
Name string `yaml:"name"`
Interval *promutils.Duration `yaml:"interval,omitempty"`
EvalOffset *promutils.Duration `yaml:"eval_offset,omitempty"`
// EvalDelay will adjust the `time` parameter of rule evaluation requests to compensate intentional query delay from datasource.
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155
EvalDelay *promutils.Duration `yaml:"eval_delay,omitempty"`
Limit int `yaml:"limit,omitempty"`
Rules []Rule `yaml:"rules"`
Concurrency int `yaml:"concurrency"`
@@ -38,6 +42,8 @@ type Group struct {
Headers []Header `yaml:"headers,omitempty"`
// NotifierHeaders contains optional HTTP headers sent to notifiers for generated notifications
NotifierHeaders []Header `yaml:"notifier_headers,omitempty"`
// EvalAlignment will make the timestamp of group query requests be aligned with interval
EvalAlignment *bool `yaml:"eval_alignment,omitempty"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
@@ -63,11 +69,27 @@ func (g *Group) UnmarshalYAML(unmarshal func(interface{}) error) error {
return nil
}
// Validate check for internal Group or Rule configuration errors
// Validate checks configuration errors for group and internal rules
func (g *Group) Validate(validateTplFn ValidateTplFn, validateExpressions bool) error {
if g.Name == "" {
return fmt.Errorf("group name must be set")
}
if g.Interval.Duration() < 0 {
return fmt.Errorf("interval shouldn't be lower than 0")
}
if g.EvalOffset.Duration() < 0 {
return fmt.Errorf("eval_offset shouldn't be lower than 0")
}
// if `eval_offset` is set, interval won't use global evaluationInterval flag and must bigger than offset.
if g.EvalOffset.Duration() > g.Interval.Duration() {
return fmt.Errorf("eval_offset should be smaller than interval; now eval_offset: %v, interval: %v", g.EvalOffset.Duration(), g.Interval.Duration())
}
if g.Limit < 0 {
return fmt.Errorf("invalid limit %d, shouldn't be less than 0", g.Limit)
}
if g.Concurrency < 0 {
return fmt.Errorf("invalid concurrency %d, shouldn't be less than 0", g.Concurrency)
}
uniqueRules := map[uint64]struct{}{}
for _, r := range g.Rules {
@@ -76,26 +98,26 @@ func (g *Group) Validate(validateTplFn ValidateTplFn, validateExpressions bool)
ruleName = r.Alert
}
if _, ok := uniqueRules[r.ID]; ok {
return fmt.Errorf("%q is a duplicate within the group %q", r.String(), g.Name)
return fmt.Errorf("%q is a duplicate in group", r.String())
}
uniqueRules[r.ID] = struct{}{}
if err := r.Validate(); err != nil {
return fmt.Errorf("invalid rule %q.%q: %w", g.Name, ruleName, err)
return fmt.Errorf("invalid rule %q: %w", ruleName, err)
}
if validateExpressions {
// its needed only for tests.
// because correct types must be inherited after unmarshalling.
exprValidator := g.Type.ValidateExpr
if err := exprValidator(r.Expr); err != nil {
return fmt.Errorf("invalid expression for rule %q.%q: %w", g.Name, ruleName, err)
return fmt.Errorf("invalid expression for rule %q: %w", ruleName, err)
}
}
if validateTplFn != nil {
if err := validateTplFn(r.Annotations); err != nil {
return fmt.Errorf("invalid annotations for rule %q.%q: %w", g.Name, ruleName, err)
return fmt.Errorf("invalid annotations for rule %q: %w", ruleName, err)
}
if err := validateTplFn(r.Labels); err != nil {
return fmt.Errorf("invalid labels for rule %q.%q: %w", g.Name, ruleName, err)
return fmt.Errorf("invalid labels for rule %q: %w", ruleName, err)
}
}
}
@@ -214,7 +236,7 @@ func ParseSilent(pathPatterns []string, validateTplFn ValidateTplFn, validateExp
files, err := readFromFS(pathPatterns)
if err != nil {
return nil, fmt.Errorf("failed to read from the config: %s", err)
return nil, fmt.Errorf("failed to read from the config: %w", err)
}
return parse(files, validateTplFn, validateExpressions)
}
@@ -223,11 +245,11 @@ func ParseSilent(pathPatterns []string, validateTplFn ValidateTplFn, validateExp
func Parse(pathPatterns []string, validateTplFn ValidateTplFn, validateExpressions bool) ([]Group, error) {
files, err := readFromFS(pathPatterns)
if err != nil {
return nil, fmt.Errorf("failed to read from the config: %s", err)
return nil, fmt.Errorf("failed to read from the config: %w", err)
}
groups, err := parse(files, validateTplFn, validateExpressions)
if err != nil {
return nil, fmt.Errorf("failed to parse %s: %s", pathPatterns, err)
return nil, fmt.Errorf("failed to parse %s: %w", pathPatterns, err)
}
if len(groups) < 1 {
cLogger.Warnf("no groups found in %s", strings.Join(pathPatterns, ";"))

View File

@@ -68,6 +68,10 @@ func TestParseBad(t *testing.T) {
path []string
expErr string
}{
{
[]string{"testdata/rules/rules_interval_bad.rules"},
"eval_offset should be smaller than interval",
},
{
[]string{"testdata/rules/rules0-bad.rules"},
"unexpected token",
@@ -102,7 +106,7 @@ func TestParseBad(t *testing.T) {
},
{
[]string{"http://unreachable-url"},
"failed to read",
"failed to",
},
}
for _, tc := range testCases {
@@ -141,6 +145,35 @@ func TestGroup_Validate(t *testing.T) {
group: &Group{},
expErr: "group name must be set",
},
{
group: &Group{
Name: "negative interval",
Interval: promutils.NewDuration(-1),
},
expErr: "interval shouldn't be lower than 0",
},
{
group: &Group{
Name: "wrong eval_offset",
Interval: promutils.NewDuration(time.Minute),
EvalOffset: promutils.NewDuration(2 * time.Minute),
},
expErr: "eval_offset should be smaller than interval",
},
{
group: &Group{
Name: "wrong limit",
Limit: -1,
},
expErr: "invalid limit",
},
{
group: &Group{
Name: "wrong concurrency",
Concurrency: -1,
},
expErr: "invalid concurrency",
},
{
group: &Group{
Name: "test",

View File

@@ -49,7 +49,7 @@ func (fs *FS) Read(files []string) (map[string][]byte, error) {
path, resp.StatusCode, http.StatusOK, data)
}
if err != nil {
return nil, fmt.Errorf("cannot read %q: %s", path, err)
return nil, fmt.Errorf("cannot read %q: %w", path, err)
}
result[path] = data
}

View File

@@ -15,6 +15,7 @@ groups:
interval: 2s
concurrency: 2
type: prometheus
eval_delay: 30s
rules:
- alert: Conns
expr: sum(vm_tcplistener_conns) by (instance) > 1

View File

@@ -0,0 +1,13 @@
groups:
- name: groupTest
## default interval is 1min, eval_offset shouldn't be greater than interval
eval_offset: 2m
rules:
- alert: VMRows
for: 2s
expr: sum(rate(vm_http_request_errors_total[2s])) > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"

View File

@@ -0,0 +1,131 @@
package datasource
import (
"context"
"net/http"
"sync"
"time"
)
// FakeQuerier is a mock querier that return predefined results and error message
type FakeQuerier struct {
sync.Mutex
metrics []Metric
err error
}
// SetErr sets query error message
func (fq *FakeQuerier) SetErr(err error) {
fq.Lock()
fq.err = err
fq.Unlock()
}
// Reset reset querier's error message and results
func (fq *FakeQuerier) Reset() {
fq.Lock()
fq.err = nil
fq.metrics = fq.metrics[:0]
fq.Unlock()
}
// Add appends metrics to querier result metrics
func (fq *FakeQuerier) Add(metrics ...Metric) {
fq.Lock()
fq.metrics = append(fq.metrics, metrics...)
fq.Unlock()
}
// BuildWithParams return FakeQuerier itself
func (fq *FakeQuerier) BuildWithParams(_ QuerierParams) Querier {
return fq
}
// QueryRange performs query
func (fq *FakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
req, _, err := fq.Query(ctx, q, time.Now())
return req, err
}
// Query returns metrics restored in querier
func (fq *FakeQuerier) Query(_ context.Context, _ string, _ time.Time) (Result, *http.Request, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {
return Result{}, nil, fq.err
}
cp := make([]Metric, len(fq.metrics))
copy(cp, fq.metrics)
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
return Result{Data: cp}, req, nil
}
// FakeQuerierWithRegistry can store different results for different query expr
type FakeQuerierWithRegistry struct {
sync.Mutex
registry map[string][]Metric
}
// Set stores query result for given key
func (fqr *FakeQuerierWithRegistry) Set(key string, metrics ...Metric) {
fqr.Lock()
if fqr.registry == nil {
fqr.registry = make(map[string][]Metric)
}
fqr.registry[key] = metrics
fqr.Unlock()
}
// Reset clean querier's results registry
func (fqr *FakeQuerierWithRegistry) Reset() {
fqr.Lock()
fqr.registry = nil
fqr.Unlock()
}
// BuildWithParams returns itself
func (fqr *FakeQuerierWithRegistry) BuildWithParams(_ QuerierParams) Querier {
return fqr
}
// QueryRange performs query
func (fqr *FakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
req, _, err := fqr.Query(ctx, q, time.Now())
return req, err
}
// Query returns metrics restored in querier registry
func (fqr *FakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (Result, *http.Request, error) {
fqr.Lock()
defer fqr.Unlock()
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
metrics, ok := fqr.registry[expr]
if !ok {
return Result{}, req, nil
}
cp := make([]Metric, len(metrics))
copy(cp, metrics)
return Result{Data: cp}, req, nil
}
// FakeQuerierWithDelay mock querier with given delay duration
type FakeQuerierWithDelay struct {
FakeQuerier
Delay time.Duration
}
// Query returns query result after delay duration
func (fqd *FakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (Result, *http.Request, error) {
timer := time.NewTimer(fqd.Delay)
select {
case <-ctx.Done():
case <-timer.C:
}
return fqd.FakeQuerier.Query(ctx, expr, ts)
}
// BuildWithParams returns itself
func (fqd *FakeQuerierWithDelay) BuildWithParams(_ QuerierParams) Querier {
return fqd
}

View File

@@ -10,13 +10,14 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
var (
addr = flag.String("datasource.url", "", "Datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect URL. Required parameter. "+
"E.g. http://127.0.0.1:8428 . See also -remoteRead.disablePathAppend and -datasource.showURL")
appendTypePrefix = flag.Bool("datasource.appendTypePrefix", false, "Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.")
showDatasourceURL = flag.Bool("datasource.showURL", false, "Whether to show -datasource.url in the exported metrics. "+
showDatasourceURL = flag.Bool("datasource.showURL", false, "Whether to avoid stripping sensitive information such as auth headers or passwords from URLs in log messages or UI and exported metrics. "+
"It is hidden by default, since it can contain sensitive info such as auth key")
headers = flag.String("datasource.headers", "", "Optional HTTP extraHeaders to send with each request to the corresponding -datasource.url. "+
@@ -42,12 +43,16 @@ var (
oauth2TokenURL = flag.String("datasource.oauth2.tokenUrl", "", "Optional OAuth2 tokenURL to use for -datasource.url.")
oauth2Scopes = flag.String("datasource.oauth2.scopes", "", "Optional OAuth2 scopes to use for -datasource.url. Scopes must be delimited by ';'")
lookBack = flag.Duration("datasource.lookback", 0, `Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.`)
lookBack = flag.Duration("datasource.lookback", 0, `Will be deprecated soon, please adjust "-search.latencyOffset" at datasource side `+
`or specify "latency_offset" in rule group's params. Lookback defines how far into the past to look when evaluating queries. `+
`For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.`)
queryStep = flag.Duration("datasource.queryStep", 5*time.Minute, "How far a value can fallback to when evaluating queries. "+
"For example, if -datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query. "+
"If set to 0, rule's evaluation interval will be used instead.")
queryTimeAlignment = flag.Bool("datasource.queryTimeAlignment", true, `Whether to align "time" parameter with evaluation interval.`+
"Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
queryTimeAlignment = flag.Bool("datasource.queryTimeAlignment", true, `Deprecated: please use "eval_alignment" in rule group instead. `+
`Whether to align "time" parameter with evaluation interval. `+
"Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. "+
"See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
disableKeepAlive = flag.Bool("datasource.disableKeepAlive", false, `Whether to disable long-lived connections to the datasource. `+
`If true, disables HTTP keep-alives and will only use the connection to the server for a single HTTP request.`)
@@ -62,6 +67,11 @@ func InitSecretFlags() {
}
}
// ShowDatasourceURL whether to show -datasource.url with sensitive information
func ShowDatasourceURL() bool {
return *showDatasourceURL
}
// Param represents an HTTP GET param
type Param struct {
Key, Value string
@@ -74,6 +84,12 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
if *addr == "" {
return nil, fmt.Errorf("datasource.url is empty")
}
if !*queryTimeAlignment {
logger.Warnf("flag `-datasource.queryTimeAlignment` is deprecated and will be removed in next releases. Please use `eval_alignment` in rule group instead.")
}
if *lookBack != 0 {
logger.Warnf("flag `-datasource.lookback` will be deprecated soon. Please use `-rule.evalDelay` command-line flag instead. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155 for details.")
}
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
if err != nil {
@@ -100,6 +116,10 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
if err != nil {
return nil, fmt.Errorf("failed to configure auth: %w", err)
}
_, err = authCfg.GetAuthHeader()
if err != nil {
return nil, fmt.Errorf("failed to set request auth header to datasource %q: %w", *addr, err)
}
return &VMStorage{
c: &http.Client{Transport: tr},

View File

@@ -37,11 +37,14 @@ type VMStorage struct {
appendTypePrefix bool
lookBack time.Duration
queryStep time.Duration
dataSourceType datasourceType
dataSourceType datasourceType
// evaluationInterval will help setting request's `step` param.
evaluationInterval time.Duration
extraParams url.Values
extraHeaders []keyValue
// extraParams contains params to be attached to each HTTP request
extraParams url.Values
// extraHeaders are headers to be attached to each HTTP request
extraHeaders []keyValue
// whether to print additional log messages
// for each sent request
@@ -91,8 +94,15 @@ func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage {
s.extraParams = url.Values{}
}
for k, vl := range params.QueryParams {
for _, v := range vl { // custom query params are prior to default ones
s.extraParams.Set(k, v)
// custom query params are prior to default ones
if s.extraParams.Has(k) {
s.extraParams.Del(k)
}
for _, v := range vl {
// don't use .Set() instead of Del/Add since it is allowed
// for GET params to be duplicated
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4908
s.extraParams.Add(k, v)
}
}
}
@@ -127,33 +137,35 @@ func NewVMStorage(baseURL string, authCfg *promauth.Config, lookBack time.Durati
// Query executes the given query and returns parsed response
func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) (Result, *http.Request, error) {
req, err := s.newRequestPOST()
req, err := s.newQueryRequest(query, ts)
if err != nil {
return Result{}, nil, err
}
switch s.dataSourceType {
case "", datasourcePrometheus:
s.setPrometheusInstantReqParams(req, query, ts)
case datasourceGraphite:
s.setGraphiteReqParams(req, query, ts)
default:
return Result{}, nil, fmt.Errorf("engine not found: %q", s.dataSourceType)
}
resp, err := s.do(ctx, req)
if err != nil {
return Result{}, req, err
if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
// Return unexpected error to the caller.
return Result{}, nil, err
}
// Something in the middle between client and datasource might be closing
// the connection. So we do a one more attempt in hope request will succeed.
req, err = s.newQueryRequest(query, ts)
if err != nil {
return Result{}, nil, fmt.Errorf("second attempt: %w", err)
}
resp, err = s.do(ctx, req)
if err != nil {
return Result{}, nil, fmt.Errorf("second attempt: %w", err)
}
}
defer func() {
_ = resp.Body.Close()
}()
// Process the received response.
parseFn := parsePrometheusResponse
if s.dataSourceType != datasourcePrometheus {
parseFn = parseGraphiteResponse
}
result, err := parseFn(req, resp)
_ = resp.Body.Close()
return result, req, err
}
@@ -164,56 +176,96 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim
if s.dataSourceType != datasourcePrometheus {
return res, fmt.Errorf("%q is not supported for QueryRange", s.dataSourceType)
}
req, err := s.newRequestPOST()
if err != nil {
return res, err
}
if start.IsZero() {
return res, fmt.Errorf("start param is missing")
}
if end.IsZero() {
return res, fmt.Errorf("end param is missing")
}
s.setPrometheusRangeReqParams(req, query, start, end)
resp, err := s.do(ctx, req)
req, err := s.newQueryRangeRequest(query, start, end)
if err != nil {
return res, err
}
defer func() {
_ = resp.Body.Close()
}()
return parsePrometheusResponse(req, resp)
resp, err := s.do(ctx, req)
if err != nil {
if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
// Return unexpected error to the caller.
return res, err
}
// Something in the middle between client and datasource might be closing
// the connection. So we do a one more attempt in hope request will succeed.
req, err = s.newQueryRangeRequest(query, start, end)
if err != nil {
return res, fmt.Errorf("second attempt: %w", err)
}
resp, err = s.do(ctx, req)
if err != nil {
return res, fmt.Errorf("second attempt: %w", err)
}
}
// Process the received response.
res, err = parsePrometheusResponse(req, resp)
_ = resp.Body.Close()
return res, err
}
func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) {
ru := req.URL.Redacted()
if *showDatasourceURL {
ru = req.URL.String()
}
if s.debug {
logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, req.URL.RawQuery)
logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, ru)
}
resp, err := s.c.Do(req.WithContext(ctx))
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
// something in the middle between client and datasource might be closing
// the connection. So we do a one more attempt in hope request will succeed.
resp, err = s.c.Do(req.WithContext(ctx))
}
if err != nil {
return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err)
return nil, fmt.Errorf("error getting response from %s: %w", ru, err)
}
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
_ = resp.Body.Close()
return nil, fmt.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL.Redacted(), body)
return nil, fmt.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, ru, body)
}
return resp, nil
}
func (s *VMStorage) newRequestPOST() (*http.Request, error) {
func (s *VMStorage) newQueryRangeRequest(query string, start, end time.Time) (*http.Request, error) {
req, err := s.newRequest()
if err != nil {
return nil, fmt.Errorf("cannot create query_range request to datasource %q: %w", s.datasourceURL, err)
}
s.setPrometheusRangeReqParams(req, query, start, end)
return req, nil
}
func (s *VMStorage) newQueryRequest(query string, ts time.Time) (*http.Request, error) {
req, err := s.newRequest()
if err != nil {
return nil, fmt.Errorf("cannot create query request to datasource %q: %w", s.datasourceURL, err)
}
switch s.dataSourceType {
case "", datasourcePrometheus:
s.setPrometheusInstantReqParams(req, query, ts)
case datasourceGraphite:
s.setGraphiteReqParams(req, query, ts)
default:
logger.Panicf("BUG: engine not found: %q", s.dataSourceType)
}
return req, nil
}
func (s *VMStorage) newRequest() (*http.Request, error) {
req, err := http.NewRequest(http.MethodPost, s.datasourceURL, nil)
if err != nil {
return nil, err
logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", s.datasourceURL, err)
}
req.Header.Set("Content-Type", "application/json")
if s.authCfg != nil {
s.authCfg.SetHeaders(req, true)
err = s.authCfg.SetHeaders(req, true)
if err != nil {
return nil, err
}
}
for _, h := range s.extraHeaders {
req.Header.Set(h.key, h.value)

View File

@@ -112,14 +112,14 @@ func parsePrometheusResponse(req *http.Request, resp *http.Response) (res Result
return res, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL.Redacted(), r.ErrorType, r.Error)
}
if r.Status != statusSuccess {
return res, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
return res, fmt.Errorf("unknown status: %s, Expected success or error", r.Status)
}
var parseFn func() ([]Metric, error)
switch r.Data.ResultType {
case rtVector:
var pi promInstant
if err := json.Unmarshal(r.Data.Result, &pi.Result); err != nil {
return res, fmt.Errorf("umarshal err %s; \n %#v", err, string(r.Data.Result))
return res, fmt.Errorf("unmarshal err %w; \n %#v", err, string(r.Data.Result))
}
parseFn = pi.metrics
case rtMatrix:
@@ -164,10 +164,6 @@ func (s *VMStorage) setPrometheusInstantReqParams(r *http.Request, query string,
if s.lookBack > 0 {
timestamp = timestamp.Add(-s.lookBack)
}
if *queryTimeAlignment && s.evaluationInterval > 0 {
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1232
timestamp = timestamp.Truncate(s.evaluationInterval)
}
q.Set("time", timestamp.Format(time.RFC3339))
if !*disableStepParam && s.evaluationInterval > 0 { // set step as evaluationInterval by default
// always convert to seconds to keep compatibility with older

View File

@@ -506,8 +506,7 @@ func TestRequestParams(t *testing.T) {
},
func(t *testing.T, r *http.Request) {
evalInterval := 15 * time.Second
tt := timestamp.Truncate(evalInterval)
exp := url.Values{"query": {query}, "step": {evalInterval.String()}, "time": {tt.Format(time.RFC3339)}}
exp := url.Values{"query": {query}, "step": {evalInterval.String()}, "time": {timestamp.Format(time.RFC3339)}}
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
},
},
@@ -521,7 +520,6 @@ func TestRequestParams(t *testing.T) {
func(t *testing.T, r *http.Request) {
evalInterval := 15 * time.Second
tt := timestamp.Add(-time.Minute)
tt = tt.Truncate(evalInterval)
exp := url.Values{"query": {query}, "step": {evalInterval.String()}, "time": {tt.Format(time.RFC3339)}}
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
},
@@ -549,8 +547,7 @@ func TestRequestParams(t *testing.T) {
},
func(t *testing.T, r *http.Request) {
evalInterval := 3 * time.Hour
tt := timestamp.Truncate(evalInterval)
exp := url.Values{"query": {query}, "step": {fmt.Sprintf("%ds", int(evalInterval.Seconds()))}, "time": {tt.Format(time.RFC3339)}}
exp := url.Values{"query": {query}, "step": {fmt.Sprintf("%ds", int(evalInterval.Seconds()))}, "time": {timestamp.Format(time.RFC3339)}}
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
},
},
@@ -596,6 +593,17 @@ func TestRequestParams(t *testing.T) {
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
},
},
{
"allow duplicates in query params",
false,
storage.Clone().ApplyParams(QuerierParams{
QueryParams: url.Values{"extra_labels": {"env=dev", "foo=bar"}},
}),
func(t *testing.T, r *http.Request) {
exp := url.Values{"query": {query}, "round_digits": {"10"}, "extra_labels": {"env=dev", "foo=bar"}, "time": {timestamp.Format(time.RFC3339)}}
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
},
},
{
"graphite extra params",
false,
@@ -629,9 +637,9 @@ func TestRequestParams(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
req, err := tc.vm.newRequestPOST()
req, err := tc.vm.newRequest()
if err != nil {
t.Fatalf("unexpected error: %s", err)
t.Fatal(err)
}
switch tc.vm.dataSourceType {
case "", datasourcePrometheus:
@@ -727,9 +735,9 @@ func TestHeaders(t *testing.T) {
for _, tt := range testCases {
t.Run(tt.name, func(t *testing.T) {
vm := tt.vmFn()
req, err := vm.newRequestPOST()
req, err := vm.newQueryRequest("foo", time.Now())
if err != nil {
t.Fatalf("unexpected error: %s", err)
t.Fatal(err)
}
tt.checkFn(t, req)
})

View File

@@ -18,6 +18,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
@@ -46,8 +47,8 @@ all files with prefix rule_ in folder dir.
See https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage
`)
ruleTemplatesPath = flagutil.NewArrayString("rule.templates", `Path or glob pattern to location with go template definitions
for rules annotations templating. Flag can be specified multiple times.
ruleTemplatesPath = flagutil.NewArrayString("rule.templates", `Path or glob pattern to location with go template definitions `+
`for rules annotations templating. Flag can be specified multiple times.
Examples:
-rule.templates="/path/to/file". Path to a single file with go templates
-rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder,
@@ -66,11 +67,6 @@ absolute path to all .tpl files in root.
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
"which by default is 4 times evaluationInterval of the parent group.")
resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager `+
@@ -82,12 +78,8 @@ absolute path to all .tpl files in root.
externalLabels = flagutil.NewArrayString("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+
"Pass multiple -label flags in order to add multiple label sets.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup. DEPRECATED - this flag has no effect and will be removed in the next releases.")
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.")
)
@@ -101,6 +93,7 @@ func main() {
remoteread.InitSecretFlags()
remotewrite.InitSecretFlags()
datasource.InitSecretFlags()
notifier.InitSecretFlags()
buildinfo.Init()
logger.Init()
pushmetrics.Init()
@@ -228,7 +221,7 @@ func newManager(ctx context.Context) (*manager, error) {
return nil, fmt.Errorf("failed to init notifier: %w", err)
}
manager := &manager{
groups: make(map[uint64]*Group),
groups: make(map[uint64]*rule.Group),
querierBuilder: q,
notifiers: nts,
labels: labels,
@@ -237,7 +230,9 @@ func newManager(ctx context.Context) (*manager, error) {
if err != nil {
return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
}
manager.rw = rw
if rw != nil {
manager.rw = rw
}
rr, err := remoteread.Init()
if err != nil {

View File

@@ -8,11 +8,19 @@ import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
rule.SkipRandSleepOnGroupStart = true
}
func TestGetExternalURL(t *testing.T) {
expURL := "https://vicotriametrics.com/path"
u, err := getExternalURL(expURL, "", false)
@@ -87,10 +95,10 @@ groups:
)
f, err := os.CreateTemp("", "")
defer os.Remove(f.Name())
if err != nil {
t.Fatal(err)
}
defer func() { _ = os.Remove(f.Name()) }()
writeToFile(t, f.Name(), rules1)
*configCheckInterval = 200 * time.Millisecond
@@ -98,10 +106,10 @@ groups:
ctx, cancel := context.WithCancel(context.Background())
m := &manager{
querierBuilder: &fakeQuerier{},
groups: make(map[uint64]*Group),
querierBuilder: &datasource.FakeQuerier{},
groups: make(map[uint64]*rule.Group),
labels: map[string]string{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
rw: &remotewrite.Client{},
}

View File

@@ -3,14 +3,13 @@ package main
import (
"context"
"fmt"
"net/url"
"sort"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
@@ -19,7 +18,7 @@ type manager struct {
querierBuilder datasource.QuerierBuilder
notifiers func() []notifier.Notifier
rw *remotewrite.Client
rw remotewrite.RWClient
// remote read builder.
rr datasource.QuerierBuilder
@@ -27,28 +26,28 @@ type manager struct {
labels map[string]string
groupsMu sync.RWMutex
groups map[uint64]*Group
groups map[uint64]*rule.Group
}
// RuleAPI generates APIRule object from alert by its ID(hash)
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) {
// ruleAPI generates apiRule object from alert by its ID(hash)
func (m *manager) ruleAPI(gID, rID uint64) (apiRule, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
g, ok := m.groups[gID]
if !ok {
return APIRule{}, fmt.Errorf("can't find group with id %d", gID)
return apiRule{}, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
if rule.ID() == rID {
return rule.ToAPI(), nil
return ruleToAPI(rule), nil
}
}
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
return apiRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
}
// AlertAPI generates APIAlert object from alert by its ID(hash)
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
// alertAPI generates apiAlert object from alert by its ID(hash)
func (m *manager) alertAPI(gID, aID uint64) (*apiAlert, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
@@ -56,12 +55,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
if !ok {
return nil, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule)
for _, r := range g.Rules {
ar, ok := r.(*rule.AlertingRule)
if !ok {
continue
}
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
if apiAlert := alertToAPI(ar, aID); apiAlert != nil {
return apiAlert, nil
}
}
@@ -82,15 +81,15 @@ func (m *manager) close() {
m.wg.Wait()
}
func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error {
func (m *manager) startGroup(ctx context.Context, g *rule.Group, restore bool) error {
m.wg.Add(1)
id := g.ID()
go func() {
defer m.wg.Done()
if restore {
g.start(ctx, m.notifiers, m.rw, m.rr)
g.Start(ctx, m.notifiers, m.rw, m.rr)
} else {
g.start(ctx, m.notifiers, m.rw, nil)
g.Start(ctx, m.notifiers, m.rw, nil)
}
}()
m.groups[id] = g
@@ -99,7 +98,7 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
var rrPresent, arPresent bool
groupsRegistry := make(map[uint64]*Group)
groupsRegistry := make(map[uint64]*rule.Group)
for _, cfg := range groupsCfg {
for _, r := range cfg.Rules {
if rrPresent && arPresent {
@@ -112,7 +111,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
arPresent = true
}
}
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
ng := rule.NewGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
groupsRegistry[ng.ID()] = ng
}
@@ -124,8 +123,8 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
}
type updateItem struct {
old *Group
new *Group
old *rule.Group
new *rule.Group
}
var toUpdate []updateItem
@@ -135,7 +134,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
if !ok {
// old group is not present in new list,
// so must be stopped and deleted
og.close()
og.Close()
delete(m.groups, og.ID())
og = nil
continue
@@ -157,81 +156,13 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
var wg sync.WaitGroup
for _, item := range toUpdate {
wg.Add(1)
go func(old *Group, new *Group) {
old.updateCh <- new
go func(old *rule.Group, new *rule.Group) {
old.UpdateWith(new)
wg.Done()
}(item.old, item.new)
item.old.interruptEval()
item.old.InterruptEval()
}
wg.Wait()
}
return nil
}
func (g *Group) toAPI() APIGroup {
g.mu.RLock()
defer g.mu.RUnlock()
ag := APIGroup{
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.Seconds(),
LastEvaluation: g.LastEvaluation,
Concurrency: g.Concurrency,
Params: urlValuesToStrings(g.Params),
Headers: headersToStrings(g.Headers),
NotifierHeaders: headersToStrings(g.NotifierHeaders),
Labels: g.Labels,
}
ag.Rules = make([]APIRule, 0)
for _, r := range g.Rules {
ag.Rules = append(ag.Rules, r.ToAPI())
}
return ag
}
func urlValuesToStrings(values url.Values) []string {
if len(values) < 1 {
return nil
}
keys := make([]string, 0, len(values))
for k := range values {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
params := values[k]
for _, v := range params {
res = append(res, fmt.Sprintf("%s=%s", k, v))
}
}
return res
}
func headersToStrings(headers map[string]string) []string {
if len(headers) < 1 {
return nil
}
keys := make([]string, 0, len(headers))
for k := range headers {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
v := headers[k]
res = append(res, fmt.Sprintf("%s: %s", k, v))
}
return res
}

View File

@@ -10,8 +10,10 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
)
@@ -26,7 +28,7 @@ func TestMain(m *testing.M) {
// successful cases of
// starting with empty rules folder
func TestManagerEmptyRulesDir(t *testing.T) {
m := &manager{groups: make(map[uint64]*Group)}
m := &manager{groups: make(map[uint64]*rule.Group)}
cfg := loadCfg(t, []string{"foo/bar"}, true, true)
if err := m.update(context.Background(), cfg, false); err != nil {
t.Fatalf("expected to load successfully with empty rules dir; got err instead: %v", err)
@@ -38,9 +40,9 @@ func TestManagerEmptyRulesDir(t *testing.T) {
// Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
}
paths := []string{
"config/testdata/dir/rules0-good.rules",
@@ -91,7 +93,7 @@ func TestManagerUpdate(t *testing.T) {
}()
var (
VMRows = &AlertingRule{
VMRows = &rule.AlertingRule{
Name: "VMRows",
Expr: "vm_rows > 0",
For: 10 * time.Second,
@@ -104,7 +106,7 @@ func TestManagerUpdate(t *testing.T) {
"description": "{{$labels}}",
},
}
Conns = &AlertingRule{
Conns = &rule.AlertingRule{
Name: "Conns",
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
Annotations: map[string]string{
@@ -112,7 +114,7 @@ func TestManagerUpdate(t *testing.T) {
"description": "It is {{ $value }} connections for {{$labels.instance}}",
},
}
ExampleAlertAlwaysFiring = &AlertingRule{
ExampleAlertAlwaysFiring = &rule.AlertingRule{
Name: "ExampleAlertAlwaysFiring",
Expr: "sum by(job) (up == 1)",
}
@@ -122,20 +124,20 @@ func TestManagerUpdate(t *testing.T) {
name string
initPath string
updatePath string
want []*Group
want []*rule.Group
}{
{
name: "update good rules",
initPath: "config/testdata/rules/rules0-good.rules",
updatePath: "config/testdata/dir/rules1-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/dir/rules1-good.rules",
Name: "duplicatedGroupDiffFiles",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{
&AlertingRule{
Rules: []rule.Rule{
&rule.AlertingRule{
Name: "VMRows",
Expr: "vm_rows > 0",
For: 5 * time.Minute,
@@ -153,64 +155,68 @@ func TestManagerUpdate(t *testing.T) {
name: "update good rules from 1 to 2 groups",
initPath: "config/testdata/dir/rules/rules1-good.rules",
updatePath: "config/testdata/rules/rules0-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Rules: []Rule{VMRows},
Interval: defaultEvalInterval,
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Type: config.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{
Name: "TestGroup",
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
}},
},
},
},
},
{
name: "update with one bad rule file",
initPath: "config/testdata/rules/rules0-good.rules",
updatePath: "config/testdata/dir/rules2-bad.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Name: "TestGroup",
Type: config.NewPrometheusType(),
Rules: []Rule{
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
}},
},
},
},
},
{
name: "update empty dir rules from 0 to 2 groups",
initPath: "config/testdata/empty/*",
updatePath: "config/testdata/rules/rules0-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Type: config.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{
Name: "TestGroup",
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
},
@@ -222,9 +228,9 @@ func TestManagerUpdate(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
}
cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
@@ -253,18 +259,44 @@ func TestManagerUpdate(t *testing.T) {
})
}
}
func compareGroups(t *testing.T, a, b *rule.Group) {
t.Helper()
if a.Name != b.Name {
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
}
if a.File != b.File {
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
}
if a.Interval != b.Interval {
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
}
if len(a.Rules) != len(b.Rules) {
t.Fatalf("expected group %s to have %d rules; got: %d",
a.Name, len(a.Rules), len(b.Rules))
}
for i, r := range a.Rules {
got, want := r, b.Rules[i]
if a.ID() != b.ID() {
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
}
if err := rule.CompareRules(t, want, got); err != nil {
t.Fatalf("comparison error: %s", err)
}
}
}
func TestManagerUpdateNegative(t *testing.T) {
testCases := []struct {
notifiers []notifier.Notifier
rw *remotewrite.Client
rw remotewrite.RWClient
cfg config.Group
expErr string
}{
{
nil,
nil,
config.Group{Name: "Recording rule only",
config.Group{
Name: "Recording rule only",
Rules: []config.Rule{
{Record: "record", Expr: "max(up)"},
},
@@ -274,7 +306,8 @@ func TestManagerUpdateNegative(t *testing.T) {
{
nil,
nil,
config.Group{Name: "Alerting rule only",
config.Group{
Name: "Alerting rule only",
Rules: []config.Rule{
{Alert: "alert", Expr: "up > 0"},
},
@@ -282,9 +315,10 @@ func TestManagerUpdateNegative(t *testing.T) {
"contains alerting rules",
},
{
[]notifier.Notifier{&fakeNotifier{}},
[]notifier.Notifier{&notifier.FakeNotifier{}},
nil,
config.Group{Name: "Recording and alerting rules",
config.Group{
Name: "Recording and alerting rules",
Rules: []config.Rule{
{Alert: "alert1", Expr: "up > 0"},
{Alert: "alert2", Expr: "up > 0"},
@@ -296,7 +330,8 @@ func TestManagerUpdateNegative(t *testing.T) {
{
nil,
&remotewrite.Client{},
config.Group{Name: "Recording and alerting rules",
config.Group{
Name: "Recording and alerting rules",
Rules: []config.Rule{
{Record: "record1", Expr: "max(up)"},
{Record: "record2", Expr: "max(up)"},
@@ -310,8 +345,8 @@ func TestManagerUpdateNegative(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.cfg.Name, func(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
rw: tc.rw,
}
if tc.notifiers != nil {
@@ -340,21 +375,3 @@ func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressio
}
return cfg
}
func TestUrlValuesToStrings(t *testing.T) {
mapQueryParams := map[string][]string{
"param1": {"param1"},
"param2": {"anotherparam"},
}
expectedRes := []string{"param1=param1", "param2=anotherparam"}
res := urlValuesToStrings(mapQueryParams)
if len(res) != len(expectedRes) {
t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res))
}
for ind, val := range expectedRes {
if val != res[ind] {
t.Errorf("Expected %v; but got %v", val, res[ind])
}
}
}

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
@@ -17,7 +18,7 @@ import (
// AlertManager represents integration provider with Prometheus alert manager
// https://github.com/prometheus/alertmanager
type AlertManager struct {
addr string
addr *url.URL
argFunc AlertURLGenerator
client *http.Client
timeout time.Duration
@@ -48,7 +49,12 @@ func (am *AlertManager) Close() {
}
// Addr returns address where alerts are sent.
func (am AlertManager) Addr() string { return am.addr }
func (am AlertManager) Addr() string {
if *showNotifierURL {
return am.addr.String()
}
return am.addr.Redacted()
}
// Send an alert or resolve message
func (am *AlertManager) Send(ctx context.Context, alerts []Alert, headers map[string]string) error {
@@ -64,7 +70,7 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[st
b := &bytes.Buffer{}
writeamRequest(b, alerts, am.argFunc, am.relabelConfigs)
req, err := http.NewRequest(http.MethodPost, am.addr, b)
req, err := http.NewRequest(http.MethodPost, am.addr.String(), b)
if err != nil {
return err
}
@@ -82,7 +88,10 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[st
req = req.WithContext(ctx)
if am.authCfg != nil {
am.authCfg.SetHeaders(req, true)
err = am.authCfg.SetHeaders(req, true)
if err != nil {
return err
}
}
resp, err := am.client.Do(req)
if err != nil {
@@ -91,12 +100,16 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[st
defer func() { _ = resp.Body.Close() }()
amURL := am.addr.Redacted()
if *showNotifierURL {
amURL = am.addr.String()
}
if resp.StatusCode != http.StatusOK {
body, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read response from %q: %w", am.addr, err)
return fmt.Errorf("failed to read response from %q: %w", amURL, err)
}
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.addr, string(body))
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, amURL, string(body))
}
return nil
}
@@ -136,8 +149,15 @@ func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, authCfg proma
return nil, fmt.Errorf("failed to configure auth: %w", err)
}
amURL, err := url.Parse(alertManagerURL)
if err != nil {
return nil, fmt.Errorf("provided incorrect notifier url: %w", err)
}
if !*showNotifierURL {
alertManagerURL = amURL.Redacted()
}
return &AlertManager{
addr: alertManagerURL,
addr: amURL,
argFunc: fn,
authCfg: aCfg,
relabelConfigs: relabelCfg,

View File

@@ -3,7 +3,6 @@ package notifier
import (
"crypto/md5"
"fmt"
"gopkg.in/yaml.v2"
"net/url"
"os"
"path"
@@ -11,6 +10,8 @@ import (
"strings"
"time"
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/consul"
@@ -142,26 +143,23 @@ func parseLabels(target string, metaLabels *promutils.Labels, cfg *Config) (stri
if labels.Len() == 0 {
return "", nil, nil
}
schemeRelabeled := labels.Get("__scheme__")
if len(schemeRelabeled) == 0 {
schemeRelabeled = "http"
scheme := labels.Get("__scheme__")
if len(scheme) == 0 {
scheme = "http"
}
addressRelabeled := labels.Get("__address__")
if len(addressRelabeled) == 0 {
alertsPath := labels.Get("__alerts_path__")
if !strings.HasPrefix(alertsPath, "/") {
alertsPath = "/" + alertsPath
}
address := labels.Get("__address__")
if len(address) == 0 {
return "", nil, nil
}
if strings.Contains(addressRelabeled, "/") {
return "", nil, nil
}
addressRelabeled = addMissingPort(schemeRelabeled, addressRelabeled)
alertsPathRelabeled := labels.Get("__alerts_path__")
if !strings.HasPrefix(alertsPathRelabeled, "/") {
alertsPathRelabeled = "/" + alertsPathRelabeled
}
u := fmt.Sprintf("%s://%s%s", schemeRelabeled, addressRelabeled, alertsPathRelabeled)
address = addMissingPort(scheme, address)
u := fmt.Sprintf("%s://%s%s", scheme, address, alertsPath)
if _, err := url.Parse(u); err != nil {
return "", nil, fmt.Errorf("invalid url %q for scheme=%q (%q), target=%q, metrics_path=%q (%q): %w",
u, cfg.Scheme, schemeRelabeled, target, addressRelabeled, alertsPathRelabeled, err)
u, cfg.Scheme, scheme, target, address, alertsPath, err)
}
return u, labels, nil
}
@@ -181,9 +179,24 @@ func addMissingPort(scheme, target string) string {
func mergeLabels(target string, metaLabels *promutils.Labels, cfg *Config) *promutils.Labels {
// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
m := promutils.NewLabels(3 + metaLabels.Len())
m.Add("__address__", target)
m.Add("__scheme__", cfg.Scheme)
m.Add("__alerts_path__", path.Join("/", cfg.PathPrefix, alertManagerPath))
address := target
scheme := cfg.Scheme
alertsPath := path.Join("/", cfg.PathPrefix, alertManagerPath)
// try to extract optional scheme and alertsPath from __address__.
if strings.HasPrefix(address, "http://") {
scheme = "http"
address = address[len("http://"):]
} else if strings.HasPrefix(address, "https://") {
scheme = "https"
address = address[len("https://"):]
}
if n := strings.IndexByte(address, '/'); n >= 0 {
alertsPath = address[n:]
address = address[:n]
}
m.Add("__address__", address)
m.Add("__scheme__", scheme)
m.Add("__alerts_path__", alertsPath)
m.AddFrom(metaLabels)
return m
}

View File

@@ -87,7 +87,7 @@ func (cw *configWatcher) reload(path string) error {
func (cw *configWatcher) add(typeK TargetType, interval time.Duration, labelsFn getLabels) error {
targets, errors := targetsFromLabels(labelsFn, cw.cfg, cw.genFn)
for _, err := range errors {
return fmt.Errorf("failed to init notifier for %q: %s", typeK, err)
return fmt.Errorf("failed to init notifier for %q: %w", typeK, err)
}
cw.setTargets(typeK, targets)
@@ -107,7 +107,7 @@ func (cw *configWatcher) add(typeK TargetType, interval time.Duration, labelsFn
}
updateTargets, errors := targetsFromLabels(labelsFn, cw.cfg, cw.genFn)
for _, err := range errors {
logger.Errorf("failed to init notifier for %q: %s", typeK, err)
logger.Errorf("failed to init notifier for %q: %w", typeK, err)
}
cw.setTargets(typeK, updateTargets)
}
@@ -118,7 +118,7 @@ func (cw *configWatcher) add(typeK TargetType, interval time.Duration, labelsFn
func targetsFromLabels(labelsFn getLabels, cfg *Config, genFn AlertURLGenerator) ([]Target, []error) {
metaLabels, err := labelsFn()
if err != nil {
return nil, []error{fmt.Errorf("failed to get labels: %s", err)}
return nil, []error{fmt.Errorf("failed to get labels: %w", err)}
}
var targets []Target
var errors []error
@@ -167,11 +167,11 @@ func (cw *configWatcher) start() error {
for _, target := range cfg.Targets {
address, labels, err := parseLabels(target, nil, cw.cfg)
if err != nil {
return fmt.Errorf("failed to parse labels for target %q: %s", target, err)
return fmt.Errorf("failed to parse labels for target %q: %w", target, err)
}
notifier, err := NewAlertManager(address, cw.genFn, httpCfg, cw.cfg.parsedAlertRelabelConfigs, cw.cfg.Timeout.Duration())
if err != nil {
return fmt.Errorf("failed to init alertmanager for addr %q: %s", address, err)
return fmt.Errorf("failed to init alertmanager for addr %q: %w", address, err)
}
targets = append(targets, Target{
Notifier: notifier,
@@ -189,14 +189,14 @@ func (cw *configWatcher) start() error {
sdc := &cw.cfg.ConsulSDConfigs[i]
targetLabels, err := sdc.GetLabels(cw.cfg.baseDir)
if err != nil {
return nil, fmt.Errorf("got labels err: %s", err)
return nil, fmt.Errorf("got labels err: %w", err)
}
labels = append(labels, targetLabels...)
}
return labels, nil
})
if err != nil {
return fmt.Errorf("failed to start consulSD discovery: %s", err)
return fmt.Errorf("failed to start consulSD discovery: %w", err)
}
}
@@ -207,14 +207,14 @@ func (cw *configWatcher) start() error {
sdc := &cw.cfg.DNSSDConfigs[i]
targetLabels, err := sdc.GetLabels(cw.cfg.baseDir)
if err != nil {
return nil, fmt.Errorf("got labels err: %s", err)
return nil, fmt.Errorf("got labels err: %w", err)
}
labels = append(labels, targetLabels...)
}
return labels, nil
})
if err != nil {
return fmt.Errorf("failed to start DNSSD discovery: %s", err)
return fmt.Errorf("failed to start DNSSD discovery: %w", err)
}
}
return nil

View File

@@ -14,7 +14,6 @@ import (
func TestConfigWatcherReload(t *testing.T) {
f, err := os.CreateTemp("", "")
defer os.Remove(f.Name())
if err != nil {
t.Fatal(err)
}
@@ -37,7 +36,6 @@ static_configs:
}
f2, err := os.CreateTemp("", "")
defer os.Remove(f2.Name())
if err != nil {
t.Fatal(err)
}
@@ -65,7 +63,6 @@ func TestConfigWatcherStart(t *testing.T) {
defer consulSDServer.Close()
consulSDFile, err := os.CreateTemp("", "")
defer os.Remove(consulSDFile.Name())
if err != nil {
t.Fatal(err)
}
@@ -112,7 +109,6 @@ func TestConfigWatcherReloadConcurrent(t *testing.T) {
defer consulSDServer2.Close()
consulSDFile, err := os.CreateTemp("", "")
defer os.Remove(consulSDFile.Name())
if err != nil {
t.Fatal(err)
}
@@ -129,7 +125,6 @@ consul_sd_configs:
`, consulSDServer1.URL, consulSDServer2.URL))
staticAndConsulSDFile, err := os.CreateTemp("", "")
defer os.Remove(staticAndConsulSDFile.Name())
if err != nil {
t.Fatal(err)
}
@@ -323,3 +318,47 @@ func TestMergeHTTPClientConfigs(t *testing.T) {
t.Fatalf("expected BasicAuth tp be present")
}
}
func TestParseLabels(t *testing.T) {
testCases := []struct {
name string
target string
cfg *Config
expectedAddress string
expectedErr bool
}{
{
"invalid address",
"invalid:*//url",
&Config{},
"",
true,
},
{
"use some default params",
"alertmanager:9093",
&Config{PathPrefix: "test"},
"http://alertmanager:9093/test/api/v2/alerts",
false,
},
{
"use target address",
"https://alertmanager:9093/api/v1/alerts",
&Config{Scheme: "http", PathPrefix: "test"},
"https://alertmanager:9093/api/v1/alerts",
false,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
address, _, err := parseLabels(tc.target, nil, tc.cfg)
if err == nil == tc.expectedErr {
t.Fatalf("unexpected error; got %t; want %t", err != nil, tc.expectedErr)
}
if address != tc.expectedAddress {
t.Fatalf("unexpected address; got %q; want %q", address, tc.expectedAddress)
}
})
}
}

View File

@@ -0,0 +1,59 @@
package notifier
import (
"context"
"fmt"
"sync"
"time"
)
// FakeNotifier is a mock notifier
type FakeNotifier struct {
sync.Mutex
alerts []Alert
// records number of received alerts in total
counter int
}
// Close does nothing
func (*FakeNotifier) Close() {}
// Addr returns ""
func (*FakeNotifier) Addr() string { return "" }
// Send sets alerts and increases counter
func (fn *FakeNotifier) Send(_ context.Context, alerts []Alert, _ map[string]string) error {
fn.Lock()
defer fn.Unlock()
fn.counter += len(alerts)
fn.alerts = alerts
return nil
}
// GetCounter returns received alerts count
func (fn *FakeNotifier) GetCounter() int {
fn.Lock()
defer fn.Unlock()
return fn.counter
}
// GetAlerts returns stored alerts
func (fn *FakeNotifier) GetAlerts() []Alert {
fn.Lock()
defer fn.Unlock()
return fn.alerts
}
// FaultyNotifier is a mock notifier that Send() will return failed response
type FaultyNotifier struct {
FakeNotifier
}
// Send returns failed response
func (fn *FaultyNotifier) Send(ctx context.Context, _ []Alert, _ map[string]string) error {
d, ok := ctx.Deadline()
if ok {
time.Sleep(time.Until(d))
}
return fmt.Errorf("send failed")
}

View File

@@ -19,6 +19,8 @@ var (
addrs = flagutil.NewArrayString("notifier.url", "Prometheus Alertmanager URL, e.g. http://127.0.0.1:9093. "+
"List all Alertmanager URLs if it runs in the cluster mode to ensure high availability.")
showNotifierURL = flag.Bool("notifier.showURL", false, "Whether to avoid stripping sensitive information such as passwords from URL in log messages or UI for -notifier.url. "+
"It is hidden by default, since it can contain sensitive info such as auth key")
blackHole = flag.Bool("notifier.blackhole", false, "Whether to blackhole alerting notifications. "+
"Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (eg. alertmanager). "+
"`-notifier.url`, `-notifier.config` and `-notifier.blackhole` are mutually exclusive.")
@@ -88,7 +90,7 @@ func Init(gen AlertURLGenerator, extLabels map[string]string, extURL string) (fu
externalLabels = extLabels
eu, err := url.Parse(externalURL)
if err != nil {
return nil, fmt.Errorf("failed to parse external URL: %s", err)
return nil, fmt.Errorf("failed to parse external URL: %w", err)
}
templates.UpdateWithFuncs(templates.FuncsWithExternalURL(eu))
@@ -114,7 +116,7 @@ func Init(gen AlertURLGenerator, extLabels map[string]string, extURL string) (fu
if len(*addrs) > 0 {
notifiers, err := notifiersFromFlags(gen)
if err != nil {
return nil, fmt.Errorf("failed to create notifier from flag values: %s", err)
return nil, fmt.Errorf("failed to create notifier from flag values: %w", err)
}
staticNotifiersFn = func() []Notifier {
return notifiers
@@ -124,11 +126,18 @@ func Init(gen AlertURLGenerator, extLabels map[string]string, extURL string) (fu
cw, err = newWatcher(*configPath, gen)
if err != nil {
return nil, fmt.Errorf("failed to init config watcher: %s", err)
return nil, fmt.Errorf("failed to init config watcher: %w", err)
}
return cw.notifiers, nil
}
// InitSecretFlags must be called after flag.Parse and before any logging
func InitSecretFlags() {
if !*showNotifierURL {
flagutil.RegisterSecretFlag("notifier.url")
}
}
func notifiersFromFlags(gen AlertURLGenerator) ([]Notifier, error) {
var notifiers []Notifier
for i, addr := range *addrs {

View File

@@ -5,6 +5,7 @@ static_configs:
- targets:
- localhost:9093
- localhost:9095
- https://localhost:9093/test/api/v2/alerts
basic_auth:
username: foo
password: bar

View File

@@ -0,0 +1,345 @@
package remotewrite
import (
"bytes"
"context"
"errors"
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 1e5
defaultFlushInterval = 5 * time.Second
defaultWriteTimeout = 30 * time.Second
)
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
type Client struct {
addr string
c *http.Client
authCfg *promauth.Config
input chan prompbmarshal.TimeSeries
flushInterval time.Duration
maxBatchSize int
maxQueueSize int
wg sync.WaitGroup
doneCh chan struct{}
}
// Config is config for remote write client.
type Config struct {
// Addr of remote storage
Addr string
AuthCfg *promauth.Config
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
if cfg.Addr == "" {
return nil, fmt.Errorf("config.Addr can't be empty")
}
if cfg.MaxBatchSize == 0 {
cfg.MaxBatchSize = defaultMaxBatchSize
}
if cfg.MaxQueueSize == 0 {
cfg.MaxQueueSize = defaultMaxQueueSize
}
if cfg.FlushInterval == 0 {
cfg.FlushInterval = defaultFlushInterval
}
if cfg.Transport == nil {
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
}
cc := defaultConcurrency
if cfg.Concurrency > 0 {
cc = cfg.Concurrency
}
c := &Client{
c: &http.Client{
Timeout: *sendTimeout,
Transport: cfg.Transport,
},
addr: strings.TrimSuffix(cfg.Addr, "/"),
authCfg: cfg.AuthCfg,
flushInterval: cfg.FlushInterval,
maxBatchSize: cfg.MaxBatchSize,
maxQueueSize: cfg.MaxQueueSize,
doneCh: make(chan struct{}),
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
}
for i := 0; i < cc; i++ {
c.run(ctx)
}
return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns and error if client is stopped or if queue is full.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
rwTotal.Inc()
select {
case <-c.doneCh:
rwErrors.Inc()
droppedRows.Add(len(s.Samples))
droppedBytes.Add(s.Size())
return fmt.Errorf("client is closed")
case c.input <- s:
return nil
default:
rwErrors.Inc()
droppedRows.Add(len(s.Samples))
droppedBytes.Add(s.Size())
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
c.maxQueueSize)
}
}
// Close stops the client and waits for all goroutines
// to exit.
func (c *Client) Close() error {
if c.doneCh == nil {
return fmt.Errorf("client is already closed")
}
close(c.input)
close(c.doneCh)
c.wg.Wait()
return nil
}
func (c *Client) run(ctx context.Context) {
ticker := time.NewTicker(c.flushInterval)
wr := &prompbmarshal.WriteRequest{}
shutdown := func() {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
c.flush(lastCtx, wr)
cancel()
}
c.wg.Add(1)
go func() {
defer c.wg.Done()
defer ticker.Stop()
for {
select {
case <-c.doneCh:
shutdown()
return
case <-ctx.Done():
shutdown()
return
case <-ticker.C:
c.flush(ctx, wr)
case ts, ok := <-c.input:
if !ok {
continue
}
wr.Timeseries = append(wr.Timeseries, ts)
if len(wr.Timeseries) >= c.maxBatchSize {
c.flush(ctx, wr)
}
}
}
}()
}
var (
rwErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
rwTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
return
}
defer prompbmarshal.ResetWriteRequest(wr)
defer bufferFlushDuration.UpdateDuration(time.Now())
data, err := wr.Marshal()
if err != nil {
logger.Errorf("failed to marshal WriteRequest: %s", err)
return
}
b := snappy.Encode(nil, data)
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
defer func() {
sendDuration.Add(time.Since(timeStart).Seconds())
}()
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if errors.Is(err, io.EOF) {
// Something in the middle between client and destination might be closing
// the connection. So we do a one more attempt in hope request will succeed.
err = c.send(ctx, b)
}
if err == nil {
sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b))
return
}
_, isNotRetriable := err.(*nonRetriableError)
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
break L
default:
}
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time.Sleep(retryInterval)
retryInterval *= 2
}
rwErrors.Inc()
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
len(wr.Timeseries))
}
func (c *Client) send(ctx context.Context, data []byte) error {
r := bytes.NewReader(data)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authCfg != nil {
err = c.authCfg.SetHeaders(req, true)
if err != nil {
return &nonRetriableError{
err: err,
}
}
}
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req.WithContext(ctx))
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp.StatusCode / 100 {
case 2:
// respond with HTTP 2xx status code when write is successful.
return nil
case 4:
if resp.StatusCode != http.StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
return &nonRetriableError{
err: fmt.Errorf("unexpected response code %d for %s. Response body %q", resp.StatusCode, req.URL.Redacted(), body),
}
}
fallthrough
default:
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}
}
type nonRetriableError struct {
err error
}
func (e *nonRetriableError) Error() string {
return e.err.Error()
}

View File

@@ -0,0 +1,97 @@
package remotewrite
import (
"bytes"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// DebugClient won't push series periodically, but will write data to remote endpoint
// immediately when Push() is called
type DebugClient struct {
addr string
c *http.Client
wg sync.WaitGroup
}
// NewDebugClient initiates and returns a new DebugClient
func NewDebugClient() (*DebugClient, error) {
if *addr == "" {
return nil, nil
}
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
if err != nil {
return nil, fmt.Errorf("failed to create transport: %w", err)
}
c := &DebugClient{
c: &http.Client{
Timeout: *sendTimeout,
Transport: t,
},
addr: strings.TrimSuffix(*addr, "/"),
}
return c, nil
}
// Push sends the given timeseries to the remote storage.
func (c *DebugClient) Push(s prompbmarshal.TimeSeries) error {
c.wg.Add(1)
defer c.wg.Done()
wr := &prompbmarshal.WriteRequest{Timeseries: []prompbmarshal.TimeSeries{s}}
data, err := wr.Marshal()
if err != nil {
return fmt.Errorf("failed to marshal the given time series: %w", err)
}
return c.send(data)
}
// Close stops the DebugClient
func (c *DebugClient) Close() error {
c.wg.Wait()
return nil
}
func (c *DebugClient) send(data []byte) error {
b := snappy.Encode(nil, data)
r := bytes.NewReader(b)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req)
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode/100 == 2 {
return nil
}
body, _ := io.ReadAll(resp.Body)
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}

View File

@@ -0,0 +1,50 @@
package remotewrite
import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func TestDebugClient_Push(t *testing.T) {
testSrv := newRWServer()
oldAddr := *addr
*addr = testSrv.URL
defer func() {
*addr = oldAddr
}()
client, err := NewDebugClient()
if err != nil {
t.Fatalf("failed to create debug client: %s", err)
}
const rowsN = 100
var sent int
for i := 0; i < rowsN; i++ {
s := prompbmarshal.TimeSeries{
Samples: []prompbmarshal.Sample{{
Value: float64(i),
Timestamp: time.Now().Unix(),
}},
}
err := client.Push(s)
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
if err == nil {
sent++
}
}
if sent == 0 {
t.Fatalf("0 series sent")
}
if err := client.Close(); err != nil {
t.Fatalf("failed to close client: %s", err)
}
got := testSrv.accepted()
if got != sent {
t.Fatalf("expected to have %d series; got %d", sent, got)
}
}

View File

@@ -30,7 +30,7 @@ var (
maxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines max number of timeseries to be flushed at once")
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote write endpoint")
flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")

View File

@@ -1,320 +1,13 @@
package remotewrite
import (
"bytes"
"context"
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
type Client struct {
addr string
c *http.Client
authCfg *promauth.Config
input chan prompbmarshal.TimeSeries
flushInterval time.Duration
maxBatchSize int
maxQueueSize int
wg sync.WaitGroup
doneCh chan struct{}
}
// Config is config for remote write.
type Config struct {
// Addr of remote storage
Addr string
AuthCfg *promauth.Config
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 1e5
defaultFlushInterval = 5 * time.Second
defaultWriteTimeout = 30 * time.Second
)
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
if cfg.Addr == "" {
return nil, fmt.Errorf("config.Addr can't be empty")
}
if cfg.MaxBatchSize == 0 {
cfg.MaxBatchSize = defaultMaxBatchSize
}
if cfg.MaxQueueSize == 0 {
cfg.MaxQueueSize = defaultMaxQueueSize
}
if cfg.FlushInterval == 0 {
cfg.FlushInterval = defaultFlushInterval
}
if cfg.Transport == nil {
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
}
cc := defaultConcurrency
if cfg.Concurrency > 0 {
cc = cfg.Concurrency
}
c := &Client{
c: &http.Client{
Timeout: *sendTimeout,
Transport: cfg.Transport,
},
addr: strings.TrimSuffix(cfg.Addr, "/"),
authCfg: cfg.AuthCfg,
flushInterval: cfg.FlushInterval,
maxBatchSize: cfg.MaxBatchSize,
maxQueueSize: cfg.MaxQueueSize,
doneCh: make(chan struct{}),
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
}
for i := 0; i < cc; i++ {
c.run(ctx)
}
return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns and error if client is stopped or if queue is full.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
select {
case <-c.doneCh:
return fmt.Errorf("client is closed")
case c.input <- s:
return nil
default:
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
c.maxQueueSize)
}
}
// Close stops the client and waits for all goroutines
// to exit.
func (c *Client) Close() error {
if c.doneCh == nil {
return fmt.Errorf("client is already closed")
}
close(c.input)
close(c.doneCh)
c.wg.Wait()
return nil
}
func (c *Client) run(ctx context.Context) {
ticker := time.NewTicker(c.flushInterval)
wr := &prompbmarshal.WriteRequest{}
shutdown := func() {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
c.flush(lastCtx, wr)
cancel()
}
c.wg.Add(1)
go func() {
defer c.wg.Done()
defer ticker.Stop()
for {
select {
case <-c.doneCh:
shutdown()
return
case <-ctx.Done():
shutdown()
return
case <-ticker.C:
c.flush(ctx, wr)
case ts, ok := <-c.input:
if !ok {
continue
}
wr.Timeseries = append(wr.Timeseries, ts)
if len(wr.Timeseries) >= c.maxBatchSize {
c.flush(ctx, wr)
}
}
}
}()
}
var (
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
return
}
defer prompbmarshal.ResetWriteRequest(wr)
defer bufferFlushDuration.UpdateDuration(time.Now())
data, err := wr.Marshal()
if err != nil {
logger.Errorf("failed to marshal WriteRequest: %s", err)
return
}
b := snappy.Encode(nil, data)
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
defer sendDuration.Add(time.Since(timeStart).Seconds())
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b))
return
}
_, isNotRetriable := err.(*nonRetriableError)
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
break L
default:
}
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time.Sleep(retryInterval)
retryInterval *= 2
}
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
len(wr.Timeseries))
}
func (c *Client) send(ctx context.Context, data []byte) error {
r := bytes.NewReader(data)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authCfg != nil {
c.authCfg.SetHeaders(req, true)
}
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req.WithContext(ctx))
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp.StatusCode / 100 {
case 2:
// respond with a HTTP 2xx status code when the write is successful.
return nil
case 4:
if resp.StatusCode != http.StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)}
}
fallthrough
default:
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}
}
type nonRetriableError struct {
err error
}
func (e *nonRetriableError) Error() string {
return e.err.Error()
// RWClient represents an HTTP client for pushing data via remote write protocol
type RWClient interface {
// Push pushes the give time series to remote storage
Push(s prompbmarshal.TimeSeries) error
// Close stops the client. Client can't be reused after Close call.
Close() error
}

View File

@@ -1,19 +1,16 @@
package main
import (
"context"
"flag"
"fmt"
"strings"
"time"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
var (
@@ -33,17 +30,17 @@ var (
"Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode.")
)
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error {
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw remotewrite.RWClient) error {
if *replayMaxDatapoints < 1 {
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
}
tFrom, err := time.Parse(time.RFC3339, *replayFrom)
if err != nil {
return fmt.Errorf("failed to parse %q: %s", *replayFrom, err)
return fmt.Errorf("failed to parse %q: %w", *replayFrom, err)
}
tTo, err := time.Parse(time.RFC3339, *replayTo)
if err != nil {
return fmt.Errorf("failed to parse %q: %s", *replayTo, err)
return fmt.Errorf("failed to parse %q: %w", *replayTo, err)
}
if !tTo.After(tFrom) {
return fmt.Errorf("replay.timeTo must be bigger than replay.timeFrom")
@@ -68,8 +65,8 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
var total int
for _, cfg := range groupsCfg {
ng := newGroup(cfg, qb, *evaluationInterval, labels)
total += ng.replay(tFrom, tTo, rw)
ng := rule.NewGroup(cfg, qb, *evaluationInterval, labels)
total += ng.Replay(tFrom, tTo, rw, *replayMaxDatapoints, *replayRuleRetryAttempts, *replayRulesDelay, *disableProgressBar)
}
logger.Infof("replay finished! Imported %d samples", total)
if rw != nil {
@@ -77,97 +74,3 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
}
return nil
}
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
var total int
step := g.Interval * time.Duration(*replayMaxDatapoints)
ri := rangeIterator{start: start, end: end, step: step}
iterations := int(end.Sub(start)/step) + 1
fmt.Printf("\nGroup %q"+
"\ninterval: \t%v"+
"\nrequests to make: \t%d"+
"\nmax range per request: \t%v\n",
g.Name, g.Interval, iterations, step)
if g.Limit > 0 {
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
g.Limit)
}
for _, rule := range g.Rules {
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
var bar *pb.ProgressBar
if !*disableProgressBar {
bar = pb.StartNew(iterations)
}
ri.reset()
for ri.next() {
n, err := replayRule(rule, ri.s, ri.e, rw)
if err != nil {
logger.Fatalf("rule %q: %s", rule, err)
}
total += n
if bar != nil {
bar.Increment()
}
}
if bar != nil {
bar.Finish()
}
// sleep to let remote storage to flush data on-disk
// so chained rules could be calculated correctly
time.Sleep(*replayRulesDelay)
}
return total
}
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
var err error
var tss []prompbmarshal.TimeSeries
for i := 0; i < *replayRuleRetryAttempts; i++ {
tss, err = rule.ExecRange(context.Background(), start, end)
if err == nil {
break
}
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, rule, err)
time.Sleep(time.Second)
}
if err != nil { // means all attempts failed
return 0, err
}
if len(tss) < 1 {
return 0, nil
}
var n int
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
return n, fmt.Errorf("remote write failure: %s", err)
}
n += len(ts.Samples)
}
return n, nil
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}

View File

@@ -12,7 +12,7 @@ import (
)
type fakeReplayQuerier struct {
fakeQuerier
datasource.FakeQuerier
registry map[string]map[string]struct{}
}
@@ -170,81 +170,3 @@ func TestReplay(t *testing.T) {
})
}
}
func TestRangeIterator(t *testing.T) {
testCases := []struct {
ri rangeIterator
result [][2]time.Time
}{
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 5 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 45 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
step: time.Second,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
var j int
for tc.ri.next() {
if len(tc.result) < j+1 {
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
j, tc.ri.s, tc.ri.e)
}
s, e := tc.ri.s, tc.ri.e
expS, expE := tc.result[j][0], tc.result[j][1]
if s != expS {
t.Fatalf("expected to get start=%v; got %v", expS, s)
}
if e != expE {
t.Fatalf("expected to get end=%v; got %v", expE, e)
}
j++
}
})
}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View File

@@ -1,118 +0,0 @@
package main
import (
"context"
"errors"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// Exec executes the rule with given context at the given timestamp and limit.
// returns an err if number of resulting time series exceeds the limit.
Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
// ExecRange executes the rule on the given time range.
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// UpdateWith performs modification of current Rule
// with fields of the given Rule.
UpdateWith(Rule) error
// ToAPI converts Rule into APIRule
ToAPI() APIRule
// Close performs the shutdown procedures for rule
// such as metrics unregister
Close()
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
type ruleState struct {
sync.RWMutex
entries []ruleStateEntry
cur int
}
type ruleStateEntry struct {
// stores last moment of time rule.Exec was called
time time.Time
// stores the timesteamp with which rule.Exec was called
at time.Time
// stores the duration of the last rule.Exec call
duration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health ruleState
err error
// stores the number of samples returned during
// the last evaluation
samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If seriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
seriesFetched *int
// stores the curl command reflecting the HTTP request used during rule.Exec
curl string
}
func newRuleState(size int) *ruleState {
if size < 1 {
size = 1
}
return &ruleState{
entries: make([]ruleStateEntry, size),
}
}
func (s *ruleState) getLast() ruleStateEntry {
s.RLock()
defer s.RUnlock()
return s.entries[s.cur]
}
func (s *ruleState) size() int {
s.RLock()
defer s.RUnlock()
return len(s.entries)
}
func (s *ruleState) getAll() []ruleStateEntry {
entries := make([]ruleStateEntry, 0)
s.RLock()
defer s.RUnlock()
cur := s.cur
for {
e := s.entries[cur]
if !e.time.IsZero() || !e.at.IsZero() {
entries = append(entries, e)
}
cur--
if cur < 0 {
cur = cap(s.entries) - 1
}
if cur == s.cur {
return entries
}
}
}
func (s *ruleState) add(e ruleStateEntry) {
s.Lock()
defer s.Unlock()
s.cur++
if s.cur > cap(s.entries)-1 {
s.cur = 0
}
s.entries[s.cur] = e
}

View File

@@ -1,11 +1,10 @@
package main
package rule
import (
"context"
"fmt"
"hash/fnv"
"sort"
"strconv"
"strings"
"sync"
"time"
@@ -55,7 +54,8 @@ type alertingRuleMetrics struct {
seriesFetched *utils.Gauge
}
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
// NewAlertingRule creates a new AlertingRule
func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
ar := &AlertingRule{
Type: group.Type,
RuleID: cfg.ID,
@@ -80,13 +80,18 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
metrics: &alertingRuleMetrics{},
}
entrySize := *ruleUpdateEntriesLimit
if cfg.UpdateEntriesLimit != nil {
ar.state = newRuleState(*cfg.UpdateEntriesLimit)
} else {
ar.state = newRuleState(*ruleUpdateEntriesLimit)
entrySize = *cfg.UpdateEntriesLimit
}
if entrySize < 1 {
entrySize = 1
}
ar.state = &ruleState{
entries: make([]StateEntry, entrySize),
}
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
labels := fmt.Sprintf(`alertname=%q, group=%q, file=%q, id="%d"`, ar.Name, group.Name, group.File, ar.ID())
ar.metrics.pending = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
func() float64 {
ar.alertsMu.RLock()
@@ -114,7 +119,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
func() float64 {
e := ar.state.getLast()
if e.err == nil {
if e.Err == nil {
return 0
}
return 1
@@ -122,28 +127,28 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
e := ar.state.getLast()
return float64(e.samples)
return float64(e.Samples)
})
ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels),
func() float64 {
e := ar.state.getLast()
if e.seriesFetched == nil {
if e.SeriesFetched == nil {
// means seriesFetched is unsupported
return -1
}
seriesFetched := float64(*e.seriesFetched)
if seriesFetched == 0 && e.samples > 0 {
seriesFetched := float64(*e.SeriesFetched)
if seriesFetched == 0 && e.Samples > 0 {
// `alert: 0.95` will fetch no series
// but will get one time series in response.
seriesFetched = float64(e.samples)
seriesFetched = float64(e.Samples)
}
return seriesFetched
})
return ar
}
// Close unregisters rule metrics
func (ar *AlertingRule) Close() {
// close unregisters rule metrics
func (ar *AlertingRule) close() {
ar.metrics.active.Unregister()
ar.metrics.pending.Unregister()
ar.metrics.errors.Unregister()
@@ -162,6 +167,27 @@ func (ar *AlertingRule) ID() uint64 {
return ar.RuleID
}
// GetAlerts returns active alerts of rule
func (ar *AlertingRule) GetAlerts() []*notifier.Alert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
var alerts []*notifier.Alert
for _, a := range ar.alerts {
alerts = append(alerts, a)
}
return alerts
}
// GetAlert returns alert if id exists
func (ar *AlertingRule) GetAlert(id uint64) *notifier.Alert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
if ar.alerts == nil {
return nil
}
return ar.alerts[id]
}
func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string, args ...interface{}) {
if !ar.Debug {
return
@@ -188,6 +214,26 @@ func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string
logger.Infof("%s", prefix+msg)
}
// updateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (ar *AlertingRule) updateWith(r Rule) error {
nr, ok := r.(*AlertingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
}
ar.Expr = nr.Expr
ar.For = nr.For
ar.KeepFiringFor = nr.KeepFiringFor
ar.Labels = nr.Labels
ar.Annotations = nr.Annotations
ar.EvalInterval = nr.EvalInterval
ar.Debug = nr.Debug
ar.q = nr.q
ar.state = nr.state
return nil
}
type labelSet struct {
// origin labels extracted from received time series
// plus extra labels (group labels, service labels like alertNameLabel).
@@ -223,7 +269,7 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
Expr: ar.Expr,
})
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
return nil, fmt.Errorf("failed to expand labels: %w", err)
}
for k, v := range extraLabels {
ls.processed[k] = v
@@ -248,25 +294,34 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
return ls, nil
}
// ExecRange executes alerting rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
// It returns ALERT and ALERT_FOR_STATE time series as result.
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
// execRange executes alerting rule on the given time range similarly to exec.
// When making consecutive calls make sure to respect time linearity for start and end params,
// as this function modifies AlertingRule alerts state.
// It is not thread safe.
// It returns ALERT and ALERT_FOR_STATE time series as a result.
func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
if err != nil {
return nil, err
}
var result []prompbmarshal.TimeSeries
holdAlertState := make(map[uint64]*notifier.Alert)
qFn := func(query string) ([]datasource.Metric, error) {
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
}
for _, s := range res.Data {
ls, err := ar.toLabels(s, qFn)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
h := hash(ls.processed)
a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
if err != nil {
return nil, fmt.Errorf("failed to create alert: %s", err)
return nil, fmt.Errorf("failed to create alert: %w", err)
}
if ar.For == 0 { // if alert is instant
// if alert is instant, For: 0
if ar.For == 0 {
a.State = notifier.StateFiring
for i := range s.Values {
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
@@ -278,18 +333,32 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
prevT := time.Time{}
for i := range s.Values {
at := time.Unix(s.Timestamps[i], 0)
// try to restore alert's state on the first iteration
if at.Equal(start) {
if _, ok := ar.alerts[h]; ok {
a = ar.alerts[h]
prevT = at
}
}
if at.Sub(prevT) > ar.EvalInterval {
// reset to Pending if there are gaps > EvalInterval between DPs
a.State = notifier.StatePending
a.ActiveAt = at
} else if at.Sub(a.ActiveAt) >= ar.For {
a.Start = time.Time{}
} else if at.Sub(a.ActiveAt) >= ar.For && a.State != notifier.StateFiring {
a.State = notifier.StateFiring
a.Start = at
}
prevT = at
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
// save alert's state on last iteration, so it can be used on the next execRange call
if at.Equal(end) {
holdAlertState[h] = a
}
}
}
ar.alerts = holdAlertState
return result, nil
}
@@ -297,19 +366,19 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
// is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time.Minute
// Exec executes AlertingRule expression via the given Querier.
// exec executes AlertingRule expression via the given Querier.
// Based on the Querier results AlertingRule maintains notifier.Alerts
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
res, req, err := ar.q.Query(ctx, ar.Expr, ts)
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(res.Data),
seriesFetched: res.SeriesFetched,
err: err,
curl: requestToCurl(req),
curState := StateEntry{
Time: start,
At: ts,
Duration: time.Since(start),
Samples: len(res.Data),
SeriesFetched: res.SeriesFetched,
Err: err,
Curl: requestToCurl(req),
}
defer func() {
@@ -323,7 +392,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
}
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration)
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.Samples, curState.Duration)
for h, a := range ar.alerts {
// cleanup inactive alerts from previous Exec
@@ -342,15 +411,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
for _, m := range res.Data {
ls, err := ar.toLabels(m, qFn)
if err != nil {
curState.err = fmt.Errorf("failed to expand labels: %s", err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to expand labels: %w", err)
return nil, curState.Err
}
h := hash(ls.processed)
if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels
// conflicting with the metric labels
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.err
curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.Err
}
updated[h] = struct{}{}
if a, ok := ar.alerts[h]; ok {
@@ -373,8 +442,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
}
a, err := ar.newAlert(m, ls, start, qFn)
if err != nil {
curState.err = fmt.Errorf("failed to create alert: %w", err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to create alert: %w", err)
return nil, curState.Err
}
a.ID = h
a.State = notifier.StatePending
@@ -423,8 +492,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
}
if limit > 0 && numActivePending > limit {
ar.alerts = map[uint64]*notifier.Alert{}
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.err
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.Err
}
return ar.toTimeSeries(ts.Unix()), nil
}
@@ -441,26 +510,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries
return tss
}
// UpdateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (ar *AlertingRule) UpdateWith(r Rule) error {
nr, ok := r.(*AlertingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
}
ar.Expr = nr.Expr
ar.For = nr.For
ar.KeepFiringFor = nr.KeepFiringFor
ar.Labels = nr.Labels
ar.Annotations = nr.Annotations
ar.EvalInterval = nr.EvalInterval
ar.Debug = nr.Debug
ar.q = nr.q
ar.state = nr.state
return nil
}
// TODO: consider hashing algorithm in VM
func hash(labels map[string]string) uint64 {
hash := fnv.New64a()
@@ -487,7 +536,7 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
if ls == nil {
ls, err = ar.toLabels(m, qFn)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
return nil, fmt.Errorf("failed to expand labels: %w", err)
}
}
a := &notifier.Alert{
@@ -503,102 +552,6 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
return a, err
}
// AlertAPI generates APIAlert object from alert by its id(hash)
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
a, ok := ar.alerts[id]
if !ok {
return nil
}
return ar.newAlertAPI(*a)
}
// ToAPI returns Rule representation in form of APIRule
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
func (ar *AlertingRule) ToAPI() APIRule {
lastState := ar.state.getLast()
r := APIRule{
Type: "alerting",
DatasourceType: ar.Type.String(),
Name: ar.Name,
Query: ar.Expr,
Duration: ar.For.Seconds(),
KeepFiringFor: ar.KeepFiringFor.Seconds(),
Labels: ar.Labels,
Annotations: ar.Annotations,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
State: "inactive",
Alerts: ar.AlertsToAPI(),
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: ar.state.size(),
Updates: ar.state.getAll(),
Debug: ar.Debug,
// encode as strings to avoid rounding in JSON
ID: fmt.Sprintf("%d", ar.ID()),
GroupID: fmt.Sprintf("%d", ar.GroupID),
}
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
// satisfy APIRule.State logic
if len(r.Alerts) > 0 {
r.State = notifier.StatePending.String()
stateFiring := notifier.StateFiring.String()
for _, a := range r.Alerts {
if a.State == stateFiring {
r.State = stateFiring
break
}
}
}
return r
}
// AlertsToAPI generates list of APIAlert objects from existing alerts
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
var alerts []*APIAlert
ar.alertsMu.RLock()
for _, a := range ar.alerts {
if a.State == notifier.StateInactive {
continue
}
alerts = append(alerts, ar.newAlertAPI(*a))
}
ar.alertsMu.RUnlock()
return alerts
}
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
aa := &APIAlert{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
GroupID: fmt.Sprintf("%d", a.GroupID),
RuleID: fmt.Sprintf("%d", ar.RuleID),
Name: a.Name,
Expression: ar.Expr,
Labels: a.Labels,
Annotations: a.Annotations,
State: a.State.String(),
ActiveAt: a.ActiveAt,
Restored: a.Restored,
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
}
if alertURLGeneratorFn != nil {
aa.SourceLink = alertURLGeneratorFn(a)
}
if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
aa.Stabilizing = true
}
return aa
}
const (
// alertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS"
@@ -646,10 +599,10 @@ func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.Time
return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels)
}
// Restore restores the value of ActiveAt field for active alerts,
// restore restores the value of ActiveAt field for active alerts,
// based on previously written time series `alertForStateMetricName`.
// Only rules with For > 0 can be restored.
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
if ar.For < 1 {
return nil
}
@@ -661,44 +614,41 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts ti
return nil
}
for _, a := range ar.alerts {
nameStr := fmt.Sprintf("%s=%q", alertNameLabel, ar.Name)
if !*disableAlertGroupLabel {
nameStr = fmt.Sprintf("%s=%q,%s=%q", alertGroupNameLabel, ar.GroupName, alertNameLabel, ar.Name)
}
var labelsFilter string
for k, v := range ar.Labels {
labelsFilter += fmt.Sprintf(",%s=%q", k, v)
}
expr := fmt.Sprintf("last_over_time(%s{%s%s}[%ds])",
alertForStateMetricName, nameStr, labelsFilter, int(lookback.Seconds()))
res, _, err := q.Query(ctx, expr, ts)
if err != nil {
return fmt.Errorf("failed to execute restore query %q: %w ", expr, err)
}
if len(res.Data) < 1 {
ar.logDebugf(ts, nil, "no response was received from restore query")
return nil
}
for _, series := range res.Data {
series.DelLabel("__name__")
labelSet := make(map[string]string, len(series.Labels))
for _, v := range series.Labels {
labelSet[v.Name] = v.Value
}
id := hash(labelSet)
a, ok := ar.alerts[id]
if !ok {
continue
}
if a.Restored || a.State != notifier.StatePending {
continue
}
var labelsFilter []string
for k, v := range a.Labels {
labelsFilter = append(labelsFilter, fmt.Sprintf("%s=%q", k, v))
}
sort.Strings(labelsFilter)
expr := fmt.Sprintf("last_over_time(%s{%s}[%ds])",
alertForStateMetricName, strings.Join(labelsFilter, ","), int(lookback.Seconds()))
ar.logDebugf(ts, nil, "restoring alert state via query %q", expr)
res, _, err := q.Query(ctx, expr, ts)
if err != nil {
return err
}
qMetrics := res.Data
if len(qMetrics) < 1 {
ar.logDebugf(ts, nil, "no response was received from restore query")
continue
}
// only one series expected in response
m := qMetrics[0]
// __name__ supposed to be alertForStateMetricName
m.DelLabel("__name__")
// we assume that restore query contains all label matchers,
// so all received labels will match anyway if their number is equal.
if len(m.Labels) != len(a.Labels) {
ar.logDebugf(ts, nil, "state restore query returned not expected label-set %v", m.Labels)
continue
}
a.ActiveAt = time.Unix(int64(m.Values[0]), 0)
a.ActiveAt = time.Unix(int64(series.Values[0]), 0)
a.Restored = true
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt)
}

View File

@@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@@ -303,13 +303,13 @@ func TestAlertingRule_Exec(t *testing.T) {
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
for i, step := range tc.steps {
fq.reset()
fq.add(step...)
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
fq.Reset()
fq.Add(step...)
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err)
}
// artificial delay between applying steps
@@ -346,15 +346,18 @@ func TestAlertingRule_Exec(t *testing.T) {
}
func TestAlertingRule_ExecRange(t *testing.T) {
fakeGroup := Group{Name: "TestRule_ExecRange"}
testCases := []struct {
rule *AlertingRule
data []datasource.Metric
expAlerts []*notifier.Alert
rule *AlertingRule
data []datasource.Metric
expAlerts []*notifier.Alert
expHoldAlertStateAlerts map[uint64]*notifier.Alert
}{
{
newTestAlertingRule("empty", 0),
[]datasource.Metric{},
nil,
nil,
},
{
newTestAlertingRule("empty labels", 0),
@@ -364,6 +367,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
[]*notifier.Alert{
{State: notifier.StateFiring},
},
nil,
},
{
newTestAlertingRule("single-firing", 0),
@@ -376,6 +380,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
State: notifier.StateFiring,
},
},
nil,
},
{
newTestAlertingRule("single-firing-on-range", 0),
@@ -387,6 +392,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StateFiring},
{State: notifier.StateFiring},
},
nil,
},
{
newTestAlertingRule("for-pending", time.Second),
@@ -398,6 +404,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StatePending, ActiveAt: time.Unix(3, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-pending"}): {
GroupID: fakeGroup.ID(),
Name: "for-pending",
Labels: map[string]string{"alertname": "for-pending"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: time.Second,
}},
},
{
newTestAlertingRule("for-firing", 3*time.Second),
@@ -409,6 +425,38 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-firing"}): {
GroupID: fakeGroup.ID(),
Name: "for-firing",
Labels: map[string]string{"alertname": "for-firing"},
Annotations: map[string]string{},
State: notifier.StateFiring,
ActiveAt: time.Unix(1, 0),
Start: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
}},
},
{
newTestAlertingRule("for-hold-pending", time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 2, 5}},
},
[]*notifier.Alert{
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-hold-pending"}): {
GroupID: fakeGroup.ID(),
Name: "for-hold-pending",
Labels: map[string]string{"alertname": "for-hold-pending"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: time.Second,
}},
},
{
newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
@@ -422,12 +470,14 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(20, 0)},
},
nil,
},
{
newTestAlertingRule("multi-series-for=>pending=>pending=>firing", 3*time.Second),
newTestAlertingRule("multi-series", 3*time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
{Values: []float64{1, 1}, Timestamps: []int64{1, 5},
{
Values: []float64{1, 1}, Timestamps: []int64{1, 5},
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
},
},
@@ -435,22 +485,49 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
//
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
{
State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
Labels: map[string]string{
"foo": "bar",
}},
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0),
},
},
{
State: notifier.StatePending, ActiveAt: time.Unix(5, 0),
Labels: map[string]string{
"foo": "bar",
}},
},
},
},
map[uint64]*notifier.Alert{
hash(map[string]string{"alertname": "multi-series"}): {
GroupID: fakeGroup.ID(),
Name: "multi-series",
Labels: map[string]string{"alertname": "multi-series"},
Annotations: map[string]string{},
State: notifier.StateFiring,
ActiveAt: time.Unix(1, 0),
Start: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
},
hash(map[string]string{"alertname": "multi-series", "foo": "bar"}): {
GroupID: fakeGroup.ID(),
Name: "multi-series",
Labels: map[string]string{"alertname": "multi-series", "foo": "bar"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
},
},
},
{
newTestRuleWithLabels("multi-series-firing", "source", "vm"),
[]datasource.Metric{
{Values: []float64{1, 1}, Timestamps: []int64{1, 100}},
{Values: []float64{1, 1}, Timestamps: []int64{1, 5},
{
Values: []float64{1, 1}, Timestamps: []int64{1, 5},
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
},
},
@@ -471,16 +548,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
"source": "vm",
}},
},
nil,
},
}
fakeGroup := Group{Name: "TestRule_ExecRange"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
fq.add(tc.data...)
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
fq.Add(tc.data...)
gotTS, err := tc.rule.execRange(context.TODO(), time.Unix(1, 0), time.Unix(5, 0))
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
@@ -506,30 +583,35 @@ func TestAlertingRule_ExecRange(t *testing.T) {
t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got)
}
}
if tc.expHoldAlertStateAlerts != nil {
if !reflect.DeepEqual(tc.expHoldAlertStateAlerts, tc.rule.alerts) {
t.Fatalf("expected hold alerts state: \n%v but got \n%v", tc.expHoldAlertStateAlerts, tc.rule.alerts)
}
}
})
}
}
func TestGroup_Restore(t *testing.T) {
defaultTS := time.Now()
fqr := &fakeQuerierWithRegistry{}
fqr := &datasource.FakeQuerierWithRegistry{}
fn := func(rules []config.Rule, expAlerts map[uint64]*notifier.Alert) {
t.Helper()
defer fqr.reset()
defer fqr.Reset()
for _, r := range rules {
fqr.set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
fqr.Set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
}
fg := newGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
fg := NewGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
wg := sync.WaitGroup{}
wg.Add(1)
go func() {
nts := func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} }
fg.start(context.Background(), nts, nil, fqr)
nts := func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} }
fg.Start(context.Background(), nts, nil, fqr)
wg.Done()
}()
fg.close()
fg.Close()
wg.Wait()
gotAlerts := make(map[uint64]*notifier.Alert)
@@ -558,6 +640,9 @@ func TestGroup_Restore(t *testing.T) {
if got.ActiveAt != exp.ActiveAt {
t.Fatalf("expected ActiveAt %v; got %v", exp.ActiveAt, got.ActiveAt)
}
if got.Name != exp.Name {
t.Fatalf("expected alertname %q; got %q", exp.Name, got.Name)
}
}
}
@@ -573,44 +658,28 @@ func TestGroup_Restore(t *testing.T) {
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: defaultTS,
},
})
fqr.reset()
fqr.Reset()
// one active alert with state restore
ts := time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
ActiveAt: ts},
Name: "foo",
ActiveAt: ts,
},
})
// two rules, two active alerts, one with state restored
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("foo", ts))
fn(
[]config.Rule{
{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)},
{Alert: "bar", Expr: "bar", For: promutils.NewDuration(time.Second)},
},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
ActiveAt: defaultTS,
},
hash(map[string]string{alertNameLabel: "bar", alertGroupNameLabel: "TestRestore"}): {
ActiveAt: ts},
})
// two rules, two active alerts, two with state restored
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("bar", ts))
fn(
[]config.Rule{
@@ -619,74 +688,102 @@ func TestGroup_Restore(t *testing.T) {
},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: defaultTS,
},
hash(map[string]string{alertNameLabel: "bar", alertGroupNameLabel: "TestRestore"}): {
Name: "bar",
ActiveAt: ts,
},
})
// two rules, two active alerts, two with state restored
ts = time.Now().Truncate(time.Hour)
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("bar", ts))
fn(
[]config.Rule{
{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)},
{Alert: "bar", Expr: "bar", For: promutils.NewDuration(time.Second)},
},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: ts,
},
hash(map[string]string{alertNameLabel: "bar", alertGroupNameLabel: "TestRestore"}): {
ActiveAt: ts},
Name: "bar",
ActiveAt: ts,
},
})
// one active alert but wrong state restore
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
stateMetric("wrong alert", ts))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: defaultTS,
},
})
// one active alert with labels
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev"))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore", "env": "dev"}): {
Name: "foo",
ActiveAt: ts,
},
})
// one active alert with restore labels missmatch
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev", "team", "foo"))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore", "env": "dev"}): {
Name: "foo",
ActiveAt: defaultTS,
},
})
}
func TestAlertingRule_Exec_Negative(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
// successful attempt
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
_, err := ar.Exec(context.TODO(), time.Now(), 0)
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
_, err := ar.exec(context.TODO(), time.Now(), 0)
if err != nil {
t.Fatal(err)
}
// label `job` will collide with rule extra label and will make both time series equal
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err = ar.Exec(context.TODO(), time.Now(), 0)
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if !errors.Is(err, errDuplicate) {
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
}
fq.reset()
fq.Reset()
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
_, err = ar.Exec(context.TODO(), time.Now(), 0)
fq.SetErr(errors.New(expErr))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
@@ -696,7 +793,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
}
func TestAlertingRuleLimit(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
@@ -728,15 +825,15 @@ func TestAlertingRuleLimit(t *testing.T) {
err error
timestamp = time.Now()
)
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
for _, testCase := range testCases {
_, err = ar.Exec(context.TODO(), timestamp, testCase.limit)
_, err = ar.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err)
}
}
fq.reset()
fq.Reset()
}
func TestAlertingRule_Template(t *testing.T) {
@@ -844,7 +941,8 @@ func TestAlertingRule_Template(t *testing.T) {
hash(map[string]string{
alertNameLabel: "OriginLabels",
alertGroupNameLabel: "Testing",
"instance": "foo"}): {
"instance": "foo",
}): {
Labels: map[string]string{
alertNameLabel: "OriginLabels",
alertGroupNameLabel: "Testing",
@@ -860,19 +958,18 @@ func TestAlertingRule_Template(t *testing.T) {
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
tc.rule.q = fq
tc.rule.state = newRuleState(10)
fq.add(tc.metrics...)
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
fq.Add(tc.metrics...)
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err)
}
for hash, expAlert := range tc.expAlerts {
gotAlert := tc.rule.alerts[hash]
if gotAlert == nil {
t.Fatalf("alert %d is missing; labels: %v; annotations: %v", hash, expAlert.Labels, expAlert.Annotations)
break
}
if !reflect.DeepEqual(expAlert.Annotations, gotAlert.Annotations) {
t.Fatalf("expected to have annotations %#v; got %#v", expAlert.Annotations, gotAlert.Annotations)
@@ -980,7 +1077,7 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
For: waitFor,
EvalInterval: waitFor,
alerts: make(map[uint64]*notifier.Alert),
state: newRuleState(10),
state: &ruleState{entries: make([]StateEntry, 10)},
}
return &rule
}

View File

@@ -1,8 +1,10 @@
package main
package rule
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"hash/fnv"
"net/url"
@@ -11,7 +13,7 @@ import (
"sync"
"time"
"github.com/VictoriaMetrics/metrics"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
@@ -21,16 +23,34 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
var (
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
resendDelay = flag.Duration("rule.resendDelay", 0, "MiniMum amount of time to wait before resending an alert to notifier")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maxiMum duration for automatic alert expiration, "+
"which by default is 4 times evaluationInterval of the parent group")
evalDelay = flag.Duration("rule.evalDelay", 30*time.Second, "Adjustment of the `time` parameter for rule evaluation requests to compensate intentional data delay from the datasource."+
"Normally, should be equal to `-search.latencyOffset` (cmd-line flag configured for VictoriaMetrics single-node or vmselect).")
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
)
// Group is an entity for grouping rules
type Group struct {
mu sync.RWMutex
Name string
File string
Rules []Rule
Type config.Type
Interval time.Duration
mu sync.RWMutex
Name string
File string
Rules []Rule
Type config.Type
Interval time.Duration
EvalOffset *time.Duration
// EvalDelay will adjust timestamp for rule evaluation requests to compensate intentional query delay from datasource.
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155
EvalDelay *time.Duration
Limit int
Concurrency int
Checksum string
@@ -51,6 +71,9 @@ type Group struct {
evalCancel context.CancelFunc
metrics *groupMetrics
// evalAlignment will make the timestamp of group query
// requests be aligned with interval
evalAlignment *bool
}
type groupMetrics struct {
@@ -92,7 +115,8 @@ func mergeLabels(groupName, ruleName string, set1, set2 map[string]string) map[s
return r
}
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
// NewGroup returns a new group
func NewGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
g := &Group{
Type: cfg.Type,
Name: cfg.Name,
@@ -105,6 +129,7 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
Headers: make(map[string]string),
NotifierHeaders: make(map[string]string),
Labels: cfg.Labels,
evalAlignment: cfg.EvalAlignment,
doneCh: make(chan struct{}),
finishedCh: make(chan struct{}),
@@ -116,6 +141,12 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
if g.Concurrency < 1 {
g.Concurrency = 1
}
if cfg.EvalOffset != nil {
g.EvalOffset = &cfg.EvalOffset.D
}
if cfg.EvalDelay != nil {
g.EvalDelay = &cfg.EvalDelay.D
}
for _, h := range cfg.Headers {
g.Headers[h.Key] = h.Value
}
@@ -145,11 +176,11 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
return g
}
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
if rule.Alert != "" {
return newAlertingRule(qb, g, rule)
func (g *Group) newRule(qb datasource.QuerierBuilder, r config.Rule) Rule {
if r.Alert != "" {
return NewAlertingRule(qb, g, r)
}
return newRecordingRule(qb, g, rule)
return NewRecordingRule(qb, g, r)
}
// ID return unique group ID that consists of
@@ -163,11 +194,15 @@ func (g *Group) ID() uint64 {
hash.Write([]byte("\xff"))
hash.Write([]byte(g.Name))
hash.Write([]byte(g.Type.Get()))
hash.Write([]byte(g.Interval.String()))
if g.EvalOffset != nil {
hash.Write([]byte(g.EvalOffset.String()))
}
return hash.Sum64()
}
// Restore restores alerts state for group rules
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
// restore restores alerts state for group rules
func (g *Group) restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule)
if !ok {
@@ -183,7 +218,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
Headers: g.Headers,
Debug: ar.Debug,
})
if err := ar.Restore(ctx, q, ts, lookback); err != nil {
if err := ar.restore(ctx, q, ts, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
}
}
@@ -193,7 +228,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
// updateWith updates existing group with
// passed group object. This function ignores group
// evaluation interval change. It supposed to be updated
// in group.start function.
// in group.Start function.
// Not thread-safe.
func (g *Group) updateWith(newGroup *Group) error {
rulesRegistry := make(map[uint64]Rule)
@@ -206,11 +241,11 @@ func (g *Group) updateWith(newGroup *Group) error {
if !ok {
// old rule is not present in the new list
// so we mark it for removing
g.Rules[i].Close()
g.Rules[i].close()
g.Rules[i] = nil
continue
}
if err := or.UpdateWith(nr); err != nil {
if err := or.updateWith(nr); err != nil {
return err
}
delete(rulesRegistry, nr.ID())
@@ -243,10 +278,10 @@ func (g *Group) updateWith(newGroup *Group) error {
return nil
}
// interruptEval interrupts in-flight rules evaluations
// InterruptEval interrupts in-flight rules evaluations
// within the group. It is expected that g.evalCancel
// will be repopulated after the call.
func (g *Group) interruptEval() {
func (g *Group) InterruptEval() {
g.mu.RLock()
defer g.mu.RUnlock()
@@ -255,12 +290,13 @@ func (g *Group) interruptEval() {
}
}
func (g *Group) close() {
// Close stops the group and it's rules, unregisters group metrics
func (g *Group) Close() {
if g.doneCh == nil {
return
}
close(g.doneCh)
g.interruptEval()
g.InterruptEval()
<-g.finishedCh
g.metrics.iterationDuration.Unregister()
@@ -268,24 +304,25 @@ func (g *Group) close() {
g.metrics.iterationMissed.Unregister()
g.metrics.iterationInterval.Unregister()
for _, rule := range g.Rules {
rule.Close()
rule.close()
}
}
var skipRandSleepOnGroupStart bool
// SkipRandSleepOnGroupStart will skip random sleep delay in group first evaluation
var SkipRandSleepOnGroupStart bool
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
// Start starts group's evaluation
func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
defer func() { close(g.finishedCh) }()
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
if !skipRandSleepOnGroupStart {
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
if randSleep < sleepOffset {
randSleep += uint64(g.Interval)
}
randSleep -= sleepOffset
sleepTimer := time.NewTimer(time.Duration(randSleep))
evalTS := time.Now()
// sleep random duration to spread group rules evaluation
// over time in order to reduce load on datasource.
if !SkipRandSleepOnGroupStart {
sleepBeforeStart := delayBeforeStart(evalTS, g.ID(), g.Interval, g.EvalOffset)
g.infof("will start in %v", sleepBeforeStart)
sleepTimer := time.NewTimer(sleepBeforeStart)
select {
case <-ctx.Done():
sleepTimer.Stop()
@@ -295,18 +332,17 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
return
case <-sleepTimer.C:
}
evalTS = evalTS.Add(sleepBeforeStart)
}
e := &executor{
rw: rw,
notifiers: nts,
Rw: rw,
Notifiers: nts,
notifierHeaders: g.NotifierHeaders,
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
evalTS := time.Now()
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
g.infof("started")
eval := func(ctx context.Context, ts time.Time) {
g.metrics.iterationTotal.Inc()
@@ -320,6 +356,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
}
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
ts = g.adjustReqTimestamp(ts)
errs := e.execConcurrently(ctx, g.Rules, ts, g.Concurrency, resolveDuration, g.Limit)
for err := range errs {
if err != nil {
@@ -344,7 +381,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
// restore the rules state after the first evaluation
// so only active alerts can be restored.
if rr != nil {
err := g.Restore(ctx, rr, evalTS, *remoteReadLookBack)
err := g.restore(ctx, rr, evalTS, *remoteReadLookBack)
if err != nil {
logger.Errorf("error while restoring ruleState for group %q: %s", g.Name, err)
}
@@ -375,19 +412,12 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
continue
}
// ensure that staleness is tracked or existing rules only
// ensure that staleness is tracked for existing rules only
e.purgeStaleSeries(g.Rules)
e.notifierHeaders = g.NotifierHeaders
if g.Interval != ng.Interval {
g.Interval = ng.Interval
t.Stop()
t = time.NewTicker(g.Interval)
evalTS = time.Now()
}
g.mu.Unlock()
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
g.infof("re-started")
case <-t.C:
missed := (time.Since(evalTS) / g.Interval) - 1
if missed < 0 {
@@ -405,6 +435,134 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
}
}
// UpdateWith inserts new group to updateCh
func (g *Group) UpdateWith(new *Group) {
g.updateCh <- new
}
// DeepCopy returns a deep copy of group
func (g *Group) DeepCopy() *Group {
g.mu.RLock()
data, _ := json.Marshal(g)
g.mu.RUnlock()
newG := Group{}
_ = json.Unmarshal(data, &newG)
newG.Rules = g.Rules
return &newG
}
// delayBeforeStart returns a duration on the interval between [ts..ts+interval].
// delayBeforeStart accounts for `offset`, so returned duration should be always
// bigger than the `offset`.
func delayBeforeStart(ts time.Time, key uint64, interval time.Duration, offset *time.Duration) time.Duration {
var randSleep time.Duration
randSleep = time.Duration(float64(interval) * (float64(key) / (1 << 64)))
sleepOffset := time.Duration(ts.UnixNano() % interval.Nanoseconds())
if randSleep < sleepOffset {
randSleep += interval
}
randSleep -= sleepOffset
// check if `ts` after randSleep is before `offset`,
// if it is, add extra eval_offset to randSleep.
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3409.
if offset != nil {
tmpEvalTS := ts.Add(randSleep)
if tmpEvalTS.Before(tmpEvalTS.Truncate(interval).Add(*offset)) {
randSleep += *offset
}
}
return randSleep
}
func (g *Group) infof(format string, args ...interface{}) {
msg := fmt.Sprintf(format, args...)
logger.Infof("group %q %s; interval=%v; eval_offset=%v; concurrency=%d",
g.Name, msg, g.Interval, g.EvalOffset, g.Concurrency)
}
// Replay performs group replay
func (g *Group) Replay(start, end time.Time, rw remotewrite.RWClient, maxDataPoint, replayRuleRetryAttempts int, replayDelay time.Duration, disableProgressBar bool) int {
var total int
step := g.Interval * time.Duration(maxDataPoint)
ri := rangeIterator{start: start, end: end, step: step}
iterations := int(end.Sub(start)/step) + 1
fmt.Printf("\nGroup %q"+
"\ninterval: \t%v"+
"\nrequests to make: \t%d"+
"\nmax range per request: \t%v\n",
g.Name, g.Interval, iterations, step)
if g.Limit > 0 {
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
g.Limit)
}
for _, rule := range g.Rules {
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
var bar *pb.ProgressBar
if !disableProgressBar {
bar = pb.StartNew(iterations)
}
ri.reset()
for ri.next() {
n, err := replayRule(rule, ri.s, ri.e, rw, replayRuleRetryAttempts)
if err != nil {
logger.Fatalf("rule %q: %s", rule, err)
}
total += n
if bar != nil {
bar.Increment()
}
}
if bar != nil {
bar.Finish()
}
// sleep to let remote storage to flush data on-disk
// so chained rules could be calculated correctly
time.Sleep(replayDelay)
}
return total
}
// ExecOnce evaluates all the rules under group for once with given timestamp.
func (g *Group) ExecOnce(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, evalTS time.Time) chan error {
e := &executor{
Rw: rw,
Notifiers: nts,
notifierHeaders: g.NotifierHeaders,
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
if len(g.Rules) < 1 {
return nil
}
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
return e.execConcurrently(ctx, g.Rules, evalTS, g.Concurrency, resolveDuration, g.Limit)
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}
// getResolveDuration returns the duration after which firing alert
// can be considered as resolved.
func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Duration {
@@ -418,11 +576,49 @@ func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Du
return resolveDuration
}
func (g *Group) adjustReqTimestamp(timestamp time.Time) time.Time {
if g.EvalOffset != nil {
// calculate the min timestamp on the evaluationInterval
intervalStart := timestamp.Truncate(g.Interval)
ts := intervalStart.Add(*g.EvalOffset)
if timestamp.Before(ts) {
// if passed timestamp is before the expected evaluation offset,
// then we should adjust it to the previous evaluation round.
// E.g. request with evaluationInterval=1h and evaluationOffset=30m
// was evaluated at 11:20. Then the timestamp should be adjusted
// to 10:30, to the previous evaluationInterval.
return ts.Add(-g.Interval)
}
// when `eval_offset` is using, ts shouldn't be effect by `eval_alignment` and `eval_delay`
// since it should be always aligned.
return ts
}
timestamp = timestamp.Add(-g.getEvalDelay())
// always apply the alignment as a last step
if g.evalAlignment == nil || *g.evalAlignment {
// align query time with interval to get similar result with grafana when plotting time series.
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049
// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1232
return timestamp.Truncate(g.Interval)
}
return timestamp
}
func (g *Group) getEvalDelay() time.Duration {
if g.EvalDelay != nil {
return *g.EvalDelay
}
return *evalDelay
}
// executor contains group's notify and rw configs
type executor struct {
notifiers func() []notifier.Notifier
Notifiers func() []notifier.Notifier
notifierHeaders map[string]string
rw *remotewrite.Client
Rw remotewrite.RWClient
previouslySentSeriesToRWMu sync.Mutex
// previouslySentSeriesToRW stores series sent to RW on previous iteration
@@ -432,6 +628,7 @@ type executor struct {
previouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
}
// execConcurrently executes rules concurrently if concurrency>1
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.Time, concurrency int, resolveDuration time.Duration, limit int) chan error {
res := make(chan error, len(rules))
if concurrency == 1 {
@@ -446,14 +643,14 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.T
sem := make(chan struct{}, concurrency)
go func() {
wg := sync.WaitGroup{}
for _, rule := range rules {
for _, r := range rules {
sem <- struct{}{}
wg.Add(1)
go func(r Rule) {
res <- e.exec(ctx, r, ts, resolveDuration, limit)
<-sem
wg.Done()
}(rule)
}(r)
}
wg.Wait()
close(res)
@@ -466,15 +663,12 @@ var (
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
)
func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
execTotal.Inc()
tss, err := rule.Exec(ctx, ts, limit)
tss, err := r.exec(ctx, ts, limit)
if err != nil {
if errors.Is(err, context.Canceled) {
// the context can be cancelled on graceful shutdown
@@ -482,17 +676,15 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return nil
}
execErrors.Inc()
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
return fmt.Errorf("rule %q: failed to execute: %w", r, err)
}
if e.rw != nil {
if e.Rw != nil {
pushToRW := func(tss []prompbmarshal.TimeSeries) error {
var lastErr error
for _, ts := range tss {
remoteWriteTotal.Inc()
if err := e.rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
lastErr = fmt.Errorf("rule %q: remote write failure: %w", rule, err)
if err := e.Rw.Push(ts); err != nil {
lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err)
}
}
return lastErr
@@ -501,13 +693,13 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return err
}
staleSeries := e.getStaleSeries(rule, tss, ts)
staleSeries := e.getStaleSeries(r, tss, ts)
if err := pushToRW(staleSeries); err != nil {
return err
}
}
ar, ok := rule.(*AlertingRule)
ar, ok := r.(*AlertingRule)
if !ok {
return nil
}
@@ -519,11 +711,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
wg := sync.WaitGroup{}
errGr := new(utils.ErrGroup)
for _, nt := range e.notifiers() {
for _, nt := range e.Notifiers() {
wg.Add(1)
go func(nt notifier.Notifier) {
if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil {
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", rule, nt.Addr(), err))
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", r, nt.Addr(), err))
}
wg.Done()
}(nt)
@@ -533,7 +725,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
}
// getStaledSeries checks whether there are stale series from previously sent ones.
func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
func (e *executor) getStaleSeries(r Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
ruleLabels := make(map[string][]prompbmarshal.Label, len(tss))
for _, ts := range tss {
// convert labels to strings so we can compare with previously sent series
@@ -541,7 +733,7 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
ruleLabels[key] = ts.Labels
}
rID := rule.ID()
rID := r.ID()
var staleS []prompbmarshal.TimeSeries
// check whether there are series which disappeared and need to be marked as stale
e.previouslySentSeriesToRWMu.Lock()

View File

@@ -1,16 +1,22 @@
package main
package rule
import (
"context"
"fmt"
"math"
"os"
"reflect"
"sort"
"testing"
"time"
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
@@ -19,7 +25,15 @@ import (
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
skipRandSleepOnGroupStart = true
SkipRandSleepOnGroupStart = true
}
func TestMain(m *testing.M) {
if err := templates.Load([]string{}, true); err != nil {
fmt.Println("failed to load template for test")
os.Exit(1)
}
os.Exit(m.Run())
}
func TestUpdateWith(t *testing.T) {
@@ -35,18 +49,19 @@ func TestUpdateWith(t *testing.T) {
},
{
"update alerting rule",
[]config.Rule{{
Alert: "foo",
Expr: "up > 0",
For: promutils.NewDuration(time.Second),
Labels: map[string]string{
"bar": "baz",
[]config.Rule{
{
Alert: "foo",
Expr: "up > 0",
For: promutils.NewDuration(time.Second),
Labels: map[string]string{
"bar": "baz",
},
Annotations: map[string]string{
"summary": "{{ $value|humanize }}",
"description": "{{$labels}}",
},
},
Annotations: map[string]string{
"summary": "{{ $value|humanize }}",
"description": "{{$labels}}",
},
},
{
Alert: "bar",
Expr: "up > 0",
@@ -54,7 +69,8 @@ func TestUpdateWith(t *testing.T) {
Labels: map[string]string{
"bar": "baz",
},
}},
},
},
[]config.Rule{
{
Alert: "foo",
@@ -75,7 +91,8 @@ func TestUpdateWith(t *testing.T) {
Labels: map[string]string{
"bar": "baz",
},
}},
},
},
},
{
"update recording rule",
@@ -134,7 +151,7 @@ func TestUpdateWith(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := &Group{Name: "test"}
qb := &fakeQuerier{}
qb := &datasource.FakeQuerier{}
for _, r := range tc.currentRules {
r.ID = config.HashRule(r)
g.Rules = append(g.Rules, g.newRule(qb, r))
@@ -166,7 +183,7 @@ func TestUpdateWith(t *testing.T) {
if got.ID() != want.ID() {
t.Fatalf("expected to have rule %q; got %q", want, got)
}
if err := compareRules(t, got, want); err != nil {
if err := CompareRules(t, got, want); err != nil {
t.Fatalf("comparison error: %s", err)
}
}
@@ -175,17 +192,31 @@ func TestUpdateWith(t *testing.T) {
}
func TestGroupStart(t *testing.T) {
// TODO: make parsing from string instead of file
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
const (
rules = `
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"
`
)
var groups []config.Group
err := yaml.Unmarshal([]byte(rules), &groups)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
fs := &fakeQuerier{}
fn := &fakeNotifier{}
fs := &datasource.FakeQuerier{}
fn := &notifier.FakeNotifier{}
const evalInterval = time.Millisecond
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
g := NewGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2
const inst1, inst2, job = "foo", "bar", "baz"
@@ -200,7 +231,7 @@ func TestGroupStart(t *testing.T) {
alert1.State = notifier.StateFiring
// add external label
alert1.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
// add rule labels
alert1.Labels["label"] = "bar"
alert1.Labels["host"] = inst1
// add service labels
@@ -215,7 +246,7 @@ func TestGroupStart(t *testing.T) {
alert2.State = notifier.StateFiring
// add external label
alert2.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
// add rule labels
alert2.Labels["label"] = "bar"
alert2.Labels["host"] = inst2
// add service labels
@@ -224,40 +255,40 @@ func TestGroupStart(t *testing.T) {
alert2.ID = hash(alert2.Labels)
finished := make(chan struct{})
fs.add(m1)
fs.add(m2)
fs.Add(m1)
fs.Add(m2)
go func() {
g.start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
g.Start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
close(finished)
}()
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts := fn.getAlerts()
gotAlerts := fn.GetAlerts()
expectedAlerts := []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts)
gotAlertsNum := fn.getCounter()
gotAlertsNum := fn.GetCounter()
if gotAlertsNum < len(expectedAlerts)*2 {
t.Fatalf("expected to receive at least %d alerts; got %d instead",
len(expectedAlerts)*2, gotAlertsNum)
}
// reset previous data
fs.reset()
fs.Reset()
// and set only one datapoint for response
fs.add(m1)
fs.Add(m1)
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts = fn.getAlerts()
gotAlerts = fn.GetAlerts()
alert2.State = notifier.StateInactive
expectedAlerts = []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts)
g.close()
g.Close()
<-finished
}
@@ -292,13 +323,13 @@ func TestGetStaleSeries(t *testing.T) {
e := &executor{
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
f := func(rule Rule, labels, expLabels [][]prompbmarshal.Label) {
f := func(r Rule, labels, expLabels [][]prompbmarshal.Label) {
t.Helper()
var tss []prompbmarshal.TimeSeries
for _, l := range labels {
tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l))
}
staleS := e.getStaleSeries(rule, tss, ts)
staleS := e.getStaleSeries(r, tss, ts)
if staleS == nil && expLabels == nil {
return
}
@@ -434,17 +465,17 @@ func TestPurgeStaleSeries(t *testing.T) {
}
func TestFaultyNotifier(t *testing.T) {
fq := &fakeQuerier{}
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq := &datasource.FakeQuerier{}
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
r := newTestAlertingRule("instant", 0)
r.q = fq
fn := &fakeNotifier{}
fn := &notifier.FakeNotifier{}
e := &executor{
notifiers: func() []notifier.Notifier {
Notifiers: func() []notifier.Notifier {
return []notifier.Notifier{
&faultyNotifier{},
&notifier.FaultyNotifier{},
fn,
}
},
@@ -460,7 +491,7 @@ func TestFaultyNotifier(t *testing.T) {
tn := time.Now()
deadline := tn.Add(delay / 2)
for {
if fn.getCounter() > 0 {
if fn.GetCounter() > 0 {
return
}
if tn.After(deadline) {
@@ -473,17 +504,17 @@ func TestFaultyNotifier(t *testing.T) {
}
func TestFaultyRW(t *testing.T) {
fq := &fakeQuerier{}
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq := &datasource.FakeQuerier{}
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
r := &RecordingRule{
Name: "test",
state: newRuleState(10),
q: fq,
state: &ruleState{entries: make([]StateEntry, 10)},
}
e := &executor{
rw: &remotewrite.Client{},
Rw: &remotewrite.Client{},
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
@@ -494,23 +525,38 @@ func TestFaultyRW(t *testing.T) {
}
func TestCloseWithEvalInterruption(t *testing.T) {
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
const (
rules = `
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"
`
)
var groups []config.Group
err := yaml.Unmarshal([]byte(rules), &groups)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
const delay = time.Second * 2
fq := &fakeQuerierWithDelay{delay: delay}
fq := &datasource.FakeQuerierWithDelay{Delay: delay}
const evalInterval = time.Millisecond
g := newGroup(groups[0], fq, evalInterval, nil)
g := NewGroup(groups[0], fq, evalInterval, nil)
go g.start(context.Background(), nil, nil, nil)
go g.Start(context.Background(), nil, nil, nil)
time.Sleep(evalInterval * 20)
go func() {
g.close()
g.Close()
}()
deadline := time.Tick(delay / 2)
@@ -520,3 +566,225 @@ func TestCloseWithEvalInterruption(t *testing.T) {
case <-g.finishedCh:
}
}
func TestGroupStartDelay(t *testing.T) {
g := &Group{}
// interval of 5min and key generate a static delay of 30s
g.Interval = time.Minute * 5
key := uint64(math.MaxUint64 / 10)
f := func(atS, expS string) {
t.Helper()
at, err := time.Parse(time.RFC3339Nano, atS)
if err != nil {
t.Fatal(err)
}
expTS, err := time.Parse(time.RFC3339Nano, expS)
if err != nil {
t.Fatal(err)
}
delay := delayBeforeStart(at, key, g.Interval, g.EvalOffset)
gotStart := at.Add(delay)
if expTS != gotStart {
t.Errorf("expected to get %v; got %v instead", expTS, gotStart)
}
}
// test group without offset
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:00:30.000+00:00")
f("2023-01-01T00:00:00.999+00:00", "2023-01-01T00:00:30.000+00:00")
f("2023-01-01T00:00:29.000+00:00", "2023-01-01T00:00:30.000+00:00")
f("2023-01-01T00:00:31.000+00:00", "2023-01-01T00:05:30.000+00:00")
// test group with offset smaller than above fixed randSleep,
// this way randSleep will always be enough
offset := 20 * time.Second
g.EvalOffset = &offset
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:00:30.000+00:00")
f("2023-01-01T00:00:29.000+00:00", "2023-01-01T00:00:30.000+00:00")
f("2023-01-01T00:00:31.000+00:00", "2023-01-01T00:05:30.000+00:00")
// test group with offset bigger than above fixed randSleep,
// this way offset will be added to delay
offset = 3 * time.Minute
g.EvalOffset = &offset
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:03:30.000+00:00")
f("2023-01-01T00:00:29.000+00:00", "2023-01-01T00:03:30.000+00:00")
f("2023-01-01T00:01:00.000+00:00", "2023-01-01T00:08:30.000+00:00")
f("2023-01-01T00:03:30.000+00:00", "2023-01-01T00:08:30.000+00:00")
f("2023-01-01T00:07:30.000+00:00", "2023-01-01T00:13:30.000+00:00")
offset = 10 * time.Minute
g.EvalOffset = &offset
// interval of 1h and key generate a static delay of 6m
g.Interval = time.Hour
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:16:00.000+00:00")
f("2023-01-01T00:05:00.000+00:00", "2023-01-01T00:16:00.000+00:00")
f("2023-01-01T00:30:00.000+00:00", "2023-01-01T01:16:00.000+00:00")
}
func TestGetPrometheusReqTimestamp(t *testing.T) {
offset := 30 * time.Minute
evalDelay := 1 * time.Minute
disableAlign := false
testCases := []struct {
name string
g *Group
originTS, expTS string
}{
{
"with query align + default evalDelay",
&Group{
Interval: time.Hour,
},
"2023-08-28T11:11:00+00:00",
"2023-08-28T11:00:00+00:00",
},
{
"without query align + default evalDelay",
&Group{
Interval: time.Hour,
evalAlignment: &disableAlign,
},
"2023-08-28T11:11:00+00:00",
"2023-08-28T11:10:30+00:00",
},
{
"with eval_offset, find previous offset point + default evalDelay",
&Group{
EvalOffset: &offset,
Interval: time.Hour,
},
"2023-08-28T11:11:00+00:00",
"2023-08-28T10:30:00+00:00",
},
{
"with eval_offset + default evalDelay",
&Group{
EvalOffset: &offset,
Interval: time.Hour,
},
"2023-08-28T11:41:00+00:00",
"2023-08-28T11:30:00+00:00",
},
{
"1h interval with eval_delay",
&Group{
EvalDelay: &evalDelay,
Interval: time.Hour,
},
"2023-08-28T11:41:00+00:00",
"2023-08-28T11:00:00+00:00",
},
{
"1m interval with eval_delay",
&Group{
EvalDelay: &evalDelay,
Interval: time.Minute,
},
"2023-08-28T11:41:13+00:00",
"2023-08-28T11:40:00+00:00",
},
{
"disable alignment with eval_delay",
&Group{
EvalDelay: &evalDelay,
Interval: time.Hour,
evalAlignment: &disableAlign,
},
"2023-08-28T11:41:00+00:00",
"2023-08-28T11:40:00+00:00",
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
originT, _ := time.Parse(time.RFC3339, tc.originTS)
expT, _ := time.Parse(time.RFC3339, tc.expTS)
gotTS := tc.g.adjustReqTimestamp(originT)
if !gotTS.Equal(expT) {
t.Fatalf("get wrong prometheus request timestamp, expect %s, got %s", expT, gotTS)
}
})
}
}
func TestRangeIterator(t *testing.T) {
testCases := []struct {
ri rangeIterator
result [][2]time.Time
}{
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 5 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 45 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
step: time.Second,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
var j int
for tc.ri.next() {
if len(tc.result) < j+1 {
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
j, tc.ri.s, tc.ri.e)
}
s, e := tc.ri.s, tc.ri.e
expS, expE := tc.result[j][0], tc.result[j][1]
if s != expS {
t.Fatalf("expected to get start=%v; got %v", expS, s)
}
if e != expE {
t.Fatalf("expected to get end=%v; got %v", expE, e)
}
j++
}
})
}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View File

@@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@@ -49,7 +49,8 @@ func (rr *RecordingRule) ID() uint64 {
return rr.RuleID
}
func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
// NewRecordingRule creates a new RecordingRule
func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
rr := &RecordingRule{
Type: group.Type,
RuleID: cfg.ID,
@@ -66,17 +67,22 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
}),
}
entrySize := *ruleUpdateEntriesLimit
if cfg.UpdateEntriesLimit != nil {
rr.state = newRuleState(*cfg.UpdateEntriesLimit)
} else {
rr.state = newRuleState(*ruleUpdateEntriesLimit)
entrySize = *cfg.UpdateEntriesLimit
}
if entrySize < 1 {
entrySize = 1
}
rr.state = &ruleState{
entries: make([]StateEntry, entrySize),
}
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
labels := fmt.Sprintf(`recording=%q, group=%q, file=%q, id="%d"`, rr.Name, group.Name, group.File, rr.ID())
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
func() float64 {
e := rr.state.getLast()
if e.err == nil {
if e.Err == nil {
return 0
}
return 1
@@ -84,21 +90,21 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
e := rr.state.getLast()
return float64(e.samples)
return float64(e.Samples)
})
return rr
}
// Close unregisters rule metrics
func (rr *RecordingRule) Close() {
// close unregisters rule metrics
func (rr *RecordingRule) close() {
rr.metrics.errors.Unregister()
rr.metrics.samples.Unregister()
}
// ExecRange executes recording rule on the given time range similarly to Exec.
// execRange executes recording rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
func (rr *RecordingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
if err != nil {
return nil, err
@@ -117,17 +123,17 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
return tss, nil
}
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
// exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
res, req, err := rr.q.Query(ctx, rr.Expr, ts)
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(res.Data),
seriesFetched: res.SeriesFetched,
curl: requestToCurl(req),
curState := StateEntry{
Time: start,
At: ts,
Duration: time.Since(start),
Samples: len(res.Data),
SeriesFetched: res.SeriesFetched,
Curl: requestToCurl(req),
}
defer func() {
@@ -135,15 +141,15 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
}()
if err != nil {
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
return nil, curState.Err
}
qMetrics := res.Data
numSeries := len(qMetrics)
if limit > 0 && numSeries > limit {
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
return nil, curState.err
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
return nil, curState.Err
}
duplicates := make(map[string]struct{}, len(qMetrics))
@@ -152,8 +158,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
ts := rr.toTimeSeries(r)
key := stringifyLabels(ts)
if _, ok := duplicates[key]; ok {
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
return nil, curState.err
curState.Err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
return nil, curState.Err
}
duplicates[key] = struct{}{}
tss = append(tss, ts)
@@ -193,8 +199,8 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
return newTimeSeries(m.Values, m.Timestamps, labels)
}
// UpdateWith copies all significant fields.
func (rr *RecordingRule) UpdateWith(r Rule) error {
// updateWith copies all significant fields.
func (rr *RecordingRule) updateWith(r Rule) error {
nr, ok := r.(*RecordingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
@@ -204,32 +210,3 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
rr.q = nr.q
return nil
}
// ToAPI returns Rule's representation in form
// of APIRule
func (rr *RecordingRule) ToAPI() APIRule {
lastState := rr.state.getLast()
r := APIRule{
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: rr.state.size(),
Updates: rr.state.getAll(),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
}
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
return r
}

View File

@@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@@ -56,10 +56,12 @@ func TestRecordingRule_Exec(t *testing.T) {
Name: "job:foo",
Labels: map[string]string{
"source": "test",
}},
},
},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
@@ -76,11 +78,11 @@ func TestRecordingRule_Exec(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
fq := &datasource.FakeQuerier{}
fq.Add(tc.metrics...)
tc.rule.q = fq
tc.rule.state = newRuleState(10)
tss, err := tc.rule.Exec(context.TODO(), time.Now(), 0)
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
tss, err := tc.rule.exec(context.TODO(), time.Now(), 0)
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
@@ -141,7 +143,8 @@ func TestRecordingRule_ExecRange(t *testing.T) {
}},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
@@ -158,10 +161,10 @@ func TestRecordingRule_ExecRange(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
fq := &datasource.FakeQuerier{}
fq.Add(tc.metrics...)
tc.rule.q = fq
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
tss, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
@@ -198,15 +201,15 @@ func TestRecordingRuleLimit(t *testing.T) {
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
}
rule := &RecordingRule{Name: "job:foo", state: newRuleState(10), Labels: map[string]string{
rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{
"source": "test_limit",
}}
var err error
for _, testCase := range testCases {
fq := &fakeQuerier{}
fq.add(testMetrics...)
fq := &datasource.FakeQuerier{}
fq.Add(testMetrics...)
rule.q = fq
_, err = rule.Exec(context.TODO(), timestamp, testCase.limit)
_, err = rule.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err)
}
@@ -215,18 +218,17 @@ func TestRecordingRuleLimit(t *testing.T) {
func TestRecordingRule_ExecNegative(t *testing.T) {
rr := &RecordingRule{
Name: "job:foo",
state: newRuleState(10),
Name: "job:foo",
Labels: map[string]string{
"job": "test",
},
state: &ruleState{entries: make([]StateEntry, 10)},
}
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
fq.SetErr(errors.New(expErr))
rr.q = fq
_, err := rr.Exec(context.TODO(), time.Now(), 0)
_, err := rr.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
@@ -234,14 +236,14 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
}
fq.reset()
fq.Reset()
// add metrics which differs only by `job` label
// which will be overridden by rule
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
_, err = rr.Exec(context.TODO(), time.Now(), 0)
_, err = rr.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}

174
app/vmalert/rule/rule.go Normal file
View File

@@ -0,0 +1,174 @@
package rule
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// exec executes the rule with given context at the given timestamp and limit.
// returns an err if number of resulting time series exceeds the limit.
exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
// execRange executes the rule on the given time range.
execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// updateWith performs modification of current Rule
// with fields of the given Rule.
updateWith(Rule) error
// close performs the shutdown procedures for rule
// such as metrics unregister
close()
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
type ruleState struct {
sync.RWMutex
entries []StateEntry
cur int
}
// StateEntry stores rule's execution states
type StateEntry struct {
// stores last moment of time rule.Exec was called
Time time.Time
// stores the timesteamp with which rule.Exec was called
At time.Time
// stores the duration of the last rule.Exec call
Duration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health ruleState
Err error
// stores the number of samples returned during
// the last evaluation
Samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If seriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
SeriesFetched *int
// stores the curl command reflecting the HTTP request used during rule.Exec
Curl string
}
// GetLastEntry returns latest stateEntry of rule
func GetLastEntry(r Rule) StateEntry {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.getLast()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.getLast()
}
return StateEntry{}
}
// GetRuleStateSize returns size of rule stateEntry
func GetRuleStateSize(r Rule) int {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.size()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.size()
}
return 0
}
// GetAllRuleState returns rule entire stateEntries
func GetAllRuleState(r Rule) []StateEntry {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.getAll()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.getAll()
}
return []StateEntry{}
}
func (s *ruleState) size() int {
s.RLock()
defer s.RUnlock()
return len(s.entries)
}
func (s *ruleState) getLast() StateEntry {
s.RLock()
defer s.RUnlock()
if len(s.entries) == 0 {
return StateEntry{}
}
return s.entries[s.cur]
}
func (s *ruleState) getAll() []StateEntry {
entries := make([]StateEntry, 0)
s.RLock()
defer s.RUnlock()
cur := s.cur
for {
e := s.entries[cur]
if !e.Time.IsZero() || !e.At.IsZero() {
entries = append(entries, e)
}
cur--
if cur < 0 {
cur = cap(s.entries) - 1
}
if cur == s.cur {
return entries
}
}
}
func (s *ruleState) add(e StateEntry) {
s.Lock()
defer s.Unlock()
s.cur++
if s.cur > cap(s.entries)-1 {
s.cur = 0
}
s.entries[s.cur] = e
}
func replayRule(r Rule, start, end time.Time, rw remotewrite.RWClient, replayRuleRetryAttempts int) (int, error) {
var err error
var tss []prompbmarshal.TimeSeries
for i := 0; i < replayRuleRetryAttempts; i++ {
tss, err = r.execRange(context.Background(), start, end)
if err == nil {
break
}
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, r, err)
time.Sleep(time.Second)
}
if err != nil { // means all attempts failed
return 0, err
}
if len(tss) < 1 {
return 0, nil
}
var n int
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
return n, fmt.Errorf("remote write failure: %w", err)
}
n += len(ts.Samples)
}
return n, nil
}

Some files were not shown because too many files have changed in this diff Show More