Compare commits

...

386 Commits

Author SHA1 Message Date
Max Golionko
7226242070 CI: disable docker scan, enable auto release to sandbox (#4476)
* disable docker scan

* disable nightly, enable auto release to sandbox

* remove whitespace

(cherry picked from commit d4099a75be)
2023-06-30 13:47:23 +02:00
f41gh7
db84353809 docs/CHANGELOG.md: v1.91.3 release 2023-06-30 12:26:08 +02:00
Alexander Marshalov
fd6f50f883 show backup progress percentage in vmbackup log during backup uploading and restoring progress percentage in vmrestore log during backup downloading (#4460) (#4530)
Signed-off-by: Alexander Marshalov <_@marshalov.org>

(cherry picked from commit 1cc06e39cd)
2023-06-30 12:23:25 +02:00
Haleygo
8948af219b vmalert: add vmalert_remotewrite_sent_duration_seconds_total metric (#4517)
add `vmalert_remotewrite_sent_duration_seconds_total` metric

(cherry picked from commit a97887a2d9)
2023-06-30 12:23:07 +02:00
Roman Khavronenko
21a6584367 vmalert: make linter happy (#4509)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit 37c9a631ca)
2023-06-30 12:22:53 +02:00
Roman Khavronenko
facba87494 vmalert: update retry policy for pushing data to -remoteWrite.url (#4504)
By default, vmalert will make multiple retry attempts with exponential delay.
The total time spent during retry attempts shouldn't exceed `-remoteWrite.retryMaxTime` (default is 30s).
When retry time is exceeded vmalert drops the data dedicated for `-remoteWrite.url`.
Before, vmalert dropped data after 5 retry attempts with 1s delay between attempts (not configurable).

See `-remoteWrite.retryMinInterval` and `-remoteWrite.retryMaxTime` cmd-line flags.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>

(cherry picked from commit 5f9ad22884)
2023-06-30 12:22:50 +02:00
Roman Khavronenko
a11badbbfd vmalert: properly interrupt remotewrite retries on shutdown (#4505)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit 4aad7a43df)
2023-06-30 12:22:43 +02:00
Roman Khavronenko
79c779b24b vmalert: retry all errors except 4XX status codes (#4461)
vmalert: retry all errors except 4XX status codes

Retry all errors except 4XX status codes while pushing via remote-write
to the remote storage. Previously, errors like broken connection could
prevent vmalert from retrying the request.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit 79a5499cb2)
2023-06-30 12:21:36 +02:00
Yury Molodov
0c85e5f351 vmui: memory leak fix (#4455)
* fix: optimize the preparation of data for the graph

* fix: optimize tooltip rendering

* fix: optimize re-rendering of the chart

* vmui: memory leak fix

(cherry picked from commit 66b42a6772)
2023-06-30 12:21:33 +02:00
Nikolay
b250c3d7fb lib/storage: creates parts.json on start-up if it not exists. (#4450)
* lib/storage: creates parts.json on start-up if it not exists.
It fixes migrations from versions below v1.90.0.
Previously parts.json was created only after successful merge.
But if merge was interruped for some reason (OOM or shutdown), parts.json wasn't created and partitions left after interruped merge weren't properly deleted.
Since VM cannot check if it must be removed or not.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4336

* Apply suggestions from code review

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

* Update lib/storage/partition.go

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

---------

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
(cherry picked from commit 5eb5df96e2)
2023-06-30 12:21:06 +02:00
Dmytro Kozlov
ac95755a7a vmctl: finish retries if context canceled (#4442)
vmctl: interrupt backoff retries if import context is cancelled

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

(cherry picked from commit ddb3ae0f00)
2023-06-30 12:20:53 +02:00
Alexander Marshalov
5381e97cdd fixed service name detection for consulagent service discovery in case of a difference in service name and service id (#4390) (#4439)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
(cherry picked from commit 40d12be607)
2023-06-30 12:20:42 +02:00
Roman Khavronenko
02fa89aed4 all: update Go builder from Go1.20.4 to Go1.20.5 (#4427)
See https://github.com/golang/go/issues?q=milestone%3AGo1.20.5+label%3ACherryPickApproved

Signed-off-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit 476c7bdd6f)
2023-06-30 12:20:36 +02:00
Roman Khavronenko
a7d24463ef lib/promscrape/discoveryutils: properly check for net.ErrClosed (#4426)
This error may be wrapped in another error, and should normally be tested using
`errors.Is(err, net.ErrClosed)`.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit dfe53a36fc)
2023-06-30 12:20:30 +02:00
Zakhar Bessarab
ea333465e7 doc: changelog followup for #4420 fix (#4421)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
(cherry picked from commit 9a490d0b5c)
2023-06-30 12:20:25 +02:00
Zakhar Bessarab
d0efb06250 app/vmagent/remotewrite: fix vmagent panic on shutdown (#4407)
app/vmagent/remotewrite: fix vmagent panic on shutdown

Currently, when vmagent is stopping it first flushes pending series in remote write context and proceeds to stop streaming aggregation. This leads to streaming aggregation being unable to write results into pending timeseries (since it is already nil) and panic.
This can lead to losing some aggregation results being lost almost silently.

The fix is reordering flow to first stop streaming aggregation and flush all pending time series after that.

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

(cherry picked from commit ce7141383d)
2023-06-30 12:20:18 +02:00
Roman Khavronenko
de94812088 vmalert: fix nil map assignment (#4392)
* vmalert: fix nil map assignment

The storage instance with nil map params was created for remote-read purposes.
And before change 7a9ae9de0d this map was ignored in ApplyParams.
Now, it started to be used and vmalert panics in runtime.

The fix properly inits map for at `NewVMStorage` and verifies it is not nil
on assignment in `ApplyParams`.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: add to changelog

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: properly clone Storage params

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: properly clone Storage params

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: properly clone Storage params

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-06-02 11:38:55 +02:00
Roman Khavronenko
2c664a6d12 deployment/docker: update VictoriaMetrics version from v1.91.0 to v1.91.1 in docker compose files (#4387)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-06-02 07:55:55 +02:00
Roman Khavronenko
b771152039 docs/CHANGELOG.md: cut v1.91.1 (#4386)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-06-02 07:50:24 +02:00
Nikolay
2c876227e4 docs/changlelog: mention 6c84b61 (#4384) 2023-06-01 13:45:12 +02:00
Roman Khavronenko
eccecdf177 app/vmalert: follow-up after 7a9ae9de0d (#4381)
7a9ae9de0d

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-06-01 11:38:48 +02:00
Roman Khavronenko
4b5faf7efb docs: mention fix for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4221 (#4382)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4221

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-06-01 11:30:42 +02:00
Dmytro Kozlov
9843ec0e1d app/vmui: fix behavior when changing url in global settings (#4332)
* app/vmui: fix behavior when changing url in global settings

* app/vmctl: minor fix

* app/vmui: fix behavior when changing url in global settings
2023-06-01 12:19:03 +03:00
Nikolay
a0bf8f233f docs: mention recent changes at changelog (#4379) 2023-06-01 10:57:32 +02:00
Roman Khavronenko
8185c2466c docs: clarify deduplication docs (#4371)
The purpose of the change is too highlight what HA pair is
and how deduplication needs identical labels to be present
in raw samples.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4367

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-06-01 10:28:21 +02:00
gsakun
20dc3db71e app/vmalert: fix datasource.roundDigits Parameter (#4341)
app/vmalert: fix querybuild clone and extraParams merge logic

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4340
2023-06-01 09:44:11 +02:00
Denys Holius
0bca7d49b9 docs/Quick-Start.md: adds missed command to 'Starting VM-Cluster via Docker' section (#4375) 2023-05-31 16:39:38 +02:00
Nikolay
f263031fe9 app/vmauth: properly handle LOCAL proxy protocol command (#4373)
app/vmauth: properly handle LOCAL proxy protocol command

It is required for handling health checks from load balancers

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3335
2023-05-31 15:37:59 +02:00
Roman Khavronenko
a0c040ea58 docs: mention disk space requirements for downsampling (#4369)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-31 11:08:18 +02:00
Roman Khavronenko
1b24f4729b vmalert: mention default value for external.url flag (#4365)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-30 12:33:17 +02:00
Artem Navoiev
a8c58fc145 docs: fix enterprise page title to match with official name
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-29 22:39:56 +02:00
Artem Navoiev
5b0e8c797c docs: fix markdown headers ordering
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-29 22:35:05 +02:00
Artem Navoiev
83c1944184 docs: fix markdown for title
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-29 22:22:25 +02:00
Artem Navoiev
6967c3cb95 fix image path
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-29 07:11:15 -07:00
Artem Navoiev
8a4c89ea22 docs: change images from markdown tag to html for migration
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-29 07:11:15 -07:00
Roman Khavronenko
51cea6cad4 vmalert: properly form assets address if httpPrefix set (#4351)
Properly form path to static assets in WEB UI
 if `http.pathPrefix` set.

 https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4349

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-29 07:38:13 +02:00
Roman Khavronenko
1c3f50f791 docs: mention multi-tenancy in docs (#4357)
The update should make understanding of multi-tenancy more clear
for influxdb users.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-29 07:37:23 +02:00
Artem Navoiev
bf4711ecba desribe old link param
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-26 01:23:04 -07:00
Artem Navoiev
8f09569cb8 return information about cluster
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-26 01:23:04 -07:00
Artem Navoiev
f791811b15 update docs-sync Makefile command, add hugo front-matter
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-26 01:23:04 -07:00
Haleygo
b3d0ff463a vmagent:support follow_redirects on SD level (#4286)
* vmagent:support follow_redirects on SD level

* fix follow_redirects on sd level

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4282
2023-05-26 09:39:45 +02:00
Nikolay
228ea03bda app/vmselect/graphite: fixes tests for arm (#4348)
at arm based CPUs only 9 digits after comma matches for tests.
Especially at holtWinters functions. Since it only takes effect at tests
it makes no sense for changing float prescision at actual functions
2023-05-26 09:34:15 +02:00
Artem Navoiev
434358b551 remove absolute links, fix aliase
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-25 14:35:21 -07:00
Artem Navoiev
c1ad48edf9 fix title in multi-regional-setup-dedicated-regions.md
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-25 23:14:29 +02:00
Dmytro Kozlov
4060f3f261 app/vmctl: fix tests (#4345) 2023-05-25 17:04:04 +02:00
Roman Khavronenko
66ed6fe62f vmalert: do not return nil rules for /api/v1/rules (#4344)
The fix addresses a case when vmalert is configured with a group
which has `name`, but doesn't have `rules` configured. In this
case it still returns a `nil` instead of `[]` slice.

Fixing this via current commit.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4221

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-25 16:56:54 +02:00
Roman Khavronenko
76f7e66d8e vmalert: fix the typo in popup (#4331)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-19 12:45:15 +02:00
Aliaksandr Valialkin
1f2f74e70e lib/promrelabel: use monospace font at textarea for writing relabel configs on /metric-relabel-debug and /target-relabel-debug pages
This simplifies visual inspection of indentation in yaml configs
2023-05-18 20:48:41 -07:00
Aliaksandr Valialkin
2b53ff774b app/vmselect: log locations of sendPrometheusError() calls
Previously the location inside the sendPrometheusError() was logged.
This could make hard investigating error locations via `vm_log_messages_total` metric.
2023-05-18 20:39:53 -07:00
Aliaksandr Valialkin
388ffec262 docs/CHANGELOG.md: document v1.79.13 LTS release 2023-05-18 19:56:28 -07:00
Aliaksandr Valialkin
9e21315315 docs/CHANGELOG.md: document v1.87.6 LTS release 2023-05-18 17:50:12 -07:00
Aliaksandr Valialkin
9fa052c302 deployment/docker: update VictoriaMetrics version from v1.90.0 to v1.91.0 in docker compose files 2023-05-18 15:34:21 -07:00
Aliaksandr Valialkin
f0ed9ab4f2 docs/Release-Guide.md: fix a link to sandbox environment
The link became broken after 5f9d3f9cb5
2023-05-18 15:30:07 -07:00
Aliaksandr Valialkin
73dfb030dd deployment/docker/Makefile: use alpine 3.17.3 instead of alpine 3.18.0 for certs image, since alpine 3.18.0 doesnt work for cross-platform builds 2023-05-18 14:10:29 -07:00
Aliaksandr Valialkin
52b5498165 docs/CHANGELOG.md: cut v1.91.0 2023-05-18 12:37:12 -07:00
Aliaksandr Valialkin
b6dda0fefe vendor: make vendor-update 2023-05-18 12:22:09 -07:00
Aliaksandr Valialkin
d9b3a92348 app/vmselect/vmui: run make vmui-update after 39c1b0f8d1 2023-05-18 12:15:12 -07:00
Aliaksandr Valialkin
1f28b46ae9 lib/storage: revert the migration from global to per-day index for (MetricName -> TSID)
This reverts the following commits:
- e0e16a2d36
- 2ce02a7fe6

The reason for revert: the updated logic breaks assumptions made
when fixing https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2698 .
For example, if a time series stop receiving new samples during the first
day after the indexdb rotation, there are chances that the time series
won't be registered in the new indexdb. This is OK until the next indexdb
rotation, since the time series is registered in the previous indexdb,
so it can be found during queries. But the time series will become invisible
for search after the next indexdb rotation, while its data is still there.

There is also incompletely solved issue with the increased CPU and disk IO resource
usage just after the indexdb rotation. There was an attempt to fix it, but it didn't fix
it in full, while introducing the issue mentioned above. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1401

TODO: to find out the solution, which simultaneously solves the following issues:
- increased memory usage for setups high churn rate and long retention (e.g. what the reverted commit does)
- increased CPU and disk IO usage during indexdb rotation ( https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1401 )
- https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2698

Possible solution - to create the new indexdb in one hour before the indexdb rotation
and to gradually pre-populate it with the needed index data during the last hour before indexdb rotation.
Then the new indexdb will contain all the needed data just after the rotation,
so it won't trigger increased CPU and disk IO.
2023-05-18 11:30:49 -07:00
Aliaksandr Valialkin
4f7f750850 lib/handshake: do not pollute logs with cannot read hello messages on TCP health checks
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1762
2023-05-18 10:41:34 -07:00
Aliaksandr Valialkin
0645688f32 app/vmauth: allow -auth.config without users section of unauthorized_user section is present here
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4083
2023-05-18 10:41:34 -07:00
Dmytro Kozlov
7cbda6796c app/vmctl: set default value for --vm-native-step-interval flag (#4327)
* app/vmctl: set default value for `--vm-native-step-interval` flag

* app/vmctl: update CHANGELOG.md

* app/vmctl: update CHANGELOG.md, fix docs

* app/vmctl: fix typo

* app/vmctl: fix typo
2023-05-18 13:43:35 +02:00
Haleygo
1531d757ea fix lint check 2023-05-17 13:51:36 +02:00
Denys Holius
c605d64a95 deployment/docker/Makefile: updated docker compose commands regarding migration from V1 to V2 (#4314)
deployment/docker/Makefile: updated docker compose commands regarding migration from V1 to V2
2023-05-17 13:14:24 +02:00
Aliaksandr Valialkin
63b1cab454 app/vmauth: simplify the code after 4a1d29126c
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4242
2023-05-17 00:37:05 -07:00
Nikolay
4a1d29126c app/vmauth: retry common network dial errors (#4280)
with tracking request body read calls
it allows us to retry POST and PUT requests
2023-05-17 00:19:33 -07:00
Nikolay
16df18ec14 app/vmauth: do not return invalid credentials (#4288)
at http response by default
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4188

based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4190
Thanks @raj-kumar-j  for init implementation
2023-05-17 00:09:47 -07:00
Aliaksandr Valialkin
e0e16a2d36 lib/storage: follow-up after 2ce02a7fe6
- Document the change at docs/CHANGELOG.md
- Clarify comments for non-trivial code touched by the commit
- Improve the logic behind maybeCreateIndexes():
  - Correctly create per-day indexes if the indexdb rotation is performed during
    the first hour or the last hour of the day by UTC.
    Previously there was a possibility of missing index entries on that day.
  - Increase the duration for creating new indexes in the current indexdb for up to 22 hours
    after indexdb rotation. This should reduce the increased resource usage
    after indexdb rotation.
    It is safe to postpone index creation for the current day until the last hour
    of the current day after indexdb rotation by UTC, since the corresponding (date, ...)
    entries exist in the previous indexdb.
- Search for TSID by (date, MetricName) in both the current and the previous indexdb.
  Previously the search was performed only in the current indexdb. This could lead
  to excess creation of per-day indexes for the current day just after indexdb rotation.
- Search for (date, metricID) entries in both the current and the previous indexdb.
  Previously the search was performed only in the current indexdb. This could lead
  to excess creation of per-day indexes for the current day just after indexdb rotation.
2023-05-16 23:19:27 -07:00
Roman Khavronenko
2ce02a7fe6 lib/storage: introduce per-day MetricName=>TSID index (#4252)
The new index substitutes global MetricName=>TSID index
used for locating TSIDs on ingestion path.
For installations with high ingestion and churn rate, global
MetricName=>TSID index can grow enormously making
index lookups too expensive. This also results into bigger
than expected cache growth for indexdb blocks.

New per-day index supposed to be much smaller and more efficient.
This should improve ingestion speed and reliability during
re-routings in cluster.

The negative outcome could be occupied disk size, since
per-day index is more expensive comparing to global index.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-16 15:46:42 -07:00
Artem Navoiev
39ba4fc1c4 update logo width in cluster doc to 300
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-16 15:33:15 -07:00
Aliaksandr Valialkin
278278af95 lib/storage: reduce the unimportant logging during Storage start / stop
This should improve the visibility of potentially important logs
2023-05-16 15:14:21 -07:00
Aliaksandr Valialkin
d330c7e6fc lib/mergeset: remove superflouos logging when opening and closing the Table
The logged messages had little useful info, while they were polluting log output during VictoriaMetrics start/stop
2023-05-16 15:01:25 -07:00
Aliaksandr Valialkin
3cbc0975f6 lib/mergeset: close and open the table before making snapshots at TestTableCreateSnapshotAt()
This gives guarantees that all the in-memory data is written to disk at the snapshot time.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4272
See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4316
2023-05-16 14:55:11 -07:00
Aliaksandr Valialkin
09b403d38a lib/{mergeset,storage}: make it clear that DebugFlush() doesn't store all the recently ingested data to disk
DebugFlush() makes sure that the recently ingested data becomes visible to search.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4272
2023-05-16 11:50:17 -07:00
Aliaksandr Valialkin
664db964ca vendor: update github.com/VictoriaMetrics/metrics from v1.23.1 to v1.24.0
This change adds process_* metrics to VictoriaMetrics components under Windows OS

See https://github.com/VictoriaMetrics/metrics/pull/47
2023-05-16 11:37:07 -07:00
Aliaksandr Valialkin
60bdbdaa70 docs/vmbackupmanager.md: run make docs-sync after c7d8dda39225b716ea44df7223db5e4a125d407b 2023-05-16 11:27:42 -07:00
Alexander Marshalov
3b2dc2b098 backup metadata are written in separate file (#560)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-05-16 11:24:54 -07:00
Alexander Marshalov
7b15834cbe added backup locking/unlocking against retention policy to vmbackupmanager (#558)
* added backup locking/unlocking against retention policy to vmbackupmanager

Signed-off-by: Alexander Marshalov <_@marshalov.org>

* added docs for new commands

Signed-off-by: Alexander Marshalov <_@marshalov.org>

* fix review comments

Signed-off-by: Alexander Marshalov <_@marshalov.org>

---------

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-05-16 11:23:36 -07:00
Roman Khavronenko
f68d93cca2 vmalert: follow-up after 669becd011 (#4318)
* vmalert: follow-up after 669becd011

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: follow-up after 669becd011

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: follow-up after 669becd011

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-16 18:51:38 +02:00
Zakhar Bessarab
242050ba94 lib/storage: follow-up after a50d63c376 (#4289)
* lib/storage: follow-up after a50d63c376

- ensure retentionMsecs is rounded to day
- remove localTimeOffset in test as localOffset is ignored when using `UnixMilli`

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/storage: restore retention timezone offset effect on retention deadline

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-05-16 17:14:08 +02:00
Michael Hoffmann
3a65f4a733 vmalert: improve retry logic for remote write (#4134)
vmalert should not retry on 4xx status codes
according to https://prometheus.io/docs/concepts/remote_write_spec/
2023-05-16 16:30:03 +02:00
Yury Molodov
39c1b0f8d1 vmui: refactor code using custom hooks (#4145)
* refactor: replace boolean useState with useBoolean

* refactor: replace useResize with useWindowSize/useElementSize

* refactor: replace addEventListener with useEventListener

* refactor: replace navigator.clipboard.writeText with useCopyToClipboard

* fix: prevent redirect loop
2023-05-16 16:41:06 +03:00
Roman Khavronenko
686bf6c5ad vmctl: update VictoriaMetrics migration section (#4310)
Remove unnecessary information to simplify the description and tips.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-15 16:15:01 +02:00
Artem Navoiev
478258bc4d fix link in operator quick start docs.2
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-14 21:20:57 +02:00
Artem Navoiev
c19fc52676 fix link in operator quick start docs
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-14 19:54:08 +02:00
Aliaksandr Valialkin
1c47acda11 lib/promutils: add ParseTimeAt() function 2023-05-13 20:12:31 -07:00
Aliaksandr Valialkin
2613110f75 deployment/docker: update base docker image from 3.17.3 to 3.18.0
See https://www.alpinelinux.org/posts/Alpine-3.18.0-released.html
2023-05-12 17:31:21 -07:00
Aliaksandr Valialkin
616175b1ce lib/promutils: properly return error when incorrect Prometheus label names are passed to NewLabelsFromString()
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4284
See also https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4304
2023-05-12 16:52:29 -07:00
Aliaksandr Valialkin
318a87c36f Revert "lib/promrelabel: show error message if labels not in prometheus exposition format (#4304)"
This reverts commit 193a9c3328.

Reason for revert: the commit doesn't fix the real issue with promutils.NewLabelsFromString()
function, which must return error when improperly formatted Prometheus metric with labels is passed to it.
See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-format-example

E.g. the promutils.NewLabelsFromString() must return error when the following strings are passed to it:

- `{foo:"bar"}`, since `:` is disallowed in Prometheus text exposition format. The corect value is `{foo="bar"}`
- `{"foo":"bar"}`, since label name shouldn't be quoted. The correct value is `{foo="bar"}`.

The reverted commit introduces another set of bugs, which happily accept the following invalid input:

- `{foo=~"bar"}`
- `{foo!="bar"}`
- `{foo!~"bar"}`

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4284
See also https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4304
2023-05-12 16:07:37 -07:00
Aliaksandr Valialkin
160453b86c lib/protoparser/csvimport: properly parse the last empty column in CSV line
Do not ignore the last empty column in CSV line.
While at it, properly parse CSV columns in single quotes, e.g. `'foo,bar',baz` is parsed as two columns - `foo,bar` and `baz`

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4048

See also https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4298
2023-05-12 15:51:41 -07:00
Aliaksandr Valialkin
b7fe7b801c Revert "lib/protoparser: fix skip csv line when metric can be collect from the line (#4298)"
This reverts commit 410ae99c2e.

Reason for revert: the commit masks the real issue instead of fixing it.
The real issue is that the scanner.NextColumn() skips the last column if it is empty.

The commit also introduces two bugs:

- a panic if all the metric values in CSV line are empty
- silent import of CSV lines with too small number of columns

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4048
See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4298
2023-05-12 15:22:27 -07:00
Yury Molodov
f0fad01e8a vmui: add notification for non-matching queries (#4301)
vmui: add notification for non-matching queries (#4211)

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4211
2023-05-12 13:55:47 +02:00
Roman Khavronenko
5a7159ab2e docs: mention link to public relabeling playground (#4306)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-12 10:58:22 +02:00
Roman Khavronenko
ad2d079ba5 docs: update docs about VMUI pages (#4305)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-12 10:51:44 +02:00
Dmytro Kozlov
193a9c3328 lib/promrelabel: show error message if labels not in prometheus exposition format (#4304)
lib/promrelabel: show error message if labels not in prometheus exposition format

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4284
2023-05-12 10:42:56 +02:00
Dmytro Kozlov
410ae99c2e lib/protoparser: fix skip csv line when metric can be collect from the line (#4298)
* lib/protoparser: fix skip csv line when metric can be collect from the line

* lib/protoparser: fix comment
2023-05-12 11:04:16 +03:00
Yury Molodov
1e4a9a8dfe vmui: enhancements to top queries page (#4299)
* feat: improvement of the top queries page

* vmui/docs: enhancements to top queries page

* Apply suggestions from code review

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-05-11 13:47:32 -07:00
Alexander Marshalov
9855b38da2 fixed error with double slash in vmbackupmanager (#557)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-05-11 13:38:07 -07:00
Aliaksandr Valialkin
5d22c36904 docs/CHANGELOG.md: improve the description of the change at 7ea2531db0
Move the change description to the group of vmui changes.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4213
2023-05-11 13:30:33 -07:00
Aliaksandr Valialkin
73812c71a5 lib/promutils: properly parse time strings with timezones at ParseTime() 2023-05-11 13:24:00 -07:00
Roman Khavronenko
adc5635a07 vmalert: add hints to filter buttons (#4296)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-11 16:38:08 +02:00
Denys Holius
bb7a295146 app/vmctl/vm_native.go: fixed a typo in error message 2023-05-11 16:36:13 +02:00
Yury Molodov
a55d3f6882 vmui: increase font-size and fix the text display (#4273)
vmui: change default font size to 14px for better readability
vmui: fix bug with missing text on buttons in safari

---------

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
2023-05-11 14:50:09 +02:00
Dmytro Kozlov
7ea2531db0 app/vmui: added table where Labels with the highest number of unique values show (#4271)
* app/vmui: added Labels with the highest number of unique values

* app/vmui: cleanup

* app/vmui: cleanup

* app/vmui: add table description

* app/vmui: fix comment, updated CHANGELOG.md

* app/vmui: disable links

* app/vmui: added actions to the table, it will show values for selected label with the highest number of series

* app/vmui: fix comment
2023-05-11 15:19:36 +03:00
Aliaksandr Valialkin
da037cafc5 lib/bytesutil: go fmt after 2ec17bed2c 2023-05-10 20:29:03 -07:00
Aliaksandr Valialkin
86424e079e docs/CHANGELOG.md: fix typo after 2caf0b05c6 2023-05-10 13:03:20 -07:00
Aliaksandr Valialkin
2ec17bed2c lib/bytesutil: add benchmarks for ToUnsafeString() and ToUnsafeBytes() 2023-05-10 12:59:26 -07:00
Artem Navoiev
0ccb7f51dd fix typo in changelog
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-10 19:23:24 +02:00
Roman Khavronenko
2caf0b05c6 vmalert: correctly update seriesFetched metric for const exprs (#4287)
Previously, metric `vmalert_alerting_rules_last_evaluation_series_fetched`
would be set to 0 for const expressions, because const expression do not match
any series. This may result into a confusion: no series were matched but response isn't empty.
The change updates the logic behind metric: if no series were matched but there are samples
in response - use amount of samples as number of series.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-10 15:04:05 +02:00
Roman Khavronenko
51196739af Vmalert UI updates (#4276)
* vmalert: expand rule groups on anchor click

before, anchor click was only updating the URL.
To expand the group, user had to click on rule's block.
Now, group will toggle automatically.

* vmalert: allow filtering group in web UI

The new filter allows to filter groups and rules within
groups by: errors only or noMatch only.

The filtering supposed to help navigating big numbers of groups/rules.
Filtering is reflected in URL, so can be shared as a link.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-10 14:38:13 +02:00
Aliaksandr Valialkin
c8799a5d97 vendor: update github.com/valyala/gozstd from v1.19.1 to v1.20.1 2023-05-10 02:16:08 -07:00
Dmytro Kozlov
060a0cdbf4 docs: Add information about datasource plugin (#4266)
docs: Add information about datasource plugin
2023-05-10 10:26:57 +02:00
Alexander Marshalov
2e494e2375 fixed typos in documentation and commandline flags descriptions (#4275) 2023-05-10 09:50:41 +02:00
Aliaksandr Valialkin
9eb1abdefe vendor: make vendor-update 2023-05-09 23:13:50 -07:00
Aliaksandr Valialkin
4e0345a5ef docs/CHANGELOG.md: add a link to docs about never-firing alerts
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039
2023-05-09 21:58:36 -07:00
Aliaksandr Valialkin
78b23c9a83 docs/CHANGELOG.md: document 8f4de6fa47 2023-05-09 21:39:25 -07:00
Roman Khavronenko
491831df49 vminsert: properly reset labels object on aggregation (#4278)
Without reset, labels duplicates could have been added during stream aggregation.
Since `ctx.Labels` is reused during processing of many series, each series will
add its labels to the context. Even if the same labels were already addeded on prev
iteration. Now, we reset `ctx.Labels` on each iteration to contain so labels from
different series didn't interfere.

This could have cause exceeding of the limit on number of labels per pushed time series.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4277

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-09 08:33:58 -07:00
Aliaksandr Valialkin
b9bb64ce55 lib/promscrape/discovery/consulagent: substitute metaPrefix with the __meta_consulagent_ plaintext string
This simplifies future code navigation and search for the specific meta-label starting from __meta_consulagent_* prefix.
For example, `grep __meta_consulagent_namespace` finds the exact place where this label is defined.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3953
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4217
2023-05-08 23:40:13 -07:00
Aliaksandr Valialkin
7db647e924 lib/fs: move common code outside arch-specific implementations of mustRemoveDirAtomic()
This is a follow-up for 73b6c23271
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-05-08 23:10:20 -07:00
Aliaksandr Valialkin
83a1c2484c docs/CHANGELOG.md: group changelog lines for tip release according to VictoriaMetrics apps 2023-05-08 22:57:23 -07:00
Aliaksandr Valialkin
11eb94d3bc docs/CHANGELOG.md: document baf456978d
See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4223
2023-05-08 22:26:04 -07:00
Aliaksandr Valialkin
ff3c90d305 docs/managed-victoriametrics/overview.md: typo fix after 51fbf58d89 2023-05-08 21:58:40 -07:00
Aliaksandr Valialkin
fc42617ecd docs/CHANGELOG.md: refer to the author and the pull request of the notifier_headers feature at vmalert
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3260
2023-05-08 17:18:48 -07:00
Aliaksandr Valialkin
887555669e Revert "lib/streamaggr: discard samples with timestamps outside of aggregation interval (#4199)"
This reverts commit 9e99f2f5b3.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4068

Reason for revert: this breaks valid use cases:

- If timestamps aren't specified in the incoming samples on purpose. For example, if stream aggregation is used
  as StatsD replacement. StatsD protocol has no timestamp concept for incoming samples.
  See https://github.com/b/statsd_spec

- If all the samples must be aggregated, even if they contain stale timestamps.
  for example, if the stream aggregation produces some counter of some events,
  it may be better to count all the events even if they were delayed before
  being ingested into VictoriaMetrics.

Is is also unclear how to determine whether the sample becomes stale.
For example, if the aggregation interval equals to 1h, and the previous
aggregation cycle just finished 10 minutes ago, what to do with the newly
incoming sample with the timestamp 30 minutes older than the current time?
The answer highly depends on the context, so it is unsafe to uncoditionally
use a single logic for dropping the old samples here.
2023-05-08 16:52:27 -07:00
Aliaksandr Valialkin
5a02bc56fb docs/CHANGELOG.md: document 03150c8973
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4204
2023-05-08 16:30:28 -07:00
Aliaksandr Valialkin
3c0470f91e docs/vmalert.md: clarify docs regarding the support of recursive globs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4041
2023-05-08 16:21:44 -07:00
Aliaksandr Valialkin
74155afb71 docs: clarify docs after 5ee344824f
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4183
2023-05-08 16:11:44 -07:00
Aliaksandr Valialkin
185894fc5a app/vmagent/remotewrite: make more user-friendly the warning message about too small -remoteWrite.maxdiskUsagePerURL value
This is a follow-up for bc17f4828c .
While at it, document the change at docs/CHANGELOG.md .

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4195
2023-05-08 15:42:30 -07:00
Aliaksandr Valialkin
39530903bf docs/managed-victoriametrics: consistently use setup-notifications prefix for images used at docs/managed-victoriametrics/setup-notifications.md
This should simplify managment of the images belonging to the given docs.
See docs/assets/README.md for details

This is a follow-up for 4052c44ac1
2023-05-08 15:29:53 -07:00
Aliaksandr Valialkin
d906e83e5e app/vmauth: merge default_url example into multi-url example in order to reduce the amounts of text to read for the user
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4084

This is a follow-up for 041e188df8
2023-05-08 15:12:23 -07:00
Aliaksandr Valialkin
ec3943d14a app/vmselect: small cleanup after 4f3f9950d0
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3807
2023-05-08 14:57:11 -07:00
Aliaksandr Valialkin
1db9b78b88 app/vmselect: small cleanup after 68e31a6000
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3811
2023-05-08 14:34:37 -07:00
Aliaksandr Valialkin
80946f06c2 app/{vmselect,vmctl}: move ParseTime() to lib/promutils
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4091

This is a follow-up for e2053baf32
2023-05-08 14:17:57 -07:00
Aliaksandr Valialkin
6601fa3e9d docs/CHANGELOG.md: typo fix after 45a551df9c: 'this doc' -> 'this feature request' 2023-05-08 13:41:58 -07:00
Aliaksandr Valialkin
92a549bccb app/vmauth/README.md: mention about ip filters and concurrency limiter at Security chapter 2023-05-08 13:35:58 -07:00
Aliaksandr Valialkin
23595465b8 app/vmauth: refer ip_filters option in example auth config
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3491
2023-05-08 13:29:18 -07:00
Aliaksandr Valialkin
8f43f496d7 docs: document IP filters functionality in vmauth
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3491

This is a follow-up for 2f08ed3be2
2023-05-08 12:12:16 -07:00
Aliaksandr Valialkin
09268d41ed app/vmauth: remove duplicate mentioning of -auth.config value in error message in logs on usuccessful load of -auth.config
This is a follow-up for 25759082f4
2023-05-08 10:16:16 -07:00
Aliaksandr Valialkin
1b288e0a05 all: update Go builder from Go1.20.3 to Go1.20.4
See https://github.com/golang/go/issues?q=milestone%3AGo1.20.4+label%3ACherryPickApproved
2023-05-08 09:40:55 -07:00
Aliaksandr Valialkin
24eeb8d9af docs/CHANGELOG.md: document c77385e78f 2023-05-08 08:50:30 -07:00
Alexander Marshalov
8225a48b56 fixed vm_promscrape_config_last_reload_successful metric value recovery after successful reloading with unchanged content (#4260) (#4268)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-05-08 13:32:51 +02:00
Roman Khavronenko
fa3a17938e vmalert: follow-up after cae87da (#4269)
* vmalert: follow-up after cae87da

cae87da4bb
Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: update struct comments

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: rm typo

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-08 13:31:54 +02:00
Alexander Marshalov
c0cb3e4f98 update generated docs for operator (#4267)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-05-08 12:39:44 +02:00
Haleygo
cae87da4bb vmalert: support reading rule from http url (#4212)
vmalert: support reading rule's config from HTTP URL
2023-05-08 09:52:57 +02:00
Roman Khavronenko
5b8450fc1b app/vmalert: detect alerting rules which don't match any series at all (#4198)
app/vmalert: detect alerting rules which don't match any series at all

vmalert starts to understand /query responses which contain object:
```
"stats":{"seriesFetched": "42"}
```
If object is present, vmalert parses it and populates a new field
`SeriesFetched`. This field is then used to populate the new metric
`vmalert_alerting_rules_last_evaluation_series_fetched` and to
display warnings in the vmalert's UI.

If response doesn't contain the new object (Prometheus or
VictoriaMetrics earlier than v1.90), then `SeriesFetched=nil`.
In this case, UI will contain no additional warnings.
And `vmalert_alerting_rules_last_evaluation_series_fetched` will
be set to `-1`. Negative value of the metric will help to compile
correct alerting rule in follow-up.

Thanks for the initial implementation to @Haleygo
See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4056

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-08 09:36:39 +02:00
Zakhar Bessarab
d71a2605d1 deployment/docker: allow overriding docker namespace (#4265)
It makes it easier for users who build and self-host images to publish their images without changing tags manually.

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-05-08 10:34:05 +04:00
Roman Khavronenko
01520d3e5d alerts: update TooHighMemoryUsage threshold (#4256)
It appears that 90% usage for anonymous mem usage
is already concerning. So we lowering the threshold to 80%.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-07 22:18:56 +02:00
Nikolay
8f4de6fa47 lib/storage: properly update link for entry at dateMetricID cache (#4258)
previously during sync for mutable and immutable cache parts, link for hotEntry with current date may be not properly updated
it corrupts cache for backfilling metrics and increased cpu load
2023-05-05 21:45:47 -07:00
Zakhar Bessarab
4e71003620 lib/promscrape/discovery/kubernetes: follow-up for d5e94721db (#4255)
- add changelog reference to an author
- fix tests
- add metadata to match Prometheus behavior

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-05-05 14:41:17 +02:00
Vasilchenko Anton
22e65402af Add endpoint labels for pod targets discovered form endpoint but has different ports (#4253)
Signed-off-by: Vasilchenko Anton <vasilchenko-as@yandex.ru>
2023-05-05 15:46:07 +04:00
Zakhar Bessarab
aca256735c lib/storage: fix indexdb rotation infinite loop (#4249)
When using `retentionTimezoneOffset` and having local timezone being more than 4 hours different from UTC indexdb retention calculation could return negative value. This caused indexdb rotation to get in loop.
Fix calculation of offset to use `retentionTimezoneOffset` value properly and add test to cover all legit timezone configs.
See:
- https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4207
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4206

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-05-04 17:16:48 +02:00
Alexander Marshalov
56b84140a9 added new consulagent service discovery (#3953) (#4217) 2023-05-04 11:36:21 +02:00
Alexander Marshalov
2eb27ddb22 max value for memory.allowedPercent changed from 200 to 100 (#4171) (#4251)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-05-04 11:34:57 +02:00
Zakhar Bessarab
ddcc3b1e9d docs: changelog follow-up for 49b77ec01a (#4250)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-05-03 16:42:22 +04:00
justcompile
49b77ec01a squash commits (#4166) 2023-05-03 10:51:08 +02:00
Nikolay
4786f036de lib/backup: fixes path generation for windows (#4133)
replaces custom fsync function with standard Fsync methods for files.
fixes pattern matching for parts and properly generate backup path for local fs.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-05-03 10:48:53 +02:00
Nikolay
73b6c23271 lib/fs: do not panic at windows at dir deletion (#4132)
Windows doesn't allow to remove dir with opened files. Usually it's a case for snapshots, hard cannot be removed if file is openned.
With this change, dir will be renamed and properly deleted at the next process start.
It's recommended to restart vmstorage/vmsingle for snapshots deletion completion periodically.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-05-03 10:47:02 +02:00
Roman Khavronenko
baf456978d vmselect: exit early from queue on context cancel (#4223)
* vmselect: exit early from queue on context cancel

When `-search.maxConcurrentRequests` is reached, vmselect puts
request in the queue. It is expected, that requests in the queue
will be processed as soon as it would be enough capacity to do so.

However, it could happen that while request was waiting its turn,
the client could have already cancel it (close the connection,
or just close the tab with UI). In this case, we should de-queue
such requests to avoid spending extra resources on them.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* app/vmselect: address review comments

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-05-03 10:42:17 +02:00
Zakhar Bessarab
bf3b6732bd lib/promscrape/discovery/kubernetes: add common labels to all ports discovered from endpoints (#4235)
* lib/promscrape/discovery/kubernetes: add common labels to all ports discovered from endpoints

Sets
`__meta_kubernetes_endpoints_name` and `__meta_kubernetes_namespace` labels to all ports of pod.
Prometheus sets those labels to all ports in pod (0ab9553611/discovery/kubernetes/endpoints.go (L267C15-L269)) even if port is not matching any service.

See: #4154

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/promscrape/discovery/kubernetes: fix test for updated discovery logic

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-05-03 02:17:33 +02:00
Artem Navoiev
4ddfc67d54 remove information of releasing graphite render api from tip section as we released it in 1.90
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-02 15:30:22 +02:00
Max Golionko
f3b829125e add vmsingle filter for health alerts (#4238) 2023-05-02 20:54:42 +08:00
Artem Navoiev
5dde81259c prepare operator docs to migration
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-01 12:32:58 +02:00
Artem Navoiev
51fbf58d89 update managed docs - prepare for migration
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-05-01 12:19:57 +02:00
Roman Khavronenko
3d955d1078 docs: note automatic conversion to ms for influx protocol (#4224)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-04-30 17:13:35 +04:00
Artem Navoiev
a5fb8b93a8 add wight do trobubleshooting docs
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-04-30 12:33:36 +02:00
Artem Navoiev
ee1da35071 prepare static docs to migration
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-04-30 12:33:36 +02:00
Roman Khavronenko
3383f12a4b vmalert: fix API to return non-nil values (#4222)
Properly return empty slices instead of nil for `/api/v1/rules` and `/api/v1/alerts` API handlers.
This improves compatibility with Grafana.

 https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4221

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-04-28 10:08:29 +02:00
Roman Khavronenko
eb746a4dab Revert "http server: limit max concurrent requests (#4185)" (#4215)
This reverts commit 77f76371

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-04-27 13:02:47 +02:00
Roman Khavronenko
29e059e49c app/vmalert: follow-up after 6c322b4a00 (#4214)
6c322b4a00

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-04-27 13:02:21 +02:00
Haleygo
6c322b4a00 vmalert: allow configuring custom notifier headers per group (#4088)
vmalert: allow configuring custom notifier headers per group
2023-04-27 12:17:26 +02:00
Zakhar Bessarab
9e99f2f5b3 lib/streamaggr: discard samples with timestamps outside of aggregation interval (#4199)
* lib/streamaggr: discard samples with timestamps not matching aggregation interval

Samples with timestamps lower than `now - aggregation_interval` are likely to be written via backfilling and should not be used for calculation of aggregation.
See #4068

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* lib/streamaggr: make log message more descriptive, fix imports

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-04-27 11:59:49 +02:00
Haleygo
03150c8973 lib/opentsdbhttp: fix a typo preventing from using writeconcurrencylimiter (#4208) 2023-04-27 09:22:42 +02:00
Zakhar Bessarab
b21a55febf app/vmalert: add support of recursive path globs for rules and templates (#4148)
Supports using `**` for `-rule` and `-rule.templates`: `dir/**/*.tpl` loads contents of dir and all subdirectories recursively.

See: #4041

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Artem Navoiev <tenmozes@gmail.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
2023-04-26 19:20:22 +02:00
Nikolay
5ee344824f lib/promscrape: adds filter for consul_sd_configs: (#4184)
* lib/promscrape: adds filter for consul_sd_configs:
it allows advanced filtering for consul service discovery requests
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4183

* typo fix

* removes deprecation mentions since it's not relevant

* Update docs/CHANGELOG.md

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

---------

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
2023-04-26 19:16:27 +02:00
Max Golionko
5c955dd876 alerts: relax job filter to support job names created by VMOperator (#4203) 2023-04-26 15:32:25 +02:00
Zakhar Bessarab
89a1c941c2 app/vmalert: return an error when using query function in -external.alert.source flag (#4191)
Templating of `-external.alert.source` is not expected to have access to the query which was causing runtime error when query function was passed as nil.
See: #4181

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-04-26 15:31:14 +02:00
7840vz
43ededf3a1 alerts: decrease severity to info for RecordingRulesNoData (#4089) 2023-04-26 12:57:41 +02:00
Dmytro Kozlov
bc17f4828c app/vmagent,lib/persistentqueue: show warning message if --remoteWrite.maxDiskUsagePerURL flag lower than 500MB (#4196)
* app/vmagent,lib/persistentqueue: show warning message if `--remoteWrite.maxDiskUsagePerURL` flag lower than 500MB

* app/vmagent,lib/persistentqueue: linter fix

* app/vmagent,lib/persistentqueue: fix comment
2023-04-26 13:23:01 +03:00
dmitryk-dk
368571be00 docs: fix type 2023-04-26 11:50:17 +02:00
dmitryk-dk
0660cc7128 app/vmctl: fix comments 2023-04-26 11:50:17 +02:00
dmitryk-dk
cafad95790 app/vmctl: fix comments 2023-04-26 11:50:17 +02:00
dmitryk-dk
2b34c01f6d docs/managed-victoriametrics: change emails 2023-04-26 11:50:17 +02:00
dmitryk-dk
4052c44ac1 docs/managed-victoriametrics: add notifications setup 2023-04-26 11:50:17 +02:00
Alexander Marshalov
041e188df8 added default_url field in vmauth users config (#4084) (#4156)
* added default url field in vmauth users config (#4084)

---------

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-04-26 11:04:35 +02:00
Yury Molodov
4f3f9950d0 vmui: add metric relabel debug (#3889)
* feat: add metric relabel debug (#3807)

* fix: add link to relabeling cookbook

* lib/promrelabel: merge, fix conflicts

* lib/promrelabel: fix diff

* docs/vmui: add metric relabel playground

---------

Co-authored-by: dmitryk-dk <kozlovdmitriyy@gmail.com>
2023-04-26 11:53:29 +03:00
Alexander Marshalov
45a551df9c changelog for issue #4083 (#4197)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-04-26 10:50:44 +02:00
Yury Molodov
cf567badcf vmui: display heatmap in the Explore Metrics (#4124)
* feat: display heatmap in the explore metrics (#4111)

* fix: correct calc step for heatmap

* fix: remove spaces in the result of getDurationFromMilliseconds
2023-04-25 13:14:57 +03:00
Yury Molodov
a80b0aebe8 vmui: add a comparison of data to the Cardinality Explorer (#4123)
* feat: add button "show today" to date picker

* feat: add comparison with the prev day (#3967)

* vmui/docs: add comparison of data to cardinality page
2023-04-25 12:21:57 +03:00
Yury Molodov
752895d1ee docs/vmui: fix CHANGELOG.md about WITH templates (#4194) 2023-04-25 12:05:14 +03:00
Yury Molodov
68e31a6000 vmui: Integrate WITH template playground (#3831)
* feat: add WithTemplate page

* app/vmselect/prometheus: enable json mode for expand with expr API

* app/vmselect/prometheus: enable CORS and add content type

* feat: add api for expand with templates

* fix: remove console from useExpandWithExprs

* app/vmselect/prometheus: fix escaping

* vmui:  integrate WITH template

* app/vmctl: check content type instead of form param

* fix: add content-type for fetch with-exprs

* fix: add a header to the server's response that allows the "Content-Type" header

* app/vmctl: added comment and cleanup

* app/vmctl: use format query param

---------

Co-authored-by: dmitryk-dk <kozlovdmitriyy@gmail.com>
2023-04-25 11:40:01 +03:00
Dmytro Kozlov
e2053baf32 app/vmctl: add support for the different time format in the native binary protocol (#4189)
* app/vmctl: add support for the different time format in the native binary protocol

* app/vmctl: update flag description, update CHANGELOG.md

* app/vmctl: add comment to exported function
2023-04-24 18:33:30 +02:00
Alexander Marshalov
73e22dcf81 added unauthorized_user field in vmauth users config (#4083) (#4157)
added `unauthorized_user` field in vmauth users config (#4083)

---------

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-04-24 14:57:13 +02:00
Roman Khavronenko
77f76371d0 http server: limit max concurrent requests (#4185)
* lib/httpserver: introduce `-http.maxConcurrentRequests` command-line flag

Introduce `-http.maxConcurrentRequests` command-line flag to protect
VM components from resource exhaustion during unexpected spikes of HTTP requests.
By default, the new flag's value is set to 0 which means no limits are applied.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* lib/httpserver: mention http.maxConcurrentRequests in docs

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-04-24 14:52:06 +02:00
Alexander Marshalov
31e174977e doc improvements (#4172) (#4186)
- added info about metric `vm_vminsert_metrics_read_total`,
- small doc refactoring
- and added make-command for running docs in docker.

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-04-24 13:38:00 +02:00
Yury Molodov
05ab34f2c8 vmui: fix freeze when query regular with heatmap query (#4093)
* fix: fix freeze when query regular with heatmap query

* vmui/docs: fix freeze when query regular with heatmap query
2023-04-21 11:59:09 +03:00
Yury Molodov
3140aa34de vmui: fix bug where tenant list was not displayed (#4162)
* fix: modify the condition for querying tenants

* fix: change getTenantIdFromUrl output to string
2023-04-21 11:56:08 +03:00
Alexander Marshalov
25759082f4 vmauth ip filters (refactoring) (#4059)
Added ip filters (allow_list and deny_list) for enterprise-version of vmauth (#3491)

---------

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-04-20 19:08:27 +02:00
Artem Navoiev
87b925afb2 move note about opensource of graphite in v1.90 release note
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-04-19 16:34:00 +02:00
Roman Khavronenko
de61a73c63 vmalert: retry datasource requests with EOF or unexpected EOF errors (#4146)
* vmalert: retry datasource requests with EOF or unexpected EOF errors

Retry failed read request on the closed connection one more time.
This may improve rules execution reliability when connection
between vmalert and datasource closes unexpectedly.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: fix old tests

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-04-19 10:18:32 +02:00
Zakhar Bessarab
472fe3fd03 lib/httpserver: add handler to serve /robots.txt and deny search indexing (#4143)
This handler will instruct search engines that indexing is not allowed for the content exposed to the internet. This should help to address issues like #4128 when instances are exposed to the internet without authentication.
2023-04-18 16:47:26 +04:00
Balamurugan Krishnamoorthy (Bala)
06aefcec81 Removed duplicate third-party article reference (#4142)
"How do We Keep Metrics for a Long Time in VictoriaMetrics" article is referenced twice in "Third-party articles and slides about VictoriaMetrics" section
2023-04-18 14:03:06 +04:00
Artem Navoiev
413701454c add graphite render api opensource to changelog
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-04-18 11:09:02 +02:00
Aliaksandr Valialkin
2a4c48c59d lib/{mergeset,storage}: make mustReadPartNames() code more clear 2023-04-14 23:16:59 -07:00
Aliaksandr Valialkin
52006149b2 lib/storage: replace OpenStorage() with MustOpenStorage()
Callers of OpenStorage() log the returned error and exit.
The error logging and exit can be performed inside MustOpenStorage()
alongside with printing the stack trace for better debuggability.
This simplifies the code at caller side.
2023-04-14 23:02:40 -07:00
Aliaksandr Valialkin
2a2036160d lib/storage: fix a bug, which prevents from reading pre-v1.90.0 parts
The bug has been introduced in c0b852d50d
2023-04-14 22:33:08 -07:00
Aliaksandr Valialkin
3727251910 lib/fs: add MustReadDir() function
Use fs.MustReadDir() instead of os.ReadDir() across the code in order to reduce the code verbosity.
The fs.MustReadDir() logs the error with the directory name and the call stack on error
before exit. This information should be enough for debugging the cause of the error.
2023-04-14 22:10:46 -07:00
Aliaksandr Valialkin
60d92894c5 lib/storage: validate rows in partition.AddRows() only during tests 2023-04-14 20:52:36 -07:00
Aliaksandr Valialkin
df619bdff0 all: consistently use fs.MustClose() for closing lock files 2023-04-14 20:14:21 -07:00
Aliaksandr Valialkin
2a3b19e1d2 lib/fs: convert CreateFlockFile to MustCreateFlockFile
Callers of CreateFlockFile log the returned err and exit.
It is better to log the error inside the MustCreateFlockFile together with the path
to the specified directory and the call stack. This simplifies
the code at the callers' side while leaving the debuggability at the same level.
2023-04-14 19:50:01 -07:00
Aliaksandr Valialkin
c0b852d50d lib/{storage,mergeset}: convert InitFromFilePart to MustInitFromFilePart
Callers of InitFromFilePart log the error and exit.
It is better to log the error with the path to the part and the call stack
directly inside the MustInitFromFilePart() function.
This simplifies the code at callers' side while leaving the same level of debuggability.
2023-04-14 15:46:12 -07:00
Aliaksandr Valialkin
9183a439c7 lib/filestream: change Create() to MustCreate()
Callers of this function log the returned error and exit.
It is better logging the error together with the path to the filename
and call stack directly inside the function. This simplifies
the code at callers' side without reducing the level of debuggability
2023-04-14 15:12:48 -07:00
Aliaksandr Valialkin
5eb163a08a lib/filestream: transform Open() -> MustOpen()
Callers of this function log the returned error and exit.
Let's log the error with the path to the filename and call stack
inside the function. This simplifies the code at callers' side
without reducing the level of debuggability.
2023-04-14 15:03:42 -07:00
Aliaksandr Valialkin
fda1a54343 lib/fs: improve error logging at ReaderAt.MustReadAt()
- Add 'BUG:' prefix to error messages related to programming errors aka bugs.
- Consistently log the path to the file in all the messages in order to improve debuggability.
2023-04-14 14:51:06 -07:00
Aliaksandr Valialkin
f341b7b3f8 lib/fs: substitute ReadFullData with MustReadData
Callers of ReadFullData() log the error and then exit.
So let's log the error with the path to the filename and the call stack
inside MustReadData(). This simplifies the code at callers' side,
while leaving the debuggability at the same level.
2023-04-14 14:39:29 -07:00
Aliaksandr Valialkin
bd6de6406a lib/fs: improve error logging inside MustWriteData
Log the path to file on errors inside MustWriteData().
This improves debuggability of errors, which may occur inside MustWriteData().
2023-04-14 14:32:45 -07:00
Aliaksandr Valialkin
4b43c91f8c vendor: update github.com/VictoriaMetrics/metricsql from v0.56.1 to v0.56.2
This fixes panic when the duration in the query contains `M` suffix.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4120
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3589
2023-04-14 14:05:56 -07:00
Aliaksandr Valialkin
e0595af2bf lib/{mergeset,storage}: remove isInMerge flag from parts only when they werent removed yet from the list of active parts
This prevents from possible panic during access to pw.p when it is set to nil at partWrapper.decRef() called inside swapSrcWithDstParts()
2023-04-14 00:08:11 -07:00
Aliaksandr Valialkin
88a4b9c313 docs/vmctl.md: run make docs-sync after 2a5b9ff782 2023-04-13 23:54:41 -07:00
Aliaksandr Valialkin
5e87e03409 docs/CHANGELOG.md: move the bugfix description into the correct place
This is a follow-up for 2a5b9ff782

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4092
2023-04-13 23:49:53 -07:00
Aliaksandr Valialkin
9f8209d593 docs/CHANGELOG.md: run at least 4 background mergers on systems with less than 4 CPU cores
This reduces the probability of sudden spike in the number of small parts when all the background mergers
are busy with big merges.
2023-04-13 23:43:17 -07:00
Aliaksandr Valialkin
550d5c7ea4 lib/{mergeset,storage}: make sure that getFlushToDiskDeadline() takes into account only in-memory parts 2023-04-13 23:43:17 -07:00
Dmytro Kozlov
2a5b9ff782 app/vmctl: fix performance degradation, add flag to disable backoff policy (#4097)
* app/vmctl: change api for getting metric names

* app/vmctl: fix tests

* app/vmctl: add flag to enable backoff policy, fix test, performance improvements

* app/vmctl: use one http client

* app/vmctl: made linter happy

* app/vmctl: updated documentation and CHANGELOG.md

* app/vmctl: cleanup

* app/vmctl: rename flag

* app/vmctl: cleanup

* app/vmctl: fix comments

* app/vmctl: fix metrics parser problem, improve tests
2023-04-14 09:34:54 +03:00
Aliaksandr Valialkin
809fbaeaac lib/fs: add Must prefix to CopyDirectory and CopyFile functions
Callers of these functions log the returned error and then exit.
Let's log the error with the call stack inside the function itself.
This simplifies the code at callers' side, while leaving the same
level of debuggability in case of errors.
2023-04-13 23:02:59 -07:00
Aliaksandr Valialkin
780abc3b3b lib/fs: rename SymlinkRelative to MustSymlinkRelative
Callers of this function log the returned error and then exit.
Let's log the error with the call stack inside the function itself.
This simplifies the code at callers' side, while leaving the same
level of debuggability in case of errors.
2023-04-13 22:52:55 -07:00
Aliaksandr Valialkin
5f487ed996 lib/fs: rename HardLinkFiles to MustHardLinkFiles
Callers of this function log the returned error and then exit.
Let's log the error with the call stack inside the function itself.
This simplifies the code at callers' side, while leaving the same
level of debuggability in case of errors.
2023-04-13 22:48:07 -07:00
Aliaksandr Valialkin
30425ca81a lib/fs: rename WriteFileAtomically to MustWriteAtomic
Callers of this function log the returned error and exit.
So let's just log the error with the given filepath and the call stack
inside the function itself and then exit. This simplifies the code
at callers' place while leaves the same level of debuggability in case of errors.
2023-04-13 22:41:15 -07:00
Aliaksandr Valialkin
036a7b7365 lib/fs: replace MkdirAllIfNotExist->MustMkdirIfNotExist and MkdirAllFailIfExist->MustMkdirFailIfExist
Callers of these functions log the returned error and then exit. The returned error already contains the path
to directory, which was failed to be created. So let's just log the error together with the call stack
inside these functions. This leaves the debuggability of the returned error at the same level
while allows simplifying the code at callers' side.

While at it, properly use MustMkdirFailIfExist instead of MustMkdirIfNotExist inside inmemoryPart.MustStoreToDisk().
It is expected that the inmemoryPart.MustStoreToDick() must fail if there is already a directory under the given path.
2023-04-13 22:11:59 -07:00
Aliaksandr Valialkin
344209e5e6 lib/fs: rename MustWriteFileAndSync to MustWriteSync in order to improve readability a bit
This is a follow-up for 2a8395be05
2023-04-13 21:43:32 -07:00
Aliaksandr Valialkin
b15c5961ab lib/{mergeset,storage}: remove unused path field from blockStreamWriter
This is a follow-up after 42bba64aa7
2023-04-13 21:39:59 -07:00
Aliaksandr Valialkin
2a8395be05 lib/fs: replace WriteFileAndSync with MustWriteAndSync
When WriteFileAndSync fails, then the caller eventually logs the error message
and exits. The error message returned by WriteFileAndSync already contains the path
to the file, which couldn't be created. This information alongside the call stack
is enough for debugging the issue. So just use log.Panicf("FATAL: ...") inside MustWriteAndSync().
This simplifies error handling at caller side a bit.
2023-04-13 21:33:19 -07:00
Aliaksandr Valialkin
25f089de9d lib/{mergeset,storage}: properly fsync part directory listing after writing in-memory part to disk
This is a follow-up after 42bba64aa7

Previously the part directory listing was fsync'ed implicitly inside partHeader.WriteMetadata()
by calling fs.WriteFileAtomically(). Now it must be fsync'ed explicitly.

There is no need in fsync'ing the parent directory, since it is fsync'ed by the caller
when updating parts.json file.
2023-04-13 21:19:04 -07:00
Aliaksandr Valialkin
42bba64aa7 lib/{mergeset,storage}: explicitly fsync the created part directory listing
Previously the created part directory listing was fsynced implicitly
when storing metadata.json file in it.

Also remove superflouous fsync for part directory listing,
which was called at blockStreamWriter.MustClose().
After that the metadata.json file is created, so an additional fsync
for the directory contents is needed.
2023-04-13 21:03:08 -07:00
Aliaksandr Valialkin
e1211a1187 app/vmstorage: deprecate -bigMergeConcurrency command-line flag
Improperly configured -bigMergeConcurrency command-line flag usually leads to uncontrolled
growth of unmerged parts, which, in turn, increases CPU usage and query durations.

So it is better deprecating this flag. In rare cases -smallMergeConcurrency command-line flag
can be used instead for controlling the concurrency of background merges.
2023-04-13 20:40:24 -07:00
Aliaksandr Valialkin
ca54e58c1f lib/{fs,persistentqueue}: use filepath.Join() instead of concatenating path parts with /
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4014
2023-04-13 20:13:45 -07:00
Aliaksandr Valialkin
90b876cd1e app/vmbackupmanager: sync with enterprise-single-node branch after 41a54c775891c87e3d5ed59ff0769c869dd2fe71 2023-04-13 19:29:06 -07:00
Alexander Marshalov
47e16594dd Added extra information to docs about total output in stream aggregation. (#4130)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-04-13 16:08:07 +02:00
Denys Holius
174a70369c deleted unnecessary file (#4126) 2023-04-12 20:55:39 +04:00
Zakhar Bessarab
81f28f0f1f lib/backup/actions: store metadata(creation and completion time) in backup files (#4117)
This makes it easier to understand exact point in time which is included in this backup.

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-04-12 18:51:27 +02:00
Denys Holius
51466ce379 Fix/snap build (#4125)
* snap/snapcraft.yaml: bump go-channel to latest 1.20/stable

* snap/local/Makefile: fixed typo in output snap file
2023-04-12 18:50:56 +02:00
Aliaksandr Valialkin
b9ab07ced9 vendor: update github.com/valyala/gozstd from v1.19.0 to v1.19.1 2023-04-10 11:30:13 -07:00
Aliaksandr Valialkin
1fe87691ec app/vmbackupmanager/README.md: sync with docs/vmbackupmanager.md after 4b2cc1b32c 2023-04-10 10:51:49 -07:00
Aliaksandr Valialkin
7849545e78 docs/Single-server-VictoriaMetrics.md: document automatic switch from graph view to heatmap view for histogram buckets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3384
2023-04-10 10:49:44 -07:00
Aliaksandr Valialkin
ddb8883817 docs/Troubleshooting.md: mention that it is recommended to use default command-line flag values for VictoriaMetrics components 2023-04-10 10:48:25 -07:00
Haleygo
0ad6010c91 fix sort pendingDateMetricsIDs (#4102) 2023-04-10 10:23:12 -07:00
Aliaksandr Valialkin
b7cce552da vendor: make vendor-update 2023-04-10 10:20:54 -07:00
Aliaksandr Valialkin
0e1e0b036a deployment: update VictoriaMetrics from v1.89.1 to v1.90.0
See https://docs.victoriametrics.com/CHANGELOG.html#v1900
2023-04-06 19:08:11 -07:00
Aliaksandr Valialkin
9586dabd94 docs/guides: update VictoriaMetrics from v1.89.1 to v1.90.0 2023-04-06 19:07:38 -07:00
Aliaksandr Valialkin
05d44525b7 docs/CHANGELOG.md: formatting fix 2023-04-06 19:04:13 -07:00
Aliaksandr Valialkin
b5d18c0d28 app/vmctl/terminal: fix builds for GOOS=freebsd and GOOS=openbsd
This is a follow-up for 8da9502df6
2023-04-06 17:09:07 -07:00
Aliaksandr Valialkin
28975067c6 docs/CHANGELOG.md: cut v1.90.0 release 2023-04-06 16:16:42 -07:00
Aliaksandr Valialkin
a3eebf118e app/vmselect/vmui: run make vmui-update after 01fc228fb0 2023-04-06 15:07:41 -07:00
Dmytro Kozlov
244c18fa38 app/vmctl: add multiple filters defined in --vm-native-filter-match flag to discovered metric names (#4063)
* app/vmctl: add multiple filters defined in `--vm-native-filter-match` flag to discovered metric names

* app/vmctl: fix comments

* app/vmctl: move function buildMatchWithFilter to the correct place

* app/vmctl: update CHANGELOG.md

* app/vmctl: fix CI, remove error wrapping

* app/vmctl: fix CI, simplify `Set()`
2023-04-06 15:06:52 -07:00
Yury Molodov
01fc228fb0 vmui: heatmap fixes (#4086)
* fix: correct display of errors for query

* fix: change the logic of histogram detection

* feat: hide empty buckets from the graph

* fix: revert server url
2023-04-06 15:02:44 -07:00
Aliaksandr Valialkin
ee80e71d17 docs/CHANGELOG.md: document the bugfix, which remove unneeded logger.Errorf() call during stream aggregation with the enabled deduplication
This is a follow-up for ff72ca14b9
2023-04-06 15:00:42 -07:00
Aliaksandr Valialkin
4770377fb3 app/vmselect/vmui: run make vmui-update after a1601929ec 2023-04-06 03:20:13 -07:00
Aliaksandr Valialkin
44aad84a53 docs/CHANGELOG.md: document that VictoriaMetrics for Windows cannot delete snapshots
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70#issuecomment-1491529183
2023-04-06 03:16:06 -07:00
Aliaksandr Valialkin
608d87273d docs/CHANGELOG.md: document v1.79.12 2023-04-06 03:10:01 -07:00
Aliaksandr Valialkin
7a65329e65 docs/CHANGELOG.md: document v1.87.5 2023-04-06 00:44:07 -07:00
Timur Bakeyev
37a7627254 Fix cut-n-paste error (#4079)
It seems that VMServiceScrape description was c-n-p from vmselect one into all other resources.
2023-04-06 11:02:12 +04:00
Yury Molodov
a1601929ec fix: correct the description of shortcut keys (#4057) 2023-04-05 22:19:36 -07:00
Zakhar Bessarab
4b2cc1b32c docs: fix example operator spec for vmbackupmanager restore usage (#4074)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-04-05 22:16:39 -07:00
Yury Molodov
74eea53dee vmui: implement heatmap improvements (#4078)
* fix: disabled limits for histogram

* fix: add sorted buckets by upper bound

* refactor: move line chart components to folder

* feat: implement heatmap improvements (https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3384#issuecomment-1484023162)

* app/vmselect/vmui: `make vmui-update`

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-04-05 22:13:57 -07:00
Aliaksandr Valialkin
593c151831 lib/encoding: fix test after 4725549cb2 2023-04-05 21:38:37 -07:00
Aliaksandr Valialkin
4725549cb2 vendor: update github.com/klauspost/compress from v1.16.3 to v1.16.4
See https://github.com/klauspost/compress/releases/tag/v1.16.4
2023-04-05 21:25:35 -07:00
Aliaksandr Valialkin
29a692f278 vendor: update github.com/valyala/gozstd from v1.18.0 to v1.19.0 2023-04-05 20:53:30 -07:00
Timur Bakeyev
d87a700528 Fix reference to the imagepullsecrets description (#4080)
Looks like the original document has moved to the https://kubernetes.io/docs/concepts/containers/images/#referring-to-an-imagepullsecrets-on-a-pod.
Alternatively, it could be that https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ describes the meaning of the parameter in more detail.
2023-04-05 19:56:28 -07:00
Aliaksandr Valialkin
8249451dbb docs/Troubleshooting.md: add missing help word after c7bcf750c2d031b1259cd8115d7464b67f40cb9eg 2023-04-05 14:28:09 -07:00
Aliaksandr Valialkin
13668fc935 docs/Troubleshooting.md: another typo fixes after c7bcf750c2d031b1259cd8115d7464b67f40cb9eg 2023-04-05 14:14:41 -07:00
Aliaksandr Valialkin
052160dcdc docs/Troubleshooting.md: fix formatting after c7bcf750c2 2023-04-05 13:46:19 -07:00
Aliaksandr Valialkin
a265da4f53 docs/Troubleshooting.md: fix a typo in the link after c7bcf750c2 2023-04-05 13:40:24 -07:00
Aliaksandr Valialkin
5074cc672a all: update Go builder from Go1.20.2 to Go1.20.3
See https://github.com/golang/go/issues?q=milestone%3AGo1.20.3+label%3ACherryPickApproved
2023-04-05 13:37:22 -07:00
Aliaksandr Valialkin
c7bcf750c2 docs/Troubleshooting.md: add General troubleshooting checklist
This checklist helps searching for the infromation related to some issue / question
about VictoriaMetrics
2023-04-05 13:29:54 -07:00
Artem Navoiev
59102db4cf update changelog
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-04-05 15:45:38 +02:00
Zakhar Bessarab
8a6acce7d3 deployment/docker: update Grafana URLs to match latest format (#4060)
See: #4019

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-04-04 15:26:08 +04:00
Zakhar Bessarab
a8d1497024 app/vmalert: update Grafana URLs to match latest format (#4061)
See: #4019

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-04-04 15:25:29 +04:00
Artem Navoiev
33c6cc2530 fix closing divs in docs
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-04-04 13:00:22 +02:00
Aliaksandr Valialkin
19b189e9b7 lib/storage: use shorter code after 03bde173b7 2023-04-02 21:35:52 -07:00
faceair
38fc55976e lib/storage: fix reuse pendingMetricRow (#4049) 2023-04-02 21:35:50 -07:00
Aliaksandr Valialkin
55b5276b70 docs/CHANGELOG.md: document edb45d7fc1
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4013
2023-04-02 21:26:12 -07:00
faceair
f3af8331ec lib/storage: remove unused code (#4050) 2023-04-02 21:24:42 -07:00
Aliaksandr Valialkin
de0fe02f6e app/vmselect/vmui: run make vmui-update after edb45d7fc1 2023-04-02 21:21:51 -07:00
Yury Molodov
edb45d7fc1 feat: add accept/cancel buttons for settings (#4013) (#4052) 2023-04-02 21:20:10 -07:00
Aliaksandr Valialkin
f638496298 lib/promscrape: do not re-use previously loaded scrape targets on failed attempt to load updated scrape targets at file_sd_configs
The logic employed for re-using the previously loaded scrape target was broken initially.
The commit cc0427897c tried to fix it, but the new logic
became too complex and fragile. So it is better to just remove this logic,
since the targets from temporarily broken file should be eventually loaded on next
attempts every -promscrape.fileSDCheckInterval

This also allows removing fragile hacks around __vm_filepath label.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3989
2023-04-02 21:05:28 -07:00
Dmytro Kozlov
cc0427897c lib/promscrape: fix the problem with scrape work duplicates when file_sd_config can't be read (#4027)
* lib/promscrape: fix the problem with scrape work duplicates when file_sd_config can't be read

* lib/promscrape: clarified comment

* lib/promscrape: made better approach to handle a problem with growing []*ScrapeWork on each error when loading config

* lib/promscrape: added CHANGELOG.md

* Update docs/CHANGELOG.md

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-04-02 20:26:13 -07:00
Aliaksandr Valialkin
06b721dd07 app/vmselect/vmui: run make vmui-update after 42087518ba 2023-04-01 00:40:49 -07:00
Yury Molodov
42087518ba vmui: tips for working with the graph and legend (#4045)
* feat: add tips for working with the graph and legend

* feat: add the ability to collapse the legend

* vmui/docs: add the ability to collapse the legend

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-04-01 00:38:18 -07:00
Aliaksandr Valialkin
02b714c110 vendor: make vendor-update 2023-03-31 23:59:34 -07:00
Yury Molodov
dd200409d9 vmui: add a tip for JSON and Table tabs (#4000)
* feat: add a tip for JSON and Table tabs

* feat: add Hyperlink component

* fix: update Hyperlink

* fix: update link to instant query
2023-03-31 23:53:06 -07:00
Roman Khavronenko
27b958ba8b lib/storage: check for free disk space before opening tables (#4035)
* lib/storage: check for free disk space before opening tables

We check for free disk space before call to `openTable`,
so `Storage` can be set to ReadOnly before mergeWorkers start.

Before the change, there was a chance that merges will start
even if Storage has to start in ReadOnly mode because of
`-storage.minFreeDiskSpaceBytes` limit.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4023
Signed-off-by: hagen1778 <roman@victoriametrics.com>

* lib/storage: chore

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* Update lib/storage/storage.go

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-03-31 23:50:27 -07:00
Aliaksandr Valialkin
ffdf430be0 app/vmselect/graphite: open source Graphite Render API 2023-03-31 23:25:04 -07:00
Aliaksandr Valialkin
cddfc4d3f8 deployment/docker: update base Docker image from Alpine 3.17.2 to Alpine 3.17.3
This fixes security issues from https://alpinelinux.org/posts/Alpine-3.17.3-released.html

This is a follow-up for 59c350d0d2
2023-03-31 22:46:27 -07:00
Aliaksandr Valialkin
4d00107b92 lib/fs: follow-up for ec45f1bc5f
Properly close response body before checking for the response code.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4034
2023-03-31 22:42:10 -07:00
Aliaksandr Valialkin
d577657fb7 lib/streamaggr: follow-up for ff72ca14b9
- Make sure that the last successfully loaded config is used on hot-reload failure
- Properly cleanup resources occupied by already initialized aggregators
  when the current aggregator fails to be initialized
- Expose distinct vmagent_streamaggr_config_reload* metrics per each -remoteWrite.streamAggr.config
  This should simplify monitoring and debugging failed reloads
- Remove race condition at app/vminsert/common.MustStopStreamAggr when calling sa.MustStop() while sa
  could be in use at realoadSaConfig()
- Remove lib/streamaggr.aggregator.hasState global variable, since it may negatively impact scalability
  on system with big number of CPU cores at hasState.Store(true) call inside aggregator.Push().
- Remove fine-grained aggregator reload - reload all the aggregators on config change instead.
  This simplifies the code a bit. The fine-grained aggregator reload may be returned back
  if there will be demand from real users for it.
- Check -relabelConfig and -streamAggr.config files when single-node VictoriaMetrics runs with -dryRun flag
- Return back accidentally removed changelog for v1.87.4 at docs/CHANGELOG.md

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3639
2023-03-31 22:30:38 -07:00
Max Golionko
59c350d0d2 fix: app/vmui/Dockerfile-web to reduce vulnerabilities (#4044)
The following vulnerabilities are fixed with an upgrade:
- https://snyk.io/vuln/SNYK-ALPINE317-OPENSSL-3368755
- https://snyk.io/vuln/SNYK-ALPINE317-OPENSSL-3368755
- https://snyk.io/vuln/SNYK-ALPINE317-OPENSSL-5291795
- https://snyk.io/vuln/SNYK-ALPINE317-OPENSSL-5291795

Co-authored-by: snyk-bot <snyk-bot@snyk.io>
2023-03-31 16:29:44 +02:00
Zakhar Bessarab
5e5fc66e3b docs/vmctl: add examples of URLs used for migration in different modes (#4042)
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-03-30 17:21:36 +02:00
Roman Khavronenko
4a49577028 vmalert: use missingkey=zero for templating (#4040)
Replace empty labels with "" instead of "<no value>"
during templating, as Prometheus does.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4012

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-03-30 16:57:00 +04:00
Zakhar Bessarab
ec45f1bc5f lib/fs: verify response code when reading configuration over HTTP (#4036)
Verifying status code helps to avoid misleading errors caused by attempt to parse unsuccessful response.

Related issue: #4034

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-03-30 13:18:00 +02:00
Daria Karavaieva
0945a03843 Vmanomaly guide index fix (#4029)
* name and scrutture change

* fix indexing

* index fix

* name change

* line separator fix
2023-03-29 20:24:06 +02:00
Alexander Marshalov
ff72ca14b9 added hot reload support for stream aggregation configs (#3969) (#3970)
added hot reload support for stream aggregation configs (#3969)

Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-03-29 18:05:58 +02:00
Eliran Barnoy
9199c23720 Fix operator links to include VMPrometheusConverter for added visibility 2023-03-29 11:44:49 +02:00
Aliaksandr Valialkin
94cabf29b0 lib/flagutil: ArrayString: support commas inside quoted strings and inside [], {} and () braces
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3915
2023-03-28 21:22:55 -07:00
Aliaksandr Valialkin
e094c8e214 docs: mention that VictoriaMetrics rounds time range to UTC days at /api/v1/labels, /api/v1/label/.../values and /api/v1/series handlers
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3107
2023-03-28 17:00:17 -07:00
Aliaksandr Valialkin
7048a316aa lib/persistentqueue: typo fix after aea6df8197 2023-03-27 20:06:04 -07:00
Aliaksandr Valialkin
aea6df8197 app/vmagent/remotewrite: cosmetic updates after f3a51e8b1d
- Compare directory names instead of paths to directory when determining which persistent queues must be deleted
  This is less error-prone solution, since paths to the same directory can differ, which could lead
  to accidental directory removal for the existing -remoteWrite.url

- Log the `removed %d dangling queues` message when at least a single queue has been removed

- Consistently use filepath.Join() for creating paths to persistent queues.
  This is needed for Windows support (see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70 )

- Clarify the description of the change at docs/CHANGELOG.md

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4014
2023-03-27 18:33:07 -07:00
Zakhar Bessarab
f3a51e8b1d app/vmagent: add -remoteWrite.removeDanglingQueues flag (#4017)
* app/vmagent: add `-remoteWrite.removeDanglingQueues` flag which allows to automatically remove dangling persistent queue contents

Related issue: #4014

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* app/vmagent: address review feedback

- remove persistent queues files by default
- rename `remoteWrite.removeDanglingQueues` to `remoteWrite.keepDanglingQueues`
- update docs to reflect changed behaviour

Related issue: #4014

* Apply suggestions from code review

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-03-27 18:15:28 -07:00
Nikolay
9b1e002287 app/vmselect: properly remove temp files at windows system (#4020)
With non-posix compliant systems it's not possible to remove unclosed files.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-03-27 18:10:15 -07:00
Aliaksandr Valialkin
02ee4ffd4d app/vmselect/promql: follow-up for 79e1c6a6fc
- Document the fix at docs/CHANGELOG.md
- Add tests with multiple adjancent zero buckets
- Simplify the fix a bit

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/296
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4021
2023-03-27 18:03:36 -07:00
Ze'ev Klapow
79e1c6a6fc fix le buckets when adjacent vmrange is empty (#4021)
There is a bug here where if you have a single bucket like:

foo{vmrange="4.084e+02...4.642e+02"} 2 123

The expected output is three le encoded buckets like:

foo{le="4.084e+02"} 0 123
foo{le="4.642e+02"} 2 123
foo{le="+Inf"} 2 123

This correctly encodes the start and end of the vmrange.
If however, the input contains the previous bucket, and that bucket is
empty then you only get the end le and +Inf out currently, i.e:

foo{vmrange="7.743e+05...8.799e+05"} 5 123
foo{vmrange="6.813e+05...7.743e+05"} 0 123

results in:

foo{le="8.799e+05"} 5 123
foo{le="+Inf"} 5 123

This causes issues when you go to compute a quantile because this means
that the assumed lower bound of the buckets is 0 and this we interpolate
between 0->end rather than the vmrange start->end as expected.
2023-03-27 17:54:19 -07:00
Aliaksandr Valialkin
9e02b3d48a vendor: make vendor-update 2023-03-27 15:28:02 -07:00
Aliaksandr Valialkin
622000797a app/vmselect: follow-up for 10ab086366
- Expose stats.seriesFetched at `/api/v1/query_range` responses too
  for the sake of consistency.

- Initialize QueryStats when it is needed and pass it to EvalConfig then.
  This guarantees that the QueryStats is properly collected when the query
  contains some subqueries.
2023-03-27 15:22:00 -07:00
Roman Khavronenko
4021aa11b5 app/vmselect: export seriesFetched stat for /query responses (#3925)
The change adds a new field `seriesFetched` to EvalConfig object.
Since EvalConfig object can be copied inside `Exec`,
`seriesFetched` is a pointer which can be updated by all copied
objects.

The reason for having stats is that other components, like vmalert,
could benefit from this information.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-03-27 15:18:25 -07:00
Yury Molodov
3214b1c315 vmui: heatmap (#3780)
* fix: add stroke and font for all axes

* feat: add util for generate gradient

* feat: add heatmap plugin

* feat: add heatmap legend

* feat: add heatmap graph (#3384)

* vmui: add heatmap graph (#3384)

* feat: add convert Prometheus to VictoriaMetrics histogram

* fix: prevent re-render graph

* feat: reset step for heatmap

* feat: normalize heatmap data

* fix: format heatmap legend

* wip

* app/vmselect/vmui: run `make vmui-update`

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-03-26 00:30:02 -07:00
Aliaksandr Valialkin
92199c964c docs/MetricsQL.md: quote min, max and avg args for rollup_*() functions in order to reduce the level of confusion when users try to pass the second argument to these functions 2023-03-26 00:01:51 -07:00
Aliaksandr Valialkin
72a0b49330 docs/CHANGELOG.md: document v1.87.4 LTS release 2023-03-25 22:43:59 -07:00
Aliaksandr Valialkin
5832242b44 app/vmselect/netstorage: reduce the contention at fs.ReaderAt stats collection on systems with big number of CPU cores
This optimization is based on the profile provided at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3966#issuecomment-1483208419
2023-03-25 16:37:07 -07:00
Aliaksandr Valialkin
a1e496ced6 app/vmselect/netstorage: document why runtime.Gosched() is removed at 28f054bb00
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3966
2023-03-25 16:36:51 -07:00
Zakhar Bessarab
28f054bb00 vmselect/netstorage: remove direct calls to Gosched to reduce amount of locks for global scope
using `runtime.Gosched` requires acquiring global lock to check if there are any other goroutines to perform tasks. with the latest versions of runtime it can pause running goroutines automatically without requiring to call `Gosched` directly.

Updates #3966

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-03-25 16:34:03 -07:00
Aliaksandr Valialkin
811f4a9380 app/{vmbackup,vmrestore}: publish vmbackup and vmrestore binaries for Windows
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-03-25 15:08:21 -07:00
Aliaksandr Valialkin
c8f2febaa1 lib/storage: consistently use OS-independent separator in file paths
This is needed for Windows support, which uses `\` instead of `/` as file separator

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-03-25 14:33:58 -07:00
Aliaksandr Valialkin
36bbdd7d4b lib/mergeset: consistently use OS-independent separator in file paths
This is needed for Windows support, which uses `\` instead of `/` as file separator

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-03-25 13:39:41 -07:00
Aliaksandr Valialkin
b14d96618c all: follow-up after 34634ec357
- Use windows.FlushFileBuffers() instead of windows.Fsync() at streamTracker.adviseDontNeed()
  for consistency with implementations for other architectures.
- Use filepath.Base() instead of filepath.Split(), since the dir part isn't used.
  This simplifies the code a bit.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-03-25 11:57:39 -07:00
Nikolay
34634ec357 lib/fs: adds memory map for windows (#3988)
This is a follow-up for 43b24164ef

* lib/fs: adds memory map for windows
it should improve performance for file reading

* lib/storage: replace '/' with os specific separator
it must fix an errors for windows

* lib/fs: mention windows fsync support

* lib/filestream: adds fdatasync for windows writes

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-03-25 11:43:19 -07:00
Aliaksandr Valialkin
2b851e69d2 app/vmselect/promql: typo fix after e7f46a0aab 2023-03-24 23:46:30 -07:00
Aliaksandr Valialkin
e7f46a0aab app/vmselect/promql: follow-up for 7205c79c5a
- Allocate and initialize seriesByWorkerID slice in a single go instead
  of initializing every item in the list separately.
  This should reduce CPU usage a bit.
- Properly set anti-false sharing padding at timeseriesWithPadding structure
- Document the change at docs/CHANGELOG.md

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3966
2023-03-24 23:34:37 -07:00
Zakhar Bessarab
7205c79c5a app/vmselect/promql: use lock-less approach to gather results of parallel processing for evalRollup* funcs (#4004)
* vmselect/promql: refactor `evalRollupNoIncrementalAggregate` to use lock-less approach for parallel workers computation

Locking there is causing issues when running on highly multi-core system as it introduces lock contention during results merge.

New implementation uses lock less approach to store results per workerID and merges final result in the end, this is expected to significantly reduce lock contention and CPU usage for systems with high number of cores.

Related: #3966
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* vmselect/promql: add pooling for `timeseriesWithPadding` to reduce allocations

Related: #3966
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* vmselect/promql: refactor `evalRollupFuncWithSubquery` to avoid using locks

Uses same approach as `evalRollupNoIncrementalAggregate` to remove locking between workers and reduce lock contention.

Related: #3966
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
2023-03-24 23:07:12 -07:00
Aliaksandr Valialkin
9436ae3b07 app/vmbackup: simplify code a bit after 5ba347bd2c
Unconditionally call deleteSnapshot() func just after making the snapshot, either successful or unsuccessful

Related issue: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2055
2023-03-24 22:03:14 -07:00
Zakhar Bessarab
5ba347bd2c app/vmbackup: delete created snapshot in case of error during backup (#4008)
Related issue: #2055

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-03-24 21:49:58 -07:00
Aliaksandr Valialkin
25446a7933 vendor: run make vendor-update 2023-03-24 18:08:06 -07:00
Aliaksandr Valialkin
ebc1caa5dc app/vmselect/vmui: run make vmui-update after dc2c712a29 2023-03-24 18:01:39 -07:00
Aliaksandr Valialkin
27f9a1eda2 docs/CHANGELOG.md: cosmetic fixes: remove trailing whitespace and consistently use -flag instead of --flag 2023-03-24 15:44:33 -07:00
Alexander Marshalov
7c86dcc4fa allowed using dashes and dots in environment variables names (#4009)
* allowed using dashes and dots in environment variables names for templating config files with envtemplate (#3999)

Signed-off-by: Alexander Marshalov <_@marshalov.org>

* Apply suggestions from code review

---------

Signed-off-by: Alexander Marshalov <_@marshalov.org>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-03-24 15:43:05 -07:00
Aliaksandr Valialkin
c1d871a45a docs/vmauth.md: follow-up for 36edba9bfb
- Document `-configCheckInterval` command-line flag in `quick start` section
- Clarify the addition of `-configCheckInterval` at docs/CHANGELOG.md

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3990
2023-03-24 13:22:37 -07:00
Aliaksandr Valialkin
54796f69db docs/vmagent.md: clarify that there is no need to specify multiple -remoteWrite.url options when writing data to a single VictoriaMetrics cluster when data replication is needed
Also add a link to https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format from `getting started` section,
so users could quickly find how to write data to VictoriaMetrics cluster
2023-03-24 13:07:33 -07:00
Roman Khavronenko
365d2ff0bf vmalert: add anchor char to Group's link (#4006)
This should help users to see that Group's name is clickable
and used for anchoring.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-03-24 09:48:43 +01:00
Roman Khavronenko
8db6a71f83 vmalert: mention VMUI example for alert's source (#4005)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-03-24 09:40:55 +01:00
Roman Khavronenko
edeb56d208 docs: mention cluster URL for exporting series (#4002)
docs: mention cluster URL for exporting series

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-03-23 20:55:47 +01:00
Daria Karavaieva
24938872a6 Vmanomaly guide (#3834)
Setting up VMAnomaly on NodeExporter metrics with VictoriaMetrics and AlertManager.


* vmanomaly-guide-draft

* aletr graphs and description

* readme vmanomaly tutorial

* Added back fit_every param for performance

* vmanomaly guide fixes

* added spaces div

* spaces + resize image

* alert example grammar

* quotation marks

* docker link

* typo fixed

* more links

* reader section rephrased

* label change

* lower case for grafana service

* lower case for vm service

* yaml markdown

---------

Co-authored-by: Dima Lazerka <dima@victoriametrics.com>
2023-03-23 21:04:44 +02:00
Dmytro Kozlov
ba505dd357 docs: follow up after dc2c712a29 (#4001) 2023-03-23 18:27:55 +01:00
Dmytro Kozlov
dc2c712a29 app/vmui: update cardinality page (#3986)
vmui: update cardinality page

---------

Co-authored-by: Yury Moladau <yurymolodov@gmail.com>
2023-03-23 18:18:02 +01:00
Yury Molodov
023c65968f vmui: display errors for each query individually (#3987) (#3994) 2023-03-23 13:10:59 +01:00
Alexander Marshalov
36edba9bfb added configCheckInterval flag for vmauth (#3990) (#3991)
* added configCheckInterval flag for vmauth (#3990)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-03-23 09:34:12 +01:00
Nikolay
a2f716b6cc lib/netutil: log only parsing errors for proxy-protocol (#3985)
* lib/netutil: log only parsing errors for proxy-protocol

Previosly every error was logged. With configured TCP health checks at load-balancer or kubernetes, vmauth spams a lot of false positive error message into logs

* Update docs/CHANGELOG.md

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

* Update lib/netutil/tcplistener.go

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
2023-03-21 10:22:39 -07:00
Dmytro Kozlov
f0b09a1382 app/vmctl: follow up after aed59b9029 (#3983) 2023-03-21 15:53:53 +01:00
Aliaksandr Valialkin
6a78755b66 docs/vmagent.md: mention in docs that the target relabel debug page shows target url now 2023-03-20 22:20:03 -07:00
Dmytro Kozlov
e79cd24807 lib/promrelabel: make target url from labels on target relabel page (#3882)
* lib/promrelabel: make target url from labels on target relabel page

* wip

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2023-03-20 22:07:52 -07:00
Aliaksandr Valialkin
e480b9881e app/vmselect/promql: pass workerID to the callback inside doParallel()
This opens the possibility to remove tssLock from evalRollupFuncWithSubquery()
in the follow-up commit from @zekker6 in order to speed up the code
for systems with many CPU cores.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3966
2023-03-20 20:54:57 -07:00
Aliaksandr Valialkin
9e16329b2f app/vmselect/promql: fix TestIncrementalAggr test on systems less than 3 CPU cores
This is a follow-up for 4856a4cf5a
2023-03-20 20:37:18 -07:00
Aliaksandr Valialkin
70959d5dab app/vmselect/netstorage: reduce the number of calls to runtime.Gosched() at timeseriesWorker() and unpackWorker()
Call runtime.Gosched() only when there is a work to steal from other workers.
Simplify the timeseriesWorker() and unpackWroker() code a bit by inlining stealTimeseriesWork() and stealUnpackWork().

This should reduce CPU usage when processing queries on systems with big number of CPU cores.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3966
2023-03-20 20:31:02 -07:00
Aliaksandr Valialkin
4856a4cf5a app/vmselect: optimize incremental aggregates a bit
Substitute sync.Map with an ordinary slice indexed by workerID.
This should reduce the overhead when updating the incremental aggregate state
2023-03-20 15:37:06 -07:00
Aliaksandr Valialkin
8622dee4b5 app/vmselect/vmui: make vmui-update after d4525bd2d0 2023-03-20 14:35:03 -07:00
Aliaksandr Valialkin
8d709f3483 docs/CHANGELOG.md: cosmetic fixes 2023-03-20 14:14:20 -07:00
Aliaksandr Valialkin
91533531f5 docs/Troubleshooting.md: document an additional case, which could result in slow inserts
If `-cacheExpireDuration` is lower than the interval between ingested samples for the same time series,
then vm_slow_row_inserts_total` metric is increased.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183
2023-03-20 13:28:36 -07:00
Roman Khavronenko
3283f0dae4 vmalert: support logs suppressing during config reloads (#3973)
* vmalert: support logs suppressing during config reloads

The change is mostly required for ENT version of vmalert,
since it supports object-storage for config files.
Reading data from object storage could be time-consuming,
so vmalert emits logs to track the progress.

However, these logs are mostly needed on start or on
manual config reload. Printing these logs each time
`rule.configCheckInterval` is triggered would too verbose.
So the change allows to control logs emitting during
config reloads.

Now, logs are emitted during start up or when SIGHUP is receieved.
For periodicall config checks logs emitted by config pkg are suppressed.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: review fixes

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-03-20 16:08:30 +01:00
Dmytro Kozlov
8da9502df6 app/vmctl: automatically check tty (#3938)
app/vmctl: automatically detect if TTY is available
2023-03-20 11:16:08 +01:00
Yury Molodov
d4525bd2d0 vmui: support for drag'n'drop in the "Trace analyzer" page (#3971)
vmui: add drag-and-drop support for the trace analyzer page
2023-03-20 11:07:18 +01:00
dependabot[bot]
cc67eb4ff3 build(deps): bump actions/setup-go from 3 to 4 (#3962)
Bumps [actions/setup-go](https://github.com/actions/setup-go) from 3 to 4.
- [Release notes](https://github.com/actions/setup-go/releases)
- [Commits](https://github.com/actions/setup-go/compare/v3...v4)

---
updated-dependencies:
- dependency-name: actions/setup-go
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-03-20 09:23:24 +01:00
Yury Molodov
a2af2e5a1b vmui: improve usability of date/time picker (#3968)
* vmui: allow manually set input date and time
* vmui/docs: improve usability of date/time picker
2023-03-20 09:22:49 +01:00
Dmytro Kozlov
5c92022cc6 lib/storage: fix collect downsampling metrics (#489)
* lib/storage: fix downsampling

* lib/storage: update logic

* lib/storage: fix comments, removed unneeded check
2023-03-19 23:34:46 -07:00
Aliaksandr Valialkin
43b24164ef all: add Windows build for VictoriaMetrics
This commit changes background merge algorithm, so it becomes compatible with Windows file semantics.

The previous algorithm for background merge:

1. Merge source parts into a destination part inside tmp directory.
2. Create a file in txn directory with instructions on how to atomically
   swap source parts with the destination part.
3. Perform instructions from the file.
4. Delete the file with instructions.

This algorithm guarantees that either source parts or destination part
is visible in the partition after unclean shutdown at any step above,
since the remaining files with instructions is replayed on the next restart,
after that the remaining contents of the tmp directory is deleted.

Unfortunately this algorithm doesn't work under Windows because
it disallows removing and moving files, which are in use.

So the new algorithm for background merge has been implemented:

1. Merge source parts into a destination part inside the partition directory itself.
   E.g. now the partition directory may contain both complete and incomplete parts.
2. Atomically update the parts.json file with the new list of parts after the merge,
   e.g. remove the source parts from the list and add the destination part to the list
   before storing it to parts.json file.
3. Remove the source parts from disk when they are no longer used.

This algorithm guarantees that either source parts or destination part
is visible in the partition after unclean shutdown at any step above,
since incomplete partitions from step 1 or old source parts from step 3 are removed
on the next startup by inspecting parts.json file.

This algorithm should work under Windows, since it doesn't remove or move files in use.
This algorithm has also the following benefits:

- It should work better for NFS.
- It fits object storage semantics.

The new algorithm changes data storage format, so it is impossible to downgrade
to the previous versions of VictoriaMetrics after upgrading to this algorithm.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3236
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3821
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-03-19 01:36:51 -07:00
Aliaksandr Valialkin
6460475e3b lib/{mergeset,storage}: prevent from long wait time when creating a snapshot under high data ingestion rate
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3873
2023-03-19 00:15:30 -07:00
Aliaksandr Valialkin
6d56149b9f deployment/docker/Makefile: properly add amd64 suffix to windows binary names 2023-03-18 23:29:44 -07:00
Aliaksandr Valialkin
4ea27d6f6a deployment/docker/Makefile: build CGO-enabled vmagent for GOARCH=arm64
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2271

This is a follow-up for 565497fb074321caedea38d5151044d98d92d759
2023-03-18 23:15:31 -07:00
Aliaksandr Valialkin
f3c7302772 SECURITY.md: update the list of VictoriaMetrics versions, which support security updates 2023-03-18 12:28:17 -07:00
Aliaksandr Valialkin
a26c6628fd lib/{fs,mergeset,storage}: substitute os.Open()+os.File.Readdir() with os.ReadDir()
This simplifies code a bit
2023-03-17 21:03:37 -07:00
Roman Khavronenko
8fdd613f25 Vmalert tests (#3975)
* vmalert: add tests for notifier pkg

* vmalert: add tests for remotewrite pkg

* vmalert: add tests for template functions

* vmalert: add tests for web pages

* vmalert: fix int overflow in tests

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-03-17 15:57:24 +01:00
Alexander Marshalov
57b00bafc9 updated api doc for operator (#3972)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-03-17 10:42:30 +01:00
Zakhar Bessarab
ac3043ff74 doc/vmgateway-grafana-openid-guide: fix formatting, add reproducible example and example results (#3964) 2023-03-17 09:57:10 +01:00
Roman Khavronenko
d3608be313 alerts: add TooManyTSIDMisses alerting rule (#3959)
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502#issuecomment-1358374954

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2023-03-17 09:46:51 +01:00
Aliaksandr Valialkin
fdbb819195 all: typo fix in the same way as in e566d49e3a: 8248 -> 8428 2023-03-16 22:06:38 -07:00
oliverpool
fbefc940ef app/vmselect/promql: add test to ensure 8-byte alignment (#3948)
See 0af9e2b693
2023-03-16 09:01:42 -07:00
Pavel Skuratovich
e566d49e3a Fix a typo in README.md so reload scrape config command can be copy-pasted 2023-03-15 22:41:31 +01:00
Artem Navoiev
f8a2a3784b managed quickstart fix typo
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-03-15 22:38:59 +01:00
Artem Navoiev
20aa707979 fix anchor after chaning manager quick start
Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2023-03-15 22:32:20 +01:00
Aliaksandr Valialkin
ddbbc9a86d vendor: make vendor-update 2023-03-15 13:24:12 -07:00
Nikolay
91cbb9063d Vmagent kafka updates (#535)
* app/vmagent: allow vm proto for kafka consumer and producer
it should reduce network usage up to 50%.
According to benchmarks without any encoding at kafka topic, it reduces traffic up to 50%.
With enabled zstd at kafka topic, it shows no diffence in traffic. So it
doesn't make much sense to use it.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1225

* mention eb61a7dd68b834b08d01727a918f207700348ada at changelog

* app/vmagent: bumps kafka lib version
it allows compiling vmagent for arm64 machines
fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2271

* mention d19b1a888248c96cfd7ccee00ba6f596d89be1d7 at change log

* app/vmagent: adds natural concurrency for kafka consumer
it should improve performance for data consumption
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1957

* mention change 0c143bb22ca2e7e0b7eec9bc84a94ee2b41626ca

* Update app/vmagent/kafka/consumer.go

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

* Update app/vmagent/kafka/consumer_cgo.go

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
2023-03-15 13:03:44 -07:00
Alexander Marshalov
55afae8641 updated vars doc for operator (#3960)
Signed-off-by: Alexander Marshalov <_@marshalov.org>
2023-03-15 17:28:01 +01:00
dmitryk-dk
0691e115b1 docs: cleanup 2023-03-15 11:54:59 +01:00
dmitryk-dk
32266aaea2 app/vmctl: update managed quickstart guide 2023-03-15 11:54:59 +01:00
Aliaksandr Valialkin
a11ac9648c vendor: make vendor-update 2023-03-14 16:19:43 -07:00
Aliaksandr Valialkin
90e1818068 vendor: update github.com/klauspost/compress from v1.16.0 to v1.16.3 2023-03-14 16:14:25 -07:00
Zakhar Bessarab
8f6d5217d1 doc: add guide for vmgateway configuration with OpenID and Grafana (#3951)
docs: add guide for vmgateway configuration with OpenID and Grafana
2023-03-14 16:19:29 +01:00
Zakhar Bessarab
6a5d236245 lib/storage: log original labels set when label value is truncated (#3952)
lib/storage: log original labels set when label value is truncated
2023-03-14 10:59:40 +01:00
Dmytro Kozlov
4d68f5b1fc app/vmctl: integration test for native protocol (#3947)
* app/vmctl: integration test for native protocol

* app/vmctl: implemented two integration tests

* app/vmctl: cleanup

* app/vmctl: split storage init and filling data logic

* app/vmctl: cleanup

* app/vmctl: remove storage from server, used initialization process

* app/vmctl: prepare for parallel run, code cleanup

* app/vmctl: code cleanup

* app/vmctl: remove unused field
2023-03-14 09:55:49 +01:00
Aliaksandr Valialkin
6f6333831e docs/Single-server-VictoriaMetrics.md: clarify that the cache directory can be removed manually when VictoriaMetrics is stopped 2023-03-13 00:23:40 -07:00
Aliaksandr Valialkin
3e7bfe1200 docs/CHANGELOG.md: document v1.87.3 2023-03-13 00:20:51 -07:00
Aliaksandr Valialkin
02ffe05750 docs/CHANGELOG.md: document v1.79.11 LTS release 2023-03-12 23:22:53 -07:00
Aliaksandr Valialkin
b9e79250b3 deployment: update VictoriaMetrics release from v1.88.0 to v1.89.1
See https://docs.victoriametrics.com/CHANGELOG.html#v1891
2023-03-12 20:05:03 -07:00
1310 changed files with 81276 additions and 26427 deletions

View File

@@ -17,7 +17,7 @@ jobs:
- name: Setup Go
uses: actions/setup-go@main
with:
go-version: 1.20.2
go-version: 1.20.5
id: go
- name: Code checkout
uses: actions/checkout@master

View File

@@ -55,9 +55,9 @@ jobs:
uses: actions/checkout@v3
- name: Set up Go
uses: actions/setup-go@v3
uses: actions/setup-go@v4
with:
go-version: 1.20.2
go-version: 1.20.5
check-latest: true
cache: true
if: ${{ matrix.language == 'go' }}

View File

@@ -30,9 +30,9 @@ jobs:
uses: actions/checkout@v3
- name: Setup Go
uses: actions/setup-go@v3
uses: actions/setup-go@v4
with:
go-version: 1.20.2
go-version: 1.20.5
check-latest: true
cache: true
@@ -54,9 +54,9 @@ jobs:
uses: actions/checkout@v3
- name: Setup Go
uses: actions/setup-go@v3
uses: actions/setup-go@v4
with:
go-version: 1.20.2
go-version: 1.20.5
check-latest: true
cache: true
@@ -79,9 +79,9 @@ jobs:
- name: Setup Go
id: go
uses: actions/setup-go@v3
uses: actions/setup-go@v4
with:
go-version: 1.20.2
go-version: 1.20.5
check-latest: true
cache: true

View File

@@ -1,48 +0,0 @@
name: nightly-build
on:
schedule:
# Daily at 2:48am
- cron: '48 2 * * *'
permissions:
contents: read
jobs:
build:
name: Build
runs-on: ubuntu-latest
steps:
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Setup Go
uses: actions/setup-go@main
with:
go-version: 1.20.2
id: go
- name: Setup docker scan
run: |
mkdir -p ~/.docker/cli-plugins && \
curl https://github.com/docker/scan-cli-plugin/releases/latest/download/docker-scan_linux_amd64 -L -s -S -o ~/.docker/cli-plugins/docker-scan &&\
chmod +x ~/.docker/cli-plugins/docker-scan
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Code checkout
uses: actions/checkout@master
- uses: actions/cache@v3
with:
path: gocache-for-docker
key: gocache-docker-${{ runner.os }}-${{ steps.go.outputs.go-version }}-${{ hashFiles('go.mod') }}
- name: build & publish
run: |
docker scan --severity=medium --login --token "$SNYK_TOKEN" --accept-license
LATEST_TAG=nightly PKG_TAG=nightly make publish
env:
SNYK_TOKEN: ${{ secrets.SNYK_AUTH_TOKEN }}

77
.github/workflows/update-sandbox.yml vendored Normal file
View File

@@ -0,0 +1,77 @@
name: sandbox-release
on:
release:
types: [published]
permissions:
contents: write
jobs:
deploy-sandbox:
runs-on: ubuntu-latest
steps:
- name: Check out code
uses: actions/checkout@v3
with:
repository: VictoriaMetrics/ops
ref: master
token: ${{ secrets.VM_BOT_GH_TOKEN }}
- name: Import GPG key
id: import-gpg
uses: crazy-max/ghaction-import-gpg@v5
with:
gpg_private_key: ${{ secrets.VM_BOT_GPG_PRIVATE_KEY }}
passphrase: ${{ secrets.VM_BOT_PASSPHRASE }}
git_user_signingkey: true
git_commit_gpgsign: true
- name: update image tag
uses: fjogeleit/yaml-update-action@main
with:
valueFile: 'gcp-test/sandbox/manifests/benchmark-vm/vmcluster.yaml'
commitChange: false
createPR: false
changes: |
{
"gcp-test/sandbox/manifests/benchmark-vm/vmcluster.yaml": {
"spec.vminsert.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
"spec.vmselect.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
"spec.vmstorage.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster"
},
"gcp-test/sandbox/manifests/benchmark-vm/vmsingle.yaml": {
"spec.image.tag": "${{ github.event.release.tag_name }}-enterprise"
},
"gcp-test/sandbox/manifests/monitoring/monitoring-vmagent.yaml": {
"spec.image.tag": "${{ github.event.release.tag_name }}"
},
"gcp-test/sandbox/manifests/monitoring/monitoring-vmcluster.yaml": {
"spec.vminsert.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
"spec.vmselect.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
"spec.vmstorage.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster"
},
"gcp-test/sandbox/manifests/monitoring/vmalert.yaml": {
"spec.image.tag": "${{ github.event.release.tag_name }}-enterprise"
}
}
- name: commit changes
run: |
git config --global user.name "${{ steps.import-gpg.outputs.email }}"
git config --global user.email "${{ steps.import-gpg.outputs.email }}"
git add .
git commit -S -m "Deploy image tag ${RELEASE_TAG} to sandbox"
env:
RELEASE_TAG: ${{ github.event.release.tag_name }}
- name: Create Pull Request
uses: peter-evans/create-pull-request@v5
with:
author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>
branch: release-automation
token: ${{ secrets.VM_BOT_GH_TOKEN }}
delete-branch: true
title: "release ${{ github.event.release.tag_name }}"
body: |
Release [${{ github.event.release.tag_name }}](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/${{ github.event.release.tag_name }}) to sandbox
> Auto-generated by `Github Actions Bot`

View File

@@ -31,7 +31,7 @@ all: \
clean:
rm -rf bin/*
publish: docker-scan \
publish: package-base \
publish-victoria-metrics \
publish-vmagent \
publish-vmalert \
@@ -141,6 +141,8 @@ vmutils-windows-amd64: \
vmagent-windows-amd64 \
vmalert-windows-amd64 \
vmauth-windows-amd64 \
vmbackup-windows-amd64 \
vmrestore-windows-amd64 \
vmctl-windows-amd64
victoria-metrics-crossbuild: \
@@ -186,7 +188,8 @@ release-victoria-metrics: \
release-victoria-metrics-darwin-amd64 \
release-victoria-metrics-darwin-arm64 \
release-victoria-metrics-freebsd-amd64 \
release-victoria-metrics-openbsd-amd64
release-victoria-metrics-openbsd-amd64 \
release-victoria-metrics-windows-amd64
# adds i386 arch
release-victoria-metrics-linux-386:
@@ -213,6 +216,9 @@ release-victoria-metrics-freebsd-amd64:
release-victoria-metrics-openbsd-amd64:
GOOS=openbsd GOARCH=amd64 $(MAKE) release-victoria-metrics-goos-goarch
release-victoria-metrics-windows-amd64:
GOARCH=amd64 $(MAKE) release-victoria-metrics-windows-goarch
release-victoria-metrics-goos-goarch: victoria-metrics-$(GOOS)-$(GOARCH)-prod
cd bin && \
tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf victoria-metrics-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
@@ -222,6 +228,16 @@ release-victoria-metrics-goos-goarch: victoria-metrics-$(GOOS)-$(GOARCH)-prod
| sed s/-$(GOOS)-$(GOARCH)-prod/-prod/ > victoria-metrics-$(GOOS)-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf victoria-metrics-$(GOOS)-$(GOARCH)-prod
release-victoria-metrics-windows-goarch: victoria-metrics-windows-$(GOARCH)-prod
cd bin && \
zip victoria-metrics-windows-$(GOARCH)-$(PKG_TAG).zip \
victoria-metrics-windows-$(GOARCH)-prod.exe \
&& sha256sum victoria-metrics-windows-$(GOARCH)-$(PKG_TAG).zip \
victoria-metrics-windows-$(GOARCH)-prod.exe \
> victoria-metrics-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \
victoria-metrics-windows-$(GOARCH)-prod.exe
release-vmutils: \
release-vmutils-linux-386 \
release-vmutils-linux-amd64 \
@@ -295,26 +311,33 @@ release-vmutils-windows-goarch: \
vmagent-windows-$(GOARCH)-prod \
vmalert-windows-$(GOARCH)-prod \
vmauth-windows-$(GOARCH)-prod \
vmbackup-windows-$(GOARCH)-prod \
vmrestore-windows-$(GOARCH)-prod \
vmctl-windows-$(GOARCH)-prod
cd bin && \
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \
> vmutils-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe
pprof-cpu:
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
@@ -395,27 +418,37 @@ check-licenses: install-wwhrd
wwhrd check -f .wwhrd.yml
copy-docs:
echo '' > ${DST}
echo "---" > ${DST}
@if [ ${ORDER} -ne 0 ]; then \
echo "---\nsort: ${ORDER}\n---\n" > ${DST}; \
echo "sort: ${ORDER}" >> ${DST}; \
echo "weight: ${ORDER}" >> ${DST}; \
echo "menu:\n docs:\n parent: 'victoriametrics'\n weight: ${ORDER}" >> ${DST}; \
fi
echo "title: ${TITLE}" >> ${DST}
@if [ ${OLD_URL} ]; then \
echo "aliases:\n - ${OLD_URL}" >> ${DST}; \
fi
echo "---" >> ${DST}
cat ${SRC} >> ${DST}
sed -i='.tmp' 's/<img src=\"docs\//<img src=\"/' ${DST}
rm -rf docs/*.tmp
# Copies docs for all components and adds the order tag.
# For ORDER=0 it adds no order tag.
# Copies docs for all components and adds the order/weight tag, title, menu position and alias with the backward compatible link for the old site.
# For ORDER=0 it adds no order tag/weight tag.
# FOR OLD_URL - relative link, used for backward compatibility with the link from documentation based on GitHub pages (old one)
# FOR OLD_URL='' it adds no alias, it should be empty for every new page, don't change it for already existing links.
# Images starting with <img src="docs/ are replaced with <img src="
# Cluster docs are supposed to be ordered as 9th.
# Cluster docs are supposed to be ordered as 2nd.
# The rest of docs is ordered manually.
docs-sync:
SRC=README.md DST=docs/README.md ORDER=0 $(MAKE) copy-docs
SRC=README.md DST=docs/Single-server-VictoriaMetrics.md ORDER=1 $(MAKE) copy-docs
SRC=app/vmagent/README.md DST=docs/vmagent.md ORDER=3 $(MAKE) copy-docs
SRC=app/vmalert/README.md DST=docs/vmalert.md ORDER=4 $(MAKE) copy-docs
SRC=app/vmauth/README.md DST=docs/vmauth.md ORDER=5 $(MAKE) copy-docs
SRC=app/vmbackup/README.md DST=docs/vmbackup.md ORDER=6 $(MAKE) copy-docs
SRC=app/vmrestore/README.md DST=docs/vmrestore.md ORDER=7 $(MAKE) copy-docs
SRC=app/vmctl/README.md DST=docs/vmctl.md ORDER=8 $(MAKE) copy-docs
SRC=app/vmgateway/README.md DST=docs/vmgateway.md ORDER=9 $(MAKE) copy-docs
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md ORDER=10 $(MAKE) copy-docs
SRC=README.md DST=docs/README.md OLD_URL='' ORDER=0 TITLE=VictoriaMetrics $(MAKE) copy-docs
SRC=README.md DST=docs/Single-server-VictoriaMetrics.md OLD_URL='/Single-server-VictoriaMetrics.html' TITLE=VictoriaMetrics ORDER=1 $(MAKE) copy-docs
SRC=app/vmagent/README.md DST=docs/vmagent.md OLD_URL='/vmagent.html' ORDER=3 TITLE=vmagent $(MAKE) copy-docs
SRC=app/vmalert/README.md DST=docs/vmalert.md OLD_URL='/vmalert.html' ORDER=4 TITLE=vmalert $(MAKE) copy-docs
SRC=app/vmauth/README.md DST=docs/vmauth.md OLD_URL='/vmauth.html' ORDER=5 TITLE=vmauth $(MAKE) copy-docs
SRC=app/vmbackup/README.md DST=docs/vmbackup.md OLD_URL='/vmbackup.html' ORDER=6 TITLE=vmbackup $(MAKE) copy-docs
SRC=app/vmrestore/README.md DST=docs/vmrestore.md OLD_URL='/vmrestore.html' ORDER=7 TITLE=vmrestore $(MAKE) copy-docs
SRC=app/vmctl/README.md DST=docs/vmctl.md OLD_URL='/vmctl.html' ORDER=8 TITLE=vmctl $(MAKE) copy-docs
SRC=app/vmgateway/README.md DST=docs/vmgateway.md OLD_URL='/vmgateway.html' ORDER=9 TITLE=vmgateway $(MAKE) copy-docs
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md OLD_URL='/vmbackupmanager.html' ORDER=10 TITLE=vmbackupmanager $(MAKE) copy-docs

238
README.md
View File

@@ -40,7 +40,8 @@ VictoriaMetrics has the following prominent features:
* It can be used as long-term storage for Prometheus. See [these docs](#prometheus-setup) for details.
* It can be used as a drop-in replacement for Prometheus in Grafana, because it supports [Prometheus querying API](#prometheus-querying-api-usage).
* It can be used as a drop-in replacement for Graphite in Grafana, because it supports [Graphite API](#graphite-api-usage).
* It features easy setup and operation:
VictoriaMetrics allows reducing infrastructure costs by more than 10x comparing to Graphite - see [this case study](https://docs.victoriametrics.com/CaseStudies.html#grammarly).
* It is easy to setup and operate:
* VictoriaMetrics consists of a single [small executable](https://medium.com/@valyala/stripping-dependency-bloat-in-victoriametrics-docker-image-983fb5912b0d)
without external dependencies.
* All the configuration is done via explicit command-line flags with reasonable defaults.
@@ -201,7 +202,7 @@ Changing scrape configuration is possible with text editor:
vi $SNAP_DATA/var/snap/victoriametrics/current/etc/victoriametrics-scrape-config.yaml
```
After changes were made, trigger config re-read with the command `curl 127.0.0.1:8248/-/reload`.
After changes were made, trigger config re-read with the command `curl 127.0.0.1:8428/-/reload`.
## Prometheus setup
@@ -281,7 +282,11 @@ http://<victoriametrics-addr>:8428
Substitute `<victoriametrics-addr>` with the hostname or IP address of VictoriaMetrics.
Then build graphs and dashboards for the created datasource using [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/) or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html).
Then build graphs and dashboards for the created datasource using [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/)
or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html).
Alternatively, use VictoriaMetrics [datasource plugin](https://github.com/VictoriaMetrics/grafana-datasource) with support of extra features.
See more in [description](https://github.com/VictoriaMetrics/grafana-datasource#victoriametrics-data-source-for-grafana).
## How to upgrade VictoriaMetrics
@@ -302,13 +307,21 @@ Prometheus doesn't drop data during VictoriaMetrics restart. See [this article](
## vmui
VictoriaMetrics provides UI for query troubleshooting and exploration. The UI is available at `http://victoriametrics:8428/vmui`.
The UI allows exploring query results via graphs and tables.
It also provides the following features:
The UI allows exploring query results via graphs and tables. It also provides the following features:
- [metrics explorer](#metrics-explorer)
- [cardinality explorer](#cardinality-explorer)
- [query tracer](#query-tracing)
- [top queries explorer](#top-queries)
- Explore:
- [Metrics explorer](#metrics-explorer) - automatically builds graphs for selected metrics;
- [Cardinality explorer](#cardinality-explorer) - stats about existing metrics in TSDB;
- [Top queries](#top-queries) - shows most frequently executed queries;
- Tools:
- [Trace analyzer](#query-tracing) - playground for loading query traces in JSON format;
- [WITH expressions playground](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/#/expand-with-exprs) - test how WITH expressions work;
- [Metric relabel debugger](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/#/relabeling) - playground for [relabeling](#relabeling) configs.
VMUI automatically switches from graph view to heatmap view when the query returns [histogram](https://docs.victoriametrics.com/keyConcepts.html#histogram) buckets
(both [Prometheus histograms](https://prometheus.io/docs/concepts/metric_types/#histogram)
and [VictoriaMetrics histograms](https://valyala.medium.com/improving-histogram-usability-for-prometheus-and-grafana-bc7e5df0e350) are supported).
Try, for example, [this query](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/#/?g0.expr=sum%28rate%28vm_promscrape_scrape_duration_seconds_bucket%29%29+by+%28vmrange%29&g0.range_input=24h&g0.end_input=2023-04-10T17%3A46%3A12&g0.relative_time=last_24_hours&g0.step_input=31m).
Graphs in `vmui` support scrolling and zooming:
@@ -320,9 +333,12 @@ Query history can be navigated by holding `Ctrl` (or `Cmd` on MacOS) and pressin
Multi-line queries can be entered by pressing `Shift-Enter` in query input field.
When querying the [backfilled data](https://docs.victoriametrics.com/#backfilling) or during [query troubleshooting](https://docs.victoriametrics.com/Troubleshooting.html#unexpected-query-results), it may be useful disabling response cache by clicking `Disable cache` checkbox.
When querying the [backfilled data](https://docs.victoriametrics.com/#backfilling)
or during [query troubleshooting](https://docs.victoriametrics.com/Troubleshooting.html#unexpected-query-results),
it may be useful disabling response cache by clicking `Disable cache` checkbox.
VMUI automatically adjusts the interval between datapoints on the graph depending on the horizontal resolution and on the selected time range. The step value can be customized by changing `Step value` input.
VMUI automatically adjusts the interval between datapoints on the graph depending on the horizontal resolution and on the selected time range.
The step value can be customized by changing `Step value` input.
VMUI allows investigating correlations between multiple queries on the same graph. Just click `Add Query` button,
enter an additional query in the newly appeared input field and press `Enter`.
@@ -366,8 +382,8 @@ VictoriaMetrics provides an ability to explore time series cardinality at `Explo
may show lower than expected number of unique label values for labels with small number of unique values.
This is because of [implementation limits](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/5a6e617b5e41c9170e7c562aecd15ee0c901d489/app/vmselect/netstorage/netstorage.go#L1039-L1045).
By default cardinality explorer analyzes time series for the current date. It provides the ability to select different day at the top right corner.
By default all the time series for the selected date are analyzed. It is possible to narrow down the analysis to series
By default, cardinality explorer analyzes time series for the current date. It provides the ability to select different day at the top right corner.
By default, all the time series for the selected date are analyzed. It is possible to narrow down the analysis to series
matching the specified [series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors).
Cardinality explorer is built on top of [/api/v1/status/tsdb](#tsdb-stats).
@@ -512,8 +528,10 @@ and stream plain InfluxDB line protocol data to the configured TCP and/or UDP ad
VictoriaMetrics performs the following transformations to the ingested InfluxDB data:
* [db query arg](https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint) is mapped into `db` label value
unless `db` tag exists in the InfluxDB line. The `db` label name can be overridden via `-influxDBLabel` command-line flag.
* [db query arg](https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint) is mapped into `db`
[label](https://docs.victoriametrics.com/keyConcepts.html#labels) value unless `db` tag exists in the InfluxDB line.
The `db` label name can be overridden via `-influxDBLabel` command-line flag. If more strict data isolation is required,
read more about multi-tenancy [here](https://docs.victoriametrics.com/keyConcepts.html#multi-tenancy).
* Field names are mapped to time series names prefixed with `{measurement}{separator}` value, where `{separator}` equals to `_` by default. It can be changed with `-influxMeasurementFieldSeparator` command-line flag. See also `-influxSkipSingleField` command-line flag. If `{measurement}` is empty or if `-influxSkipMeasurement` command-line flag is set, then time series names correspond to field names.
* Field values are mapped to time series values.
* Tags are mapped to Prometheus labels as-is.
@@ -564,7 +582,8 @@ The `/api/v1/export` endpoint should return the following response:
```
Note that InfluxDB line protocol expects [timestamps in *nanoseconds* by default](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/#timestamp),
while VictoriaMetrics stores them with *milliseconds* precision.
while VictoriaMetrics stores them with *milliseconds* precision. It is allowed to ingest timestamps with seconds,
microseconds or nanoseconds precision - VictoriaMetrics will automatically convert them to milliseconds.
Extra labels may be added to all the written time series by passing `extra_label=name=value` query args.
For example, `/write?extra_label=foo=bar` would add `{foo="bar"}` label to all the ingested metrics.
@@ -627,7 +646,6 @@ The `__graphite__` pseudo-label supports e.g. alternate regexp filters such as `
VictoriaMetrics also supports Graphite query language - see [these docs](#graphite-render-api-usage).
## How to send data from OpenTSDB-compatible agents
VictoriaMetrics supports [telnet put protocol](http://opentsdb.net/docs/build/html/api_telnet/put.html)
@@ -742,20 +760,45 @@ All the Prometheus querying API handlers can be prepended with `/prometheus` pre
### Prometheus querying API enhancements
VictoriaMetrics accepts optional `extra_label=<label_name>=<label_value>` query arg, which can be used for enforcing additional label filters for queries. For example,
`/api/v1/query_range?extra_label=user_id=123&extra_label=group_id=456&query=<query>` would automatically add `{user_id="123",group_id="456"}` label filters to the given `<query>`. This functionality can be used for limiting the scope of time series visible to the given tenant. It is expected that the `extra_label` query args are automatically set by auth proxy sitting in front of VictoriaMetrics. See [vmauth](https://docs.victoriametrics.com/vmauth.html) and [vmgateway](https://docs.victoriametrics.com/vmgateway.html) as examples of such proxies.
VictoriaMetrics accepts optional `extra_label=<label_name>=<label_value>` query arg, which can be used
for enforcing additional label filters for queries. For example, `/api/v1/query_range?extra_label=user_id=123&extra_label=group_id=456&query=<query>`
would automatically add `{user_id="123",group_id="456"}` label filters to the given `<query>`.
This functionality can be used for limiting the scope of time series visible to the given tenant.
It is expected that the `extra_label` query args are automatically set by auth proxy sitting in front of VictoriaMetrics.
See [vmauth](https://docs.victoriametrics.com/vmauth.html) and [vmgateway](https://docs.victoriametrics.com/vmgateway.html) as examples of such proxies.
VictoriaMetrics accepts optional `extra_filters[]=series_selector` query arg, which can be used for enforcing arbitrary label filters for queries. For example,
`/api/v1/query_range?extra_filters[]={env=~"prod|staging",user="xyz"}&query=<query>` would automatically add `{env=~"prod|staging",user="xyz"}` label filters to the given `<query>`. This functionality can be used for limiting the scope of time series visible to the given tenant. It is expected that the `extra_filters[]` query args are automatically set by auth proxy sitting in front of VictoriaMetrics. See [vmauth](https://docs.victoriametrics.com/vmauth.html) and [vmgateway](https://docs.victoriametrics.com/vmgateway.html) as examples of such proxies.
VictoriaMetrics accepts optional `extra_filters[]=series_selector` query arg, which can be used for enforcing arbitrary label filters for queries.
For example, `/api/v1/query_range?extra_filters[]={env=~"prod|staging",user="xyz"}&query=<query>` would automatically
add `{env=~"prod|staging",user="xyz"}` label filters to the given `<query>`. This functionality can be used for limiting
the scope of time series visible to the given tenant. It is expected that the `extra_filters[]` query args are automatically
set by auth proxy sitting in front of VictoriaMetrics.
See [vmauth](https://docs.victoriametrics.com/vmauth.html) and [vmgateway](https://docs.victoriametrics.com/vmgateway.html) as examples of such proxies.
VictoriaMetrics accepts multiple formats for `time`, `start` and `end` query args - see [these docs](#timestamp-formats).
VictoriaMetrics accepts `round_digits` query arg for `/api/v1/query` and `/api/v1/query_range` handlers. It can be used for rounding response values to the given number of digits after the decimal point. For example, `/api/v1/query?query=avg_over_time(temperature[1h])&round_digits=2` would round response values to up to two digits after the decimal point.
VictoriaMetrics accepts `round_digits` query arg for [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query)
and [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query) handlers. It can be used for rounding response values
to the given number of digits after the decimal point.
For example, `/api/v1/query?query=avg_over_time(temperature[1h])&round_digits=2` would round response values to up to two digits after the decimal point.
VictoriaMetrics accepts `limit` query arg for `/api/v1/labels` and `/api/v1/label/<labelName>/values` handlers for limiting the number of returned entries. For example, the query to `/api/v1/labels?limit=5` returns a sample of up to 5 unique labels, while ignoring the rest of labels. If the provided `limit` value exceeds the corresponding `-search.maxTagKeys` / `-search.maxTagValues` command-line flag values, then limits specified in the command-line flags are used.
VictoriaMetrics accepts `limit` query arg for [/api/v1/labels](https://docs.victoriametrics.com/url-examples.html#apiv1labels)
and [`/api/v1/label/<labelName>/values`](https://docs.victoriametrics.com/url-examples.html#apiv1labelvalues) handlers for limiting the number of returned entries.
For example, the query to `/api/v1/labels?limit=5` returns a sample of up to 5 unique labels, while ignoring the rest of labels.
If the provided `limit` value exceeds the corresponding `-search.maxTagKeys` / `-search.maxTagValues` command-line flag values,
then limits specified in the command-line flags are used.
By default, VictoriaMetrics returns time series for the last 5 minutes from `/api/v1/series`, `/api/v1/labels` and `/api/v1/label/<labelName>/values` while the Prometheus API defaults to all time. Explicitly set `start` and `end` to select the desired time range.
VictoriaMetrics accepts `limit` query arg for `/api/v1/series` handlers for limiting the number of returned entries. For example, the query to `/api/v1/series?limit=5` returns a sample of up to 5 series, while ignoring the rest. If the provided `limit` value exceeds the corresponding `-search.maxSeries` command-line flag values, then limits specified in the command-line flags are used.
By default, VictoriaMetrics returns time series for the last day starting at 00:00 UTC
from [/api/v1/series](https://docs.victoriametrics.com/url-examples.html#apiv1series),
[/api/v1/labels](https://docs.victoriametrics.com/url-examples.html#apiv1labels) and
[`/api/v1/label/<labelName>/values`](https://docs.victoriametrics.com/url-examples.html#apiv1labelvalues),
while the Prometheus API defaults to all time. Explicitly set `start` and `end` to select the desired time range.
VictoriaMetrics rounds the specified `start..end` time range to day granularity because of performance optimization concerns.
If you need the exact set of label names and label values on the given time range, then send queries
to [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query) or to [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query).
VictoriaMetrics accepts `limit` query arg at [/api/v1/series](https://docs.victoriametrics.com/url-examples.html#apiv1series)
for limiting the number of returned entries. For example, the query to `/api/v1/series?limit=5` returns a sample of up to 5 series, while ignoring the rest of series.
If the provided `limit` value exceeds the corresponding `-search.maxSeries` command-line flag values, then limits specified in the command-line flags are used.
Additionally, VictoriaMetrics provides the following handlers:
@@ -780,9 +823,11 @@ in [query APIs](https://docs.victoriametrics.com/#prometheus-querying-api-usage)
in [export APIs](https://docs.victoriametrics.com/#how-to-export-time-series).
- Unix timestamps in seconds with optional milliseconds after the point. For example, `1562529662.678`.
- [RFC3339](https://www.ietf.org/rfc/rfc3339.txt). For example, '2022-03-29T01:02:03Z`.
- Partial RFC3339. Examples: `2022`, `2022-03`, `2022-03-29`, `2022-03-29T01`, `2022-03-29T01:02`.
- Relative duration comparing to the current time. For example, `1h5m` means `one hour and five minutes ago`.
- [RFC3339](https://www.ietf.org/rfc/rfc3339.txt). For example, `2022-03-29T01:02:03Z` or `2022-03-29T01:02:03+02:30`.
- Partial RFC3339. Examples: `2022`, `2022-03`, `2022-03-29`, `2022-03-29T01`, `2022-03-29T01:02`, `2022-03-29T01:02:03`.
The partial RFC3339 time is in UTC timezone by default. It is possible to specify timezone there by adding `+hh:mm` or `-hh:mm` suffix to partial time.
For example, `2022-03-01+06:30` is `2022-03-01` at `06:30` timezone.
- Relative duration comparing to the current time. For example, `1h5m`, `-1h5m` or `now-1h5m` means `one hour and five minutes ago`, while `now` means `now`.
## Graphite API usage
@@ -804,10 +849,10 @@ VictoriaMetrics supports `__graphite__` pseudo-label for filtering time series w
### Graphite Render API usage
[VictoriaMetrics Enterprise](https://docs.victoriametrics.com/enterprise.html) supports [Graphite Render API](https://graphite.readthedocs.io/en/stable/render_api.html) subset
VictoriaMetrics supports [Graphite Render API](https://graphite.readthedocs.io/en/stable/render_api.html) subset
at `/render` endpoint, which is used by [Graphite datasource in Grafana](https://grafana.com/docs/grafana/latest/datasources/graphite/).
When configuring Graphite datasource in Grafana, the `Storage-Step` http request header must be set to a step between Graphite data points stored in VictoriaMetrics. For example, `Storage-Step: 10s` would mean 10 seconds distance between Graphite datapoints stored in VictoriaMetrics.
Enterprise binaries can be downloaded and evaluated for free from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
When configuring Graphite datasource in Grafana, the `Storage-Step` http request header must be set to a step between Graphite data points
stored in VictoriaMetrics. For example, `Storage-Step: 10s` would mean 10 seconds distance between Graphite datapoints stored in VictoriaMetrics.
### Graphite Metrics API usage
@@ -819,7 +864,7 @@ VictoriaMetrics supports the following handlers from [Graphite Metrics API](http
VictoriaMetrics accepts the following additional query args at `/metrics/find` and `/metrics/expand`:
* `label` - for selecting arbitrary label values. By default `label=__name__`, i.e. metric names are selected.
* `label` - for selecting arbitrary label values. By default, `label=__name__`, i.e. metric names are selected.
* `delimiter` - for using different delimiters in metric name hierarchy. For example, `/metrics/find?delimiter=_&query=node_*` would return all the metric name prefixes
that start with `node_`. By default `delimiter=.`.
@@ -945,7 +990,7 @@ Note that background merges may never occur for data from previous months, so st
In this case [forced merge](#forced-merge) may help freeing up storage space.
It is recommended verifying which metrics will be deleted with the call to `http://<victoria-metrics-addr>:8428/api/v1/series?match[]=<timeseries_selector_for_delete>`
before actually deleting the metrics. By default this query will only scan series in the past 5 minutes, so you may need to
before actually deleting the metrics. By default, this query will only scan series in the past 5 minutes, so you may need to
adjust `start` and `end` to a suitable range to achieve match hits.
The `/api/v1/admin/tsdb/delete_series` handler may be protected with `authKey` if `-deleteAuthKey` command-line flag is set.
@@ -1311,7 +1356,8 @@ Example contents for `-relabelConfig` file:
VictoriaMetrics provides additional relabeling features such as Graphite-style relabeling.
See [these docs](https://docs.victoriametrics.com/vmagent.html#relabeling) for more details.
The relabeling can be debugged at `http://victoriametrics:8428/metric-relabel-debug` page.
The relabeling can be debugged at `http://victoriametrics:8428/metric-relabel-debug` page
or at our [public playground](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/#/relabeling).
See [these docs](https://docs.victoriametrics.com/vmagent.html#relabel-debug) for more details.
@@ -1358,7 +1404,7 @@ See also [resource usage limits docs](#resource-usage-limits).
## Resource usage limits
By default VictoriaMetrics is tuned for an optimal resource usage under typical workloads. Some workloads may need fine-grained resource usage limits. In these cases the following command-line flags may be useful:
By default, VictoriaMetrics is tuned for an optimal resource usage under typical workloads. Some workloads may need fine-grained resource usage limits. In these cases the following command-line flags may be useful:
- `-memory.allowedPercent` and `-memory.allowedBytes` limit the amounts of memory, which may be used for various internal caches at VictoriaMetrics. Note that VictoriaMetrics may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis.
- `-search.maxMemoryPerQuery` limits the amounts of memory, which can be used for processing a single query. Queries, which need more memory, are rejected. Heavy queries, which select big number of time series, may exceed the per-query memory limit by a small percent. The total memory limit for concurrently executed queries can be estimated as `-search.maxMemoryPerQuery` multiplied by `-search.maxConcurrentRequests`.
@@ -1420,22 +1466,37 @@ with the enabled de-duplication. See [this section](#deduplication) for details.
## Deduplication
VictoriaMetrics leaves a single raw sample with the biggest timestamp per each `-dedup.minScrapeInterval` discrete interval
if `-dedup.minScrapeInterval` is set to positive duration. For example, `-dedup.minScrapeInterval=60s` would leave a single
raw sample with the biggest timestamp per each discrete 60s interval.
VictoriaMetrics leaves a single [raw sample](https://docs.victoriametrics.com/keyConcepts.html#raw-samples)
with the biggest [timestamp](https://en.wikipedia.org/wiki/Unix_time) for each [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series)
per each `-dedup.minScrapeInterval` discrete interval if `-dedup.minScrapeInterval` is set to positive duration.
For example, `-dedup.minScrapeInterval=60s` would leave a single raw sample with the biggest timestamp per each discrete
`60s` interval.
This aligns with the [staleness rules in Prometheus](https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness).
If multiple raw samples have the same biggest timestamp on the given `-dedup.minScrapeInterval` discrete interval, then the sample with the biggest value is left.
If multiple raw samples have **the same timestamp** on the given `-dedup.minScrapeInterval` discrete interval,
then the sample with **the biggest value** is kept.
The `-dedup.minScrapeInterval=D` is equivalent to `-downsampling.period=0s:D` if [downsampling](#downsampling) is enabled. So it is safe to use deduplication and downsampling simultaneously.
Please note, [labels](https://docs.victoriametrics.com/keyConcepts.html#labels) of raw samples should be identical
in order to be deduplicated. For example, this is why [HA pair of vmagents](https://docs.victoriametrics.com/vmagent.html#high-availability)
needs to be identically configured.
The recommended value for `-dedup.minScrapeInterval` must equal to `scrape_interval` config from Prometheus configs. It is recommended to have a single `scrape_interval` across all the scrape targets. See [this article](https://www.robustperception.io/keep-it-simple-scrape_interval-id) for details.
The `-dedup.minScrapeInterval=D` is equivalent to `-downsampling.period=0s:D` if [downsampling](#downsampling) is enabled.
So it is safe to use deduplication and downsampling simultaneously.
The de-duplication reduces disk space usage if multiple identically configured [vmagent](https://docs.victoriametrics.com/vmagent.html) or Prometheus instances in HA pair
write data to the same VictoriaMetrics instance. These vmagent or Prometheus instances must have identical
`external_labels` section in their configs, so they write data to the same time series. See also [how to set up multiple vmagent instances for scraping the same targets](https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets).
The recommended value for `-dedup.minScrapeInterval` must equal to `scrape_interval` config from Prometheus configs.
It is recommended to have a single `scrape_interval` across all the scrape targets.
See [this article](https://www.robustperception.io/keep-it-simple-scrape_interval-id) for details.
It is recommended passing different `-promscrape.cluster.name` values to HA pairs of `vmagent` instances, so the de-duplication consistently leaves samples for one `vmagent` instance and removes duplicate samples from other `vmagent` instances. See [these docs](https://docs.victoriametrics.com/vmagent.html#high-availability) for details.
The de-duplication reduces disk space usage if multiple **identically configured** [vmagent](https://docs.victoriametrics.com/vmagent.html)
or Prometheus instances in HA pair write data to the same VictoriaMetrics instance.
These vmagent or Prometheus instances must have **identical** `external_labels` section in their configs,
so they write data to the same time series.
See also [how to set up multiple vmagent instances for scraping the same targets](https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets).
It is recommended passing different `-promscrape.cluster.name` values to each distinct HA pair of `vmagent` instances,
so the de-duplication consistently leaves samples for one `vmagent` instance and removes duplicate samples
from other `vmagent` instances.
See [these docs](https://docs.victoriametrics.com/vmagent.html#high-availability) for details.
## Storage
@@ -1447,12 +1508,14 @@ can be configured with the `-inmemoryDataFlushInterval` command-line flag (note
In-memory parts are persisted to disk into `part` directories under the `<-storageDataPath>/data/small/YYYY_MM/` folder,
where `YYYY_MM` is the month partition for the stored data. For example, `2022_11` is the partition for `parts`
with [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) from `November 2022`.
Each partition directory contains `parts.json` file with the actual list of parts in the partition.
The `part` directory has the following name pattern: `rowsCount_blocksCount_minTimestamp_maxTimestamp`, where:
Every `part` directory contains `metadata.json` file with the following fields:
- `rowsCount` - the number of [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) stored in the part
- `blocksCount` - the number of blocks stored in the part (see details about blocks below)
- `minTimestamp` and `maxTimestamp` - minimum and maximum timestamps across raw samples stored in the part
- `RowsCount` - the number of [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) stored in the part
- `BlocksCount` - the number of blocks stored in the part (see details about blocks below)
- `MinTimestamp` and `MaxTimestamp` - minimum and maximum timestamps across raw samples stored in the part
- `MinDedupInterval` - the [deduplication interval](#deduplication) applied to the given part.
Each `part` consists of `blocks` sorted by internal time series id (aka `TSID`).
Each `block` contains up to 8K [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples),
@@ -1474,11 +1537,10 @@ for fast block lookups, which belong to the given `TSID` and cover the given tim
and [freeing up disk space for the deleted time series](#how-to-delete-time-series) are performed during the merge
Newly added `parts` either successfully appear in the storage or fail to appear.
The newly added `parts` are being created in a temporary directory under `<-storageDataPath>/data/{small,big}/YYYY_MM/tmp` folder.
When the newly added `part` is fully written and [fsynced](https://man7.org/linux/man-pages/man2/fsync.2.html)
to a temporary directory, then it is atomically moved to the storage directory.
Thanks to this alogrithm, storage never contains partially created parts, even if hardware power off
occurrs in the middle of writing the `part` to disk - such incompletely written `parts`
The newly added `part` is atomically registered in the `parts.json` file under the corresponding partition
after it is fully written and [fsynced](https://man7.org/linux/man-pages/man2/fsync.2.html) to the storage.
Thanks to this algorithm, storage never contains partially created parts, even if hardware power off
occurs in the middle of writing the `part` to disk - such incompletely written `parts`
are automatically deleted on the next VictoriaMetrics start.
The same applies to merge process — `parts` are either fully merged into a new `part` or fail to merge,
@@ -1505,15 +1567,14 @@ Retention is configured with the `-retentionPeriod` command-line flag, which tak
Data is split in per-month partitions inside `<-storageDataPath>/data/{small,big}` folders.
Data partitions outside the configured retention are deleted on the first day of the new month.
Each partition consists of one or more data parts with the following name pattern `rowsCount_blocksCount_minTimestamp_maxTimestamp`.
Data parts outside of the configured retention are eventually deleted during
Each partition consists of one or more data parts. Data parts outside the configured retention are eventually deleted during
[background merge](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
The maximum disk space usage for a given `-retentionPeriod` is going to be (`-retentionPeriod` + 1) months.
For example, if `-retentionPeriod` is set to 1, data for January is deleted on March 1st.
Please note, the time range covered by data part is not limited by retention period unit. Hence, data part may contain data
for multiple days and will be deleted only when fully outside of the configured retention.
for multiple days and will be deleted only when fully outside the configured retention.
It is safe to extend `-retentionPeriod` on existing data. If `-retentionPeriod` is set to a lower
value than before, then data outside the configured period will be eventually deleted.
@@ -1557,7 +1618,7 @@ For example, the following config sets 3 days retention for time series with `te
Important notes:
- The data outside of the configured retention isn't deleted instantly - it is deleted eventually during [background merges](https://docs.victoriametrics.com/#storage).
- The data outside the configured retention isn't deleted instantly - it is deleted eventually during [background merges](https://docs.victoriametrics.com/#storage).
- The `-retentionFilter` doesn't remove old data from `indexdb` (aka inverted index) until the configured [-retentionPeriod](#retention).
So the `indexdb` size can grow big under [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate)
even for small retentions configured via `-retentionFilter`.
@@ -1579,6 +1640,10 @@ Retention filters can be evaluated for free by downloading and using enterprise
Downsampling is applied independently per each time series. It can reduce disk space usage and improve query performance if it is applied to time series with big number of samples per each series. The downsampling doesn't improve query performance if the database contains big number of time series with small number of samples per each series (aka [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate)), since downsampling doesn't reduce the number of time series. So the majority of time is spent on searching for the matching time series. It is possible to use recording rules in [vmalert](https://docs.victoriametrics.com/vmalert.html) in order to reduce the number of time series. See [these docs](https://docs.victoriametrics.com/vmalert.html#downsampling-and-aggregation-via-vmalert).
Downsampling happens during [background merges](https://docs.victoriametrics.com/#storage)
and can't be performed if there is not enough of free disk space or if vmstorage
is in [read-only mode](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#readonly-mode).
The downsampling can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
## Multi-tenancy
@@ -1899,7 +1964,14 @@ are added to all the metrics before sending them to the remote storage:
## Cache removal
VictoriaMetrics uses various internal caches. These caches are stored to `<-storageDataPath>/cache` directory during graceful shutdown (e.g. when VictoriaMetrics is stopped by sending `SIGINT` signal). The caches are read on the next VictoriaMetrics startup. Sometimes it is needed to remove such caches on the next startup. This can be performed by placing `reset_cache_on_startup` file inside the `<-storageDataPath>/cache` directory before the restart of VictoriaMetrics. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1447) for details.
VictoriaMetrics uses various internal caches. These caches are stored to `<-storageDataPath>/cache` directory during graceful shutdown
(e.g. when VictoriaMetrics is stopped by sending `SIGINT` signal). The caches are read on the next VictoriaMetrics startup.
Sometimes it is needed to remove such caches on the next startup. This can be done in the following ways:
- By manually removing the `<-storageDataPath>/cache` directory when VictoriaMetrics is stopped.
- By placing `reset_cache_on_startup` file inside the `<-storageDataPath>/cache` directory before the restart of VictoriaMetrics.
In this case VictoriaMetrics will automatically remove all the caches on the next start.
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1447) for details.
## Cache tuning
@@ -2004,9 +2076,9 @@ Enterprise binaries can be downloaded and evaluated for free from [the releases
A single-node VictoriaMetrics is capable of proxying requests to [vmalert](https://docs.victoriametrics.com/vmalert.html)
when `-vmalert.proxyURL` flag is set. Use this feature for the following cases:
* for proxying requests from [Grafana Alerting UI](https://grafana.com/docs/grafana/latest/alerting/);
* for accessing vmalert's UI through single-node VictoriaMetrics Web interface.
* for accessing vmalerts UI through single-node VictoriaMetrics Web interface.
For accessing vmalert's UI through single-node VictoriaMetrics configure `-vmalert.proxyURL` flag and visit
For accessing vmalerts UI through single-node VictoriaMetrics configure `-vmalert.proxyURL` flag and visit
`http://<victoriametrics-addr>:8428/vmalert/` link.
## Benchmarks
@@ -2137,7 +2209,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
```
-bigMergeConcurrency int
The maximum number of CPU cores to use for big merges. Default value is used if set to 0
Deprecated: this flag does nothing. Please use -smallMergeConcurrency for controlling the concurrency of background merges. See https://docs.victoriametrics.com/#storage
-cacheExpireDuration duration
Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s)
-configAuthKey string
@@ -2154,16 +2226,16 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-deleteAuthKey string
authKey for metrics' deletion via /api/v1/admin/tsdb/delete_series and /tags/delSeries
-denyQueriesOutsideRetention
Whether to deny queries outside of the configured -retentionPeriod. When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. This may be useful when multiple data sources with distinct retentions are hidden behind query-tee
Whether to deny queries outside the configured -retentionPeriod. When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. This may be useful when multiple data sources with distinct retentions are hidden behind query-tee
-denyQueryTracing
Whether to disable the ability to trace queries. See https://docs.victoriametrics.com/#query-tracing
-downsampling.period array
Comma-separated downsampling periods in the format 'offset:period'. For example, '30d:10m' instructs to leave a single sample per 10 minutes for samples older than 30 days. See https://docs.victoriametrics.com/#downsampling for details. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
Supports an array of values separated by comma or specified via multiple flags.
-dryRun
Whether to check only -promscrape.config and then exit. Unknown config entries aren't allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse=false command-line flag
Whether to check config files without running VictoriaMetrics. The following config files are checked: -promscrape.config, -relabelConfig and -streamAggr.config. Unknown config entries aren't allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse=false command-line flag
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
-envflag.prefix string
@@ -2179,7 +2251,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-forceMergeAuthKey string
authKey, which must be passed in query string to /internal/force_merge pages
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-graphiteListenAddr string
TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty. See also -graphiteListenAddr.useProxyProtocol
-graphiteListenAddr.useProxyProtocol
@@ -2189,7 +2261,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -2226,7 +2298,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-influxSkipMeasurement
Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'
-influxSkipSingleField
Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if InfluxDB line contains only a single field
Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if InfluxDB line contains only a single field
-influxTrimTimestamp duration
Trim timestamps for InfluxDB line protocol data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
-inmemoryDataFlushInterval duration
@@ -2234,7 +2306,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-insert.maxQueueDuration duration
The maximum duration to wait in the queue when -maxConcurrentInserts concurrent insert requests are executed (default 1m0s)
-internStringCacheExpireDuration duration
The expire duration for caches for interned strings. See https://en.wikipedia.org/wiki/String_interning . See also -internStringMaxLen and -internStringDisableCache (default 6m0s)
The expiry duration for caches for interned strings. See https://en.wikipedia.org/wiki/String_interning . See also -internStringMaxLen and -internStringDisableCache (default 6m0s)
-internStringDisableCache
Whether to disable caches for interned strings. This may reduce memory usage at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringCacheExpireDuration and -internStringMaxLen
-internStringMaxLen int
@@ -2297,9 +2369,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-promscrape.azureSDCheckInterval duration
Interval for checking for changes in Azure. This works only if azure_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#azure_sd_configs for details (default 1m0s)
-promscrape.cluster.memberNum string
The number of number in the cluster of scrapers. It must be an unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
The number of number in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
-promscrape.cluster.membersCount int
The number of members in a cluster of scrapers. Each member must have an unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets
-promscrape.cluster.name string
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
-promscrape.cluster.replicationFactor int
@@ -2311,7 +2383,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-promscrape.config.strictParse
Whether to deny unsupported fields in -promscrape.config . Set to false in order to silently skip unsupported fields (default true)
-promscrape.configCheckInterval duration
Interval for checking for changes in '-promscrape.config' file. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
Interval for checking for changes in '-promscrape.config' file. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes
-promscrape.consul.waitTime duration
Wait time used by Consul service discovery. Default value is used if not set
-promscrape.consulSDCheckInterval duration
@@ -2319,9 +2391,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-promscrape.digitaloceanSDCheckInterval duration
Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#digitalocean_sd_configs for details (default 1m0s)
-promscrape.disableCompression
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine-grained control
-promscrape.disableKeepAlive
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine-grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
-promscrape.discovery.concurrency int
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
-promscrape.discovery.concurrentWaitTime duration
@@ -2372,7 +2444,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-promscrape.seriesLimitPerTarget int
Optional limit on the number of unique time series a single scrape target can expose. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter for more info
-promscrape.streamParse
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is possible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is possible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine-grained control
-promscrape.suppressDuplicateScrapeTargetErrors
Whether to suppress 'duplicate scrape target' errors; see https://docs.victoriametrics.com/vmagent.html#troubleshooting for details
-promscrape.suppressScrapeErrors
@@ -2387,7 +2459,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-pushmetrics.interval duration
Interval for pushing metrics to -pushmetrics.url (default 10s)
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default metrics exposed at /metrics page aren't pushed to any remote storage
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-relabelConfig string
Optional path to a file with relabeling rules, which are applied to all the ingested metrics. The path can point either to local file or to http url. See https://docs.victoriametrics.com/#relabeling for details. The config is reloaded on SIGHUP signal
@@ -2406,9 +2478,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-search.disableCache
Whether to disable response caching. This may be useful during data backfilling
-search.graphiteMaxPointsPerSeries int
The maximum number of points per series Graphite render API can return. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default 1000000)
The maximum number of points per series Graphite render API can return (default 1000000)
-search.graphiteStorageStep duration
The interval between datapoints stored in the database. It is used at Graphite Render API handler for normalizing the interval between datapoints in case it isn't normalized. It can be overridden by sending 'storage_step' query arg to /render API or by sending the desired interval via 'Storage-Step' http header during querying /render API. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default 10s)
The interval between datapoints stored in the database. It is used at Graphite Render API handler for normalizing the interval between datapoints in case it isn't normalized. It can be overridden by sending 'storage_step' query arg to /render API or by sending the desired interval via 'Storage-Step' http header during querying /render API (default 10s)
-search.latencyOffset duration
The time when data points become visible in query results after the collection. It can be overridden on per-query basis via latency_offset arg. Too small value can result in incomplete last points for query results (default 30s)
-search.logQueryMemoryUsage size
@@ -2425,9 +2497,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-search.maxFederateSeries int
The maximum number of time series, which can be returned from /federate. This option allows limiting memory usage (default 1000000)
-search.maxGraphiteSeries int
The maximum number of time series, which can be scanned during queries to Graphite Render API. See https://docs.victoriametrics.com/#graphite-render-api-usage . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default 300000)
The maximum number of time series, which can be scanned during queries to Graphite Render API. See https://docs.victoriametrics.com/#graphite-render-api-usage (default 300000)
-search.maxLookback duration
Synonym to -search.lookback-delta from Prometheus. The value is dynamically detected from interval between time series datapoints if not set. It can be overridden on per-query basis via max_lookback arg. See also '-search.maxStalenessInterval' flag, which has the same meaining due to historical reasons
Synonym to -search.lookback-delta from Prometheus. The value is dynamically detected from interval between time series datapoints if not set. It can be overridden on per-query basis via max_lookback arg. See also '-search.maxStalenessInterval' flag, which has the same meaning due to historical reasons
-search.maxMemoryPerQuery size
The maximum amounts of memory a single query may consume. Queries requiring more memory are rejected. The total memory limit for concurrently executed queries can be estimated as -search.maxMemoryPerQuery multiplied by -search.maxConcurrentRequests . See also -search.logQueryMemoryUsage
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 0)
@@ -2451,7 +2523,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-search.maxSeriesPerAggrFunc int
The maximum number of time series an aggregate MetricsQL function can generate (default 1000000)
-search.maxStalenessInterval duration
The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.setLookbackToStep' flag
The maximum interval for staleness calculations. By default, it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.setLookbackToStep' flag
-search.maxStatusRequestDuration duration
The maximum duration for /api/v1/status/* requests (default 5m0s)
-search.maxStepForPointsAdjustment duration
@@ -2487,7 +2559,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-selfScrapeJob string
Value for 'job' label, which is added to self-scraped metrics (default "victoria-metrics")
-smallMergeConcurrency int
The maximum number of CPU cores to use for small merges. Default value is used if set to 0
The maximum number of workers for background merges. See https://docs.victoriametrics.com/#storage . It isn't recommended tuning this flag in general case, since this may lead to uncontrolled increase in the number of parts and increased CPU usage during queries
-snapshotAuthKey string
authKey, which must be passed in query string to /snapshot* pages
-snapshotCreateTimeout duration
@@ -2523,7 +2595,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
-streamAggr.dedupInterval duration
Input samples are de-duplicated with this interval before being aggregated. Only the last sample per each time series per each interval is aggregated if the interval is greater than zero
-streamAggr.keepInput
Whether to keep input samples after the aggregation with -streamAggr.config. By default the input is dropped after the aggregation, so only the aggregate data is stored. See https://docs.victoriametrics.com/stream-aggregation.html
Whether to keep input samples after the aggregation with -streamAggr.config. By default, the input is dropped after the aggregation, so only the aggregate data is stored. See https://docs.victoriametrics.com/stream-aggregation.html
-tls
Whether to enable TLS for incoming HTTP requests at -httpListenAddr (aka https). -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string

View File

@@ -4,10 +4,10 @@
| Version | Supported |
|---------|--------------------|
| 1.81.x | :white_check_mark: |
| 1.80.x | :x: |
| 1.79.x | :white_check_mark: |
| < 1.78 | :x: |
| [latest release](https://docs.victoriametrics.com/CHANGELOG.html) | :white_check_mark: |
| v1.87.x LTS release | :white_check_mark: |
| v1.79.x LTS release | :white_check_mark: |
| other releases | :x: |
## Reporting a Vulnerability

View File

@@ -39,6 +39,9 @@ victoria-metrics-freebsd-amd64-prod:
victoria-metrics-openbsd-amd64-prod:
APP_NAME=victoria-metrics $(MAKE) app-via-docker-openbsd-amd64
victoria-metrics-windows-amd64-prod:
APP_NAME=victoria-metrics $(MAKE) app-via-docker-windows-amd64
package-victoria-metrics:
APP_NAME=victoria-metrics $(MAKE) package-via-docker
@@ -100,6 +103,9 @@ victoria-metrics-freebsd-amd64:
victoria-metrics-openbsd-amd64:
APP_NAME=victoria-metrics CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
victoria-metrics-windows-amd64:
GOARCH=amd64 APP_NAME=victoria-metrics $(MAKE) app-local-windows-goarch
victoria-metrics-pure:
APP_NAME=victoria-metrics $(MAKE) app-local-pure

View File

@@ -8,6 +8,8 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
vminsertcommon "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common"
vminsertrelabel "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/relabel"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
@@ -30,8 +32,9 @@ var (
"With enabled proxy protocol http server cannot serve regular /metrics endpoint. Use -pushmetrics.url for metrics pushing")
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Leave only the last sample in every time series per each discrete interval "+
"equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/#deduplication and https://docs.victoriametrics.com/#downsampling")
dryRun = flag.Bool("dryRun", false, "Whether to check only -promscrape.config and then exit. "+
"Unknown config entries aren't allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse=false command-line flag")
dryRun = flag.Bool("dryRun", false, "Whether to check config files without running VictoriaMetrics. The following config files are checked: "+
"-promscrape.config, -relabelConfig and -streamAggr.config. Unknown config entries aren't allowed in -promscrape.config by default. "+
"This can be changed with -promscrape.config.strictParse=false command-line flag")
inmemoryDataFlushInterval = flag.Duration("inmemoryDataFlushInterval", 5*time.Second, "The interval for guaranteed saving of in-memory data to disk. "+
"The saved data survives unclean shutdown such as OOM crash, hardware reset, SIGKILL, etc. "+
"Bigger intervals may help increasing lifetime of flash storage with limited write cycles (e.g. Raspberry PI). "+
@@ -54,6 +57,12 @@ func main() {
if err := promscrape.CheckConfig(); err != nil {
logger.Fatalf("error when checking -promscrape.config: %s", err)
}
if err := vminsertrelabel.CheckRelabelConfig(); err != nil {
logger.Fatalf("error when checking -relabelConfig: %s", err)
}
if err := vminsertcommon.CheckStreamAggrConfig(); err != nil {
logger.Fatalf("error when checking -streamAggr.config: %s", err)
}
logger.Infof("-promscrape.config is ok; exiting with 0 status code")
return
}

View File

@@ -13,7 +13,7 @@ See [Quick Start](#quick-start) for details.
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
and RAM friendly to scrape metrics from Prometheus-compatible exporters into VictoriaMetrics.
Also, we found that our user's infrastructure are like snowflakes in that no two are alike. Therefore we decided to add more flexibility
Also, we found that our user's infrastructure are like snowflakes in that no two are alike. Therefore, we decided to add more flexibility
to `vmagent` such as the ability to [accept metrics via popular push protocols](#how-to-push-data-to-vmagent)
additionally to [discovering Prometheus-compatible targets and scraping metrics from them](#how-to-collect-metrics-in-prometheus-format).
@@ -54,10 +54,19 @@ and sending the data to the Prometheus-compatible remote storage:
The path can point either to local file or to http url. `vmagent` doesn't support some sections of Prometheus config file,
so you may need either to delete these sections or to run `vmagent` with `-promscrape.config.strictParse=false` command-line flag.
In this case `vmagent` ignores unsupported sections. See [the list of unsupported sections](#unsupported-prometheus-config-sections).
* `-remoteWrite.url` with Prometheus-compatible remote storage endpoint such as VictoriaMetrics, the `-remoteWrite.url` argument can be specified
multiple times to replicate data concurrently to multiple remote storage systems. See [various use cases](#use-cases).
* `-remoteWrite.url` with Prometheus-compatible remote storage endpoint such as VictoriaMetrics.
Example command line:
Example command for writing the data received via [supported push-based protocols](#how-to-push-data-to-vmagent)
to [single-node VictoriaMetrics](https://docs.victoriametrics.com/) located at `victoria-metrics-host:8428`:
```console
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
See [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format) if you need writing
the data to [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html).
Example command for scraping Prometheus targets and writing the data to single-node VictoriaMetrics:
```console
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
@@ -68,18 +77,12 @@ See [how to scrape Prometheus-compatible targets](#how-to-collect-metrics-in-pro
If you use single-node VictoriaMetrics, then you can discover and scrape Prometheus-compatible targets directly from VictoriaMetrics
without the need to use `vmagent` - see [these docs](https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter).
If you don't need to scrape Prometheus-compatible targets, then the `-promscrape.config` option isn't needed.
For example, the following command is sufficient for accepting data via [supported push-based protocols](#how-to-push-data-to-vmagent)
and sending it to the provided `-remoteWrite.url`:
```console
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
`vmagent` can save network bandwidth usage costs under high load when [VictoriaMetrics remote write protocol is enabled](#victoriametrics-remote-write-protocol).
`vmagent` can save network bandwidth usage costs under high load when [VictoriaMetrics remote write protocol is used](#victoriametrics-remote-write-protocol).
See [troubleshooting docs](#troubleshooting) if you encounter common issues with `vmagent`.
See [various use cases](#use-cases) for vmagent.
Pass `-help` to `vmagent` in order to see [the full list of supported command-line flags with their descriptions](#advanced-usage).
## How to push data to vmagent
@@ -101,7 +104,7 @@ additionally to pull-based Prometheus-compatible targets' scraping:
`vmagent` should be restarted in order to update config options set via command-line args.
`vmagent` supports multiple approaches for reloading configs from updated config files such as
`-promscrape.config`, `-remoteWrite.relabelConfig` and `-remoteWrite.urlRelabelConfig`:
`-promscrape.config`, `-remoteWrite.relabelConfig`, `-remoteWrite.urlRelabelConfig` and `-remoteWrite.streamAggr.config`:
* Sending `SIGHUP` signal to `vmagent` process:
@@ -148,9 +151,14 @@ to other remote storage systems, which support Prometheus `remote_write` protoco
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instance.
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again,
and then it sends the buffered data to the remote storage in order to prevent data gaps.
[VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) already supports replication,
so there is no need in specifying multiple `-remoteWrite.url` flags when writing data to the same cluster.
See [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#replication-and-data-safety).
### Relabeling and filtering
`vmagent` can add, remove or update labels on the collected data before sending it to the remote storage. Additionally,
@@ -162,7 +170,7 @@ Please see [these docs](#relabeling) for details.
`vmagent` supports splitting the collected data between multiple destinations with the help of `-remoteWrite.urlRelabelConfig`,
which is applied independently for each configured `-remoteWrite.url` destination. For example, it is possible to replicate or split
data among long-term remote storage, short-term remote storage and a real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
Note that each destination can receive it's own subset of the collected data due to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
Note that each destination can receive its own subset of the collected data due to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
### Prometheus remote_write proxy
@@ -174,7 +182,7 @@ Also, Basic Auth can be enabled for the incoming `remote_write` requests with `-
### remote_write for clustered version
While `vmagent` can accept data in several supported protocols (OpenTSDB, Influx, Prometheus, Graphite) and scrape data from various targets,
writes are always performed in Promethes remote_write protocol. Therefore for the [clustered version](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html),
writes are always performed in Prometheus remote_write protocol. Therefore, for the [clustered version](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html),
the `-remoteWrite.url` command-line flag should be configured as `<schema>://<vminsert-host>:8480/insert/<accountID>/prometheus/api/v1/write`
according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format).
There is also support for multitenant writes. See [these docs](#multitenancy).
@@ -228,7 +236,7 @@ This allows using a single `vmagent` instance in front of multiple VictoriaMetri
If `-remoteWrite.multitenantURL` command-line flag is set and `vmagent` is configured to scrape Prometheus-compatible targets
(e.g. if `-promscrape.config` command-line flag is set) then `vmagent` reads tenantID from `__tenant_id__` label
for the discovered targets and routes all the metrics from this target to the given `__tenant_id__`,
e.g. to the url `<-remoteWrite.multitnenatURL>/insert/<__tenant_id__>/prometheus/api/v1/write`.
e.g. to the url `<-remoteWrite.multitenantURL>/insert/<__tenant_id__>/prometheus/api/v1/write`.
For example, the following relabeling rule instructs sending metrics to tenantID defined in the `prometheus.io/tenant` annotation of Kubernetes pod deployment:
@@ -274,14 +282,14 @@ scrape_configs:
- "My-Auth: TopSecret"
```
* `disable_compression: true` for disabling response compression on a per-job basis. By default `vmagent` requests compressed responses
* `disable_compression: true` for disabling response compression on a per-job basis. By default, `vmagent` requests compressed responses
from scrape targets for saving network bandwidth.
* `disable_keepalive: true` for disabling [HTTP keep-alive connections](https://en.wikipedia.org/wiki/HTTP_persistent_connection)
on a per-job basis. By default `vmagent` uses keep-alive connections to scrape targets for reducing overhead on connection re-establishing.
on a per-job basis. By default, `vmagent` uses keep-alive connections to scrape targets for reducing overhead on connection re-establishing.
* `series_limit: N` for limiting the number of unique time series a single scrape target can expose. See [these docs](#cardinality-limiter).
* `stream_parse: true` for scraping targets in a streaming manner. This may be useful when targets export big number of metrics. See [these docs](#stream-parsing-mode).
* `scrape_align_interval: duration` for aligning scrapes to the given interval instead of using random offset
in the range `[0 ... scrape_interval]` for scraping each target. The random offset helps spreading scrapes evenly in time.
in the range `[0 ... scrape_interval]` for scraping each target. The random offset helps to spread scrapes evenly in time.
* `scrape_offset: duration` for specifying the exact offset for scraping instead of using random offset in the range `[0 ... scrape_interval]`.
See [scrape_configs docs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs) for more details on all the supported options.
@@ -330,7 +338,7 @@ There is no need in specifying top-level `scrape_configs` section in these files
The list of supported service discovery types is available [here](#how-to-collect-metrics-in-prometheus-format).
Additionally `vmagent` doesn't support `refresh_interval` option at service discovery sections.
Additionally, `vmagent` doesn't support `refresh_interval` option at service discovery sections.
This option is substituted with `-promscrape.*CheckInterval` command-line options, which are specific per each service discovery type.
See [the full list of command-line flags for vmagent](#advanced-usage).
@@ -452,7 +460,7 @@ VictoriaMetrics components support [Prometheus-compatible relabeling](https://pr
with [additional enhancements](#relabeling-enhancements). The relabeling can be defined in the following places processed by `vmagent`:
* At the `scrape_config -> relabel_configs` section in `-promscrape.config` file.
This relabeling is used for modifying labels in discovered targets and for dropping unneded targets.
This relabeling is used for modifying labels in discovered targets and for dropping unneeded targets.
See [relabeling cookbook](https://docs.victoriametrics.com/relabeling.html) for details.
This relabeling can be debugged by clicking the `debug` link at the corresponding target on the `http://vmagent:8429/targets` page
@@ -539,7 +547,7 @@ The following articles contain useful information about Prometheus relabeling:
* VictoriaMetrics provides the following additional relabeling actions on top of standard actions
from the [Prometheus relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config):
* `replace_all` replaces all of the occurrences of `regex` in the values of `source_labels` with the `replacement`
* `replace_all` replaces all the occurrences of `regex` in the values of `source_labels` with the `replacement`
and stores the results in the `target_label`. For example, the following relabeling config replaces all the occurrences
of `-` char in metric names with `_` char (e.g. `foo-bar-baz` metric name is transformed into `foo_bar_baz`):
@@ -551,7 +559,7 @@ The following articles contain useful information about Prometheus relabeling:
replacement: "_"
```
* `labelmap_all` replaces all of the occurrences of `regex` in all the label names with the `replacement`.
* `labelmap_all` replaces all the occurrences of `regex` in all the label names with the `replacement`.
For example, the following relabeling config replaces all the occurrences of `-` char in all the label names
with `_` char (e.g. `foo-bar-baz` label name is transformed into `foo_bar_baz`):
@@ -637,18 +645,19 @@ provide the following tools for debugging target-level and metric-level relabeli
- Target-level debugging (e.g. `relabel_configs` section at [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs))
can be performed by navigating to `http://vmagent:8429/targets` page (`http://victoriametrics:8428/targets` page for single-node VictoriaMetrics)
and clicking the `debug target relabeling` link at the target, which must be debugged.
The opened page will show step-by-step results for the actual target relabeling rules applied to the discovered target labels.
The opened page shows step-by-step results for the actual target relabeling rules applied to the discovered target labels.
The page shows also the target URL generated after applying all the relabeling rules.
The `http://vmagent:8429/targets` page shows only active targets. If you need to understand why some target
is dropped during the relabeling, then navigate to `http://vmagent:8428/service-discovery` page
(`http://victoriametrics:8428/service-discovery` for single-node VictoriaMetrics), find the dropped target
and click the `debug` link there. The opened page will show step-by-step results for the actual relabeling rules,
and click the `debug` link there. The opened page shows step-by-step results for the actual relabeling rules,
which result to target drop.
- Metric-level debugging (e.g. `metric_relabel_configs` section at [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs)
can be performed by navigating to `http://vmagent:8429/targets` page (`http://victoriametrics:8428/targets` page for single-node VictoriaMetrics)
and clicking the `debug metrics relabeling` link at the target, which must be debugged.
The opened page will show step-by-step results for the actual metric relabeling rules applied to the given target labels.
The opened page shows step-by-step results for the actual metric relabeling rules applied to the given target labels.
## Prometheus staleness markers
@@ -670,7 +679,7 @@ e.g. it sets `scrape_series_added` metric to zero. See [these docs](#automatical
## Stream parsing mode
By default `vmagent` reads the full response body from scrape target into memory, then parses it, applies [relabeling](#relabeling)
By default, `vmagent` reads the full response body from scrape target into memory, then parses it, applies [relabeling](#relabeling)
and then pushes the resulting metrics to the configured `-remoteWrite.url`. This mode works good for the majority of cases
when the scrape target exposes small number of metrics (e.g. less than 10 thousand). But this mode may take big amounts of memory
when the scrape target exposes big number of metrics. In this case it is recommended enabling stream parsing mode.
@@ -698,8 +707,8 @@ scrape_configs:
stream_parse: true
static_configs:
- targets:
- big-prometeus1
- big-prometeus2
- big-prometheus1
- big-prometheus2
honor_labels: true
metrics_path: /federate
params:
@@ -707,7 +716,7 @@ scrape_configs:
```
Note that `vmagent` in stream parsing mode stores up to `sample_limit` samples to the configured `-remoteStorage.url`
instead of droping all the samples read from the target, because the parsed data is sent to the remote storage
instead of dropping all the samples read from the target, because the parsed data is sent to the remote storage
as soon as it is parsed in stream parsing mode.
## Scraping big number of targets
@@ -727,7 +736,7 @@ spread scrape targets among a cluster of two `vmagent` instances:
The `-promscrape.cluster.memberNum` can be set to a StatefulSet pod name when `vmagent` runs in Kubernetes.
The pod name must end with a number in the range `0 ... promscrape.cluster.memberNum-1`. For example, `-promscrape.cluster.memberNum=vmagent-0`.
By default each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances,
By default, each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances,
then `-promscrape.cluster.replicationFactor` command-line flag must be set to the desired number of replicas. For example, the following commands
start a cluster of three `vmagent` instances, where each target is scraped by two `vmagent` instances:
@@ -743,14 +752,18 @@ See [these docs](https://docs.victoriametrics.com/#deduplication) for details.
## High availability
It is possible to run multiple identically configured `vmagent` instances or `vmagent` [clusters](#scraping-big-number-of-targets),
so they [scrape](#how-to-collect-metrics-in-prometheus-format) the same set of targets and push the collected data to the same set of VictoriaMetrics remote storage systems.
It is possible to run multiple **identically configured** `vmagent` instances or `vmagent`
[clusters](#scraping-big-number-of-targets), so they [scrape](#how-to-collect-metrics-in-prometheus-format)
the same set of targets and push the collected data to the same set of VictoriaMetrics remote storage systems.
Two **identically configured** vmagent instances or clusters is usually called an HA pair.
In this case the deduplication must be configured at VictoriaMetrics in order to de-duplicate samples received from multiple identically configured `vmagent` instances or clusters.
When running HA pairs, [deduplication](https://docs.victoriametrics.com/#deduplication) must be configured
at VictoriaMetrics side in order to de-duplicate received samples.
See [these docs](https://docs.victoriametrics.com/#deduplication) for details.
It is also recommended passing different values to `-promscrape.cluster.name` command-line flag per each `vmagent` instance or per each `vmagent` cluster in HA setup.
This is needed for proper data de-duplication. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679) for details.
It is also recommended passing different values to `-promscrape.cluster.name` command-line flag per each `vmagent`
instance or per each `vmagent` cluster in HA setup. This is needed for proper data de-duplication.
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679) for details.
## Scraping targets via a proxy
@@ -793,7 +806,7 @@ scrape_configs:
## Cardinality limiter
By default `vmagent` doesn't limit the number of time series each scrape target can expose.
By default, `vmagent` doesn't limit the number of time series each scrape target can expose.
The limit can be enforced in the following places:
* Via `-promscrape.seriesLimitPerTarget` command-line option. This limit is applied individually
@@ -821,7 +834,7 @@ These metrics allow building the following alerting rules:
See also `sample_limit` option at [scrape_config section](https://docs.victoriametrics.com/sd_configs.html#scrape_configs).
By default `vmagent` doesn't limit the number of time series written to remote storage systems specified at `-remoteWrite.url`.
By default, `vmagent` doesn't limit the number of time series written to remote storage systems specified at `-remoteWrite.url`.
The limit can be enforced by setting the following command-line flags:
* `-remoteWrite.maxHourlySeries` - limits the number of unique time series `vmagent` can write to remote storage systems during the last hour.
@@ -864,7 +877,7 @@ If you have suggestions for improvements or have found a bug - please open an is
* `http://vmagent-host:8429/api/v1/targets`. This handler returns JSON response
compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
* `http://vmagent-host:8429/ready`. This handler returns http 200 status code when `vmagent` finishes
it's initialization for all the [service_discovery configs](https://docs.victoriametrics.com/sd_configs.html).
its initialization for all the [service_discovery configs](https://docs.victoriametrics.com/sd_configs.html).
It may be useful to perform `vmagent` rolling update without any scrape loss.
## Troubleshooting
@@ -892,9 +905,9 @@ If you have suggestions for improvements or have found a bug - please open an is
* The `/service-discovery` page could be useful for debugging relabeling process for scrape targets.
This page contains original labels for targets dropped during relabeling.
By default the `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling,
By default, the `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling,
then increase `-promscrape.maxDroppedTargets` command-line flag value to see all the dropped targets.
Note that tracking each dropped target requires up to 10Kb of RAM. Therefore big values for `-promscrape.maxDroppedTargets`
Note that tracking each dropped target requires up to 10Kb of RAM. Therefore, big values for `-promscrape.maxDroppedTargets`
may result in increased memory usage if a big number of scrape targets are dropped during relabeling.
* We recommend you increase `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` metric exported
@@ -904,7 +917,7 @@ If you have suggestions for improvements or have found a bug - please open an is
* If you see gaps in the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set,
try increasing `-remoteWrite.queues`. Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage.
Therefore it starts dropping the buffered data if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`.
Therefore, it starts dropping the buffered data if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`.
* `vmagent` drops data blocks if remote storage replies with `400 Bad Request` and `409 Conflict` HTTP responses.
The number of dropped blocks can be monitored via `vmagent_remotewrite_packets_dropped_total` metric exported at [/metrics page](#monitoring).
@@ -970,7 +983,7 @@ See also [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting
* [Writing metrics to Kafka](#writing-metrics-to-kafka)
The enterprise version of vmagent is available for evaluation at [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page
in `vmutils-...-enteprise.tar.gz` archives and in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
in `vmutils-...-enterprise.tar.gz` archives and in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
### Reading metrics from Kafka
@@ -1018,7 +1031,7 @@ data_format = "influx"
These command-line flags are available only in [enterprise](https://docs.victoriametrics.com/enterprise.html) version of `vmagent`,
which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page
(see `vmutils-...-enteprise.tar.gz` archives) and from [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
(see `vmutils-...-enterprise.tar.gz` archives) and from [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
```
-kafka.consumer.topic array
@@ -1154,7 +1167,7 @@ It is safe sharing the collected profiles from security point of view, since the
## Advanced usage
`vmagent` can be fine-tuned with various command-line flags. Run `./vmagent -help` in order to see the full list of these flags with their desciptions and default values:
`vmagent` can be fine-tuned with various command-line flags. Run `./vmagent -help` in order to see the full list of these flags with their descriptions and default values:
```
./vmagent -help
@@ -1177,9 +1190,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-denyQueryTracing
Whether to disable the ability to trace queries. See https://docs.victoriametrics.com/#query-tracing
-dryRun
Whether to check only config files without running vmagent. The following files are checked: -promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . Unknown config entries aren't allowed in -promscrape.config by default. This can be changed by passing -promscrape.config.strictParse=false command-line flag
Whether to check config files without running vmagent. The following files are checked: -promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig, -remoteWrite.streamAggr.config . Unknown config entries aren't allowed in -promscrape.config by default. This can be changed by passing -promscrape.config.strictParse=false command-line flag
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
-envflag.prefix string
@@ -1189,7 +1202,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-graphiteListenAddr string
TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty. See also -graphiteListenAddr.useProxyProtocol
-graphiteListenAddr.useProxyProtocol
@@ -1199,7 +1212,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -1236,13 +1249,13 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-influxSkipMeasurement
Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'
-influxSkipSingleField
Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if InfluxDB line contains only a single field
Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if InfluxDB line contains only a single field
-influxTrimTimestamp duration
Trim timestamps for InfluxDB line protocol data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
-insert.maxQueueDuration duration
The maximum duration to wait in the queue when -maxConcurrentInserts concurrent insert requests are executed (default 1m0s)
-internStringCacheExpireDuration duration
The expire duration for caches for interned strings. See https://en.wikipedia.org/wiki/String_interning . See also -internStringMaxLen and -internStringDisableCache (default 6m0s)
The expiry duration for caches for interned strings. See https://en.wikipedia.org/wiki/String_interning . See also -internStringMaxLen and -internStringDisableCache (default 6m0s)
-internStringDisableCache
Whether to disable caches for interned strings. This may reduce memory usage at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringCacheExpireDuration and -internStringMaxLen
-internStringMaxLen int
@@ -1323,9 +1336,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-promscrape.azureSDCheckInterval duration
Interval for checking for changes in Azure. This works only if azure_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#azure_sd_configs for details (default 1m0s)
-promscrape.cluster.memberNum string
The number of number in the cluster of scrapers. It must be an unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
The number of number in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
-promscrape.cluster.membersCount int
The number of members in a cluster of scrapers. Each member must have an unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets
-promscrape.cluster.name string
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
-promscrape.cluster.replicationFactor int
@@ -1337,17 +1350,19 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-promscrape.config.strictParse
Whether to deny unsupported fields in -promscrape.config . Set to false in order to silently skip unsupported fields (default true)
-promscrape.configCheckInterval duration
Interval for checking for changes in '-promscrape.config' file. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
Interval for checking for changes in '-promscrape.config' file. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes
-promscrape.consul.waitTime duration
Wait time used by Consul service discovery. Default value is used if not set
-promscrape.consulSDCheckInterval duration
Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#consul_sd_configs for details (default 30s)
-promscrape.consulagentSDCheckInterval duration
Interval for checking for changes in Consul Agent. This works only if consulagent_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#consulagent_sd_configs for details (default 30s)
-promscrape.digitaloceanSDCheckInterval duration
Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#digitalocean_sd_configs for details (default 1m0s)
-promscrape.disableCompression
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine-grained control
-promscrape.disableKeepAlive
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine-grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
-promscrape.discovery.concurrency int
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
-promscrape.discovery.concurrentWaitTime duration
@@ -1398,7 +1413,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-promscrape.seriesLimitPerTarget int
Optional limit on the number of unique time series a single scrape target can expose. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter for more info
-promscrape.streamParse
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is possible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is possible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine-grained control
-promscrape.suppressDuplicateScrapeTargetErrors
Whether to suppress 'duplicate scrape target' errors; see https://docs.victoriametrics.com/vmagent.html#troubleshooting for details
-promscrape.suppressScrapeErrors
@@ -1413,7 +1428,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-pushmetrics.interval duration
Interval for pushing metrics to -pushmetrics.url (default 10s)
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default metrics exposed at /metrics page aren't pushed to any remote storage
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.aws.accessKey array
Optional AWS AccessKey to use for the corresponding -remoteWrite.url if -remoteWrite.aws.useSigv4 is set
@@ -1474,7 +1489,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-remoteWrite.maxDailySeries int
The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
-remoteWrite.maxDiskUsagePerURL array
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500MB. Disk usage is unlimited if the value is set to 0
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. Disk usage is unlimited if the value is set to 0
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB.
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.maxHourlySeries int
@@ -1505,12 +1520,14 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
-remoteWrite.queues int
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs (default 8)
-remoteWrite.rateLimit array
Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage
Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.relabelConfig string
Optional path to file with relabeling configs, which are applied to all the metrics before sending them to -remoteWrite.url. See also -remoteWrite.urlRelabelConfig. The path can point either to local file or to http url. See https://docs.victoriametrics.com/vmagent.html#relabeling
-remoteWrite.keepDanglingQueues
Keep persistent queues contents at -remoteWrite.tmpDataPath in case there are no matching -remoteWrite.url. Useful when -remoteWrite.url is changed temporarily and persistent queue files will be needed later on.
-remoteWrite.roundDigits array
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.sendTimeout array
Timeout for sending a single block of data to the corresponding -remoteWrite.url
@@ -1527,10 +1544,10 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
Input samples are de-duplicated with this interval before being aggregated. Only the last sample per each time series per each interval is aggregated if the interval is greater than zero
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.streamAggr.keepInput array
Whether to keep input samples after the aggregation with -remoteWrite.streamAggr.config. By default the input is dropped after the aggregation, so only the aggregate data is sent to the -remoteWrite.url. See https://docs.victoriametrics.com/stream-aggregation.html
Whether to keep input samples after the aggregation with -remoteWrite.streamAggr.config. By default, the input is dropped after the aggregation, so only the aggregate data is sent to the -remoteWrite.url. See https://docs.victoriametrics.com/stream-aggregation.html
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.tlsCAFile array
Optional path to TLS CA file to use for verifying connections to the corresponding -remoteWrite.url. By default system CA is used
Optional path to TLS CA file to use for verifying connections to the corresponding -remoteWrite.url. By default, system CA is used
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.tlsCertFile array
Optional path to client-side TLS certificate file to use when connecting to the corresponding -remoteWrite.url
@@ -1542,7 +1559,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
Optional path to client-side TLS certificate key to use when connecting to the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.tlsServerName array
Optional TLS server name to use for connections to the corresponding -remoteWrite.url. By default the server name from -remoteWrite.url is used
Optional TLS server name to use for connections to the corresponding -remoteWrite.url. By default, the server name from -remoteWrite.url is used
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.tmpDataPath string
Path to directory where temporary data for remote write component is stored. See also -remoteWrite.maxDiskUsagePerURL (default "vmagent-remotewrite-data")

View File

@@ -22,7 +22,7 @@ import (
var (
measurementFieldSeparator = flag.String("influxMeasurementFieldSeparator", "_", "Separator for '{measurement}{separator}{field_name}' metric name when inserted via InfluxDB line protocol")
skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if InfluxDB line contains only a single field")
skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if InfluxDB line contains only a single field")
skipMeasurement = flag.Bool("influxSkipMeasurement", false, "Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'")
dbLabel = flag.String("influxDBLabel", "db", "Default label for the DB name sent over '?db={db_name}' query parameter")
)

View File

@@ -67,8 +67,8 @@ var (
opentsdbHTTPUseProxyProtocol = flag.Bool("opentsdbHTTPListenAddr.useProxyProtocol", false, "Whether to use proxy protocol for connections accepted "+
"at -opentsdbHTTPListenAddr . See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt")
configAuthKey = flag.String("configAuthKey", "", "Authorization key for accessing /config page. It must be passed via authKey query arg")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmagent. The following files are checked: "+
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . "+
dryRun = flag.Bool("dryRun", false, "Whether to check config files without running vmagent. The following files are checked: "+
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig, -remoteWrite.streamAggr.config . "+
"Unknown config entries aren't allowed in -promscrape.config by default. This can be changed by passing -promscrape.config.strictParse=false command-line flag")
)
@@ -103,11 +103,14 @@ func main() {
return
}
if *dryRun {
if err := promscrape.CheckConfig(); err != nil {
logger.Fatalf("error when checking -promscrape.config: %s", err)
}
if err := remotewrite.CheckRelabelConfigs(); err != nil {
logger.Fatalf("error when checking relabel configs: %s", err)
}
if err := promscrape.CheckConfig(); err != nil {
logger.Fatalf("error when checking -promscrape.config: %s", err)
if err := remotewrite.CheckStreamAggrConfigs(); err != nil {
logger.Fatalf("error when checking -remoteWrite.streamAggr.config: %s", err)
}
logger.Infof("all the configs are ok; exiting with 0 status code")
return

View File

@@ -27,7 +27,7 @@ var (
"to the corresponding -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#victoriametrics-remote-write-protocol")
rateLimit = flagutil.NewArrayInt("remoteWrite.rateLimit", "Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. "+
"By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data "+
"By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data "+
"is sent after temporary unavailability of the remote storage")
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", "Timeout for sending a single block of data to the corresponding -remoteWrite.url")
proxyURL = flagutil.NewArrayString("remoteWrite.proxyURL", "Optional proxy URL for writing data to the corresponding -remoteWrite.url. "+
@@ -38,9 +38,9 @@ var (
"to the corresponding -remoteWrite.url")
tlsKeyFile = flagutil.NewArrayString("remoteWrite.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to the corresponding -remoteWrite.url")
tlsCAFile = flagutil.NewArrayString("remoteWrite.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to the corresponding -remoteWrite.url. "+
"By default system CA is used")
"By default, system CA is used")
tlsServerName = flagutil.NewArrayString("remoteWrite.tlsServerName", "Optional TLS server name to use for connections to the corresponding -remoteWrite.url. "+
"By default the server name from -remoteWrite.url is used")
"By default, the server name from -remoteWrite.url is used")
headers = flagutil.NewArrayString("remoteWrite.headers", "Optional HTTP headers to send with each request to the corresponding -remoteWrite.url. "+
"For example, -remoteWrite.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -remoteWrite.url. "+

View File

@@ -4,17 +4,21 @@ import (
"flag"
"fmt"
"net/url"
"path/filepath"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/cespare/xxhash/v2"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
@@ -24,7 +28,6 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/streamaggr"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/metrics"
"github.com/cespare/xxhash/v2"
)
var (
@@ -36,20 +39,22 @@ var (
"Pass multiple -remoteWrite.multitenantURL flags in order to replicate data to multiple remote storage systems. See also -remoteWrite.url")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored. "+
"See also -remoteWrite.maxDiskUsagePerURL")
keepDanglingQueues = flag.Bool("remoteWrite.keepDanglingQueues", false, "Keep persistent queues contents at -remoteWrite.tmpDataPath in case there are no matching -remoteWrite.url. "+
"Useful when -remoteWrite.url is changed temporarily and persistent queue files will be needed later on.")
queues = flag.Int("remoteWrite.queues", cgroup.AvailableCPUs()*2, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+
"isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs")
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
"It is hidden by default, since it can contain sensitive info such as auth key")
maxPendingBytesPerURL = flagutil.NewArrayBytes("remoteWrite.maxDiskUsagePerURL", "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
"for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. "+
"Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500MB. "+
"Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. "+
"Disk usage is unlimited if the value is set to 0")
significantFigures = flagutil.NewArrayInt("remoteWrite.significantFigures", "The number of significant figures to leave in metric values before writing them "+
"to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. "+
"This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits")
roundDigits = flagutil.NewArrayInt("remoteWrite.roundDigits", "Round metric values to this number of decimal digits after the point before writing them to remote storage. "+
"Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. "+
"By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. "+
"By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. "+
"This option may be used for improving data compression for the stored metrics")
sortLabels = flag.Bool("sortLabels", false, `Whether to sort labels for incoming samples before writing them to all the configured remote storage systems. `+
`This may be needed for reducing memory usage at remote storage when the order of labels in incoming samples is random. `+
@@ -64,7 +69,7 @@ var (
"See https://docs.victoriametrics.com/stream-aggregation.html . "+
"See also -remoteWrite.streamAggr.keepInput and -remoteWrite.streamAggr.dedupInterval")
streamAggrKeepInput = flagutil.NewArrayBool("remoteWrite.streamAggr.keepInput", "Whether to keep input samples after the aggregation with -remoteWrite.streamAggr.config. "+
"By default the input is dropped after the aggregation, so only the aggregate data is sent to the -remoteWrite.url. "+
"By default, the input is dropped after the aggregation, so only the aggregate data is sent to the -remoteWrite.url. "+
"See https://docs.victoriametrics.com/stream-aggregation.html")
streamAggrDedupInterval = flagutil.NewArrayDuration("remoteWrite.streamAggr.dedupInterval", "Input samples are de-duplicated with this interval before being aggregated. "+
"Only the last sample per each time series per each interval is aggregated if the interval is greater than zero")
@@ -94,6 +99,8 @@ var allRelabelConfigs atomic.Value
// since it may lead to high memory usage due to big number of buffers.
var maxQueues = cgroup.AvailableCPUs() * 16
const persistentQueueDirname = "persistent-queue"
// InitSecretFlags must be called after flag.Parse and before any logging.
func InitSecretFlags() {
if !*showRemoteWriteURL {
@@ -150,9 +157,8 @@ func Init() {
logger.Fatalf("cannot load relabel configs: %s", err)
}
allRelabelConfigs.Store(rcs)
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
relabelConfigSuccess.Set(1)
relabelConfigTimestamp.Set(fasttime.UnixTimestamp())
if len(*remoteWriteURLs) > 0 {
rwctxsDefault = newRemoteWriteCtxs(nil, *remoteWriteURLs)
@@ -165,34 +171,56 @@ func Init() {
for {
select {
case <-sighupCh:
case <-stopCh:
case <-configReloaderStopCh:
return
}
configReloads.Inc()
logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
rcs, err := loadRelabelConfigs()
if err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err)
continue
}
allRelabelConfigs.Store(rcs)
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
logger.Infof("Successfully reloaded relabel configs")
reloadRelabelConfigs()
reloadStreamAggrConfigs()
}
}()
}
func reloadRelabelConfigs() {
relabelConfigReloads.Inc()
logger.Infof("reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
rcs, err := loadRelabelConfigs()
if err != nil {
relabelConfigReloadErrors.Inc()
relabelConfigSuccess.Set(0)
logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err)
return
}
allRelabelConfigs.Store(rcs)
relabelConfigSuccess.Set(1)
relabelConfigTimestamp.Set(fasttime.UnixTimestamp())
logger.Infof("successfully reloaded relabel configs")
}
var (
configReloads = metrics.NewCounter(`vmagent_relabel_config_reloads_total`)
configReloadErrors = metrics.NewCounter(`vmagent_relabel_config_reloads_errors_total`)
configSuccess = metrics.NewCounter(`vmagent_relabel_config_last_reload_successful`)
configTimestamp = metrics.NewCounter(`vmagent_relabel_config_last_reload_success_timestamp_seconds`)
relabelConfigReloads = metrics.NewCounter(`vmagent_relabel_config_reloads_total`)
relabelConfigReloadErrors = metrics.NewCounter(`vmagent_relabel_config_reloads_errors_total`)
relabelConfigSuccess = metrics.NewCounter(`vmagent_relabel_config_last_reload_successful`)
relabelConfigTimestamp = metrics.NewCounter(`vmagent_relabel_config_last_reload_success_timestamp_seconds`)
)
func reloadStreamAggrConfigs() {
if len(*remoteWriteMultitenantURLs) > 0 {
rwctxsMapLock.Lock()
for _, rwctxs := range rwctxsMap {
reinitStreamAggr(rwctxs)
}
rwctxsMapLock.Unlock()
} else {
reinitStreamAggr(rwctxsDefault)
}
}
func reinitStreamAggr(rwctxs []*remoteWriteCtx) {
for _, rwctx := range rwctxs {
rwctx.reinitStreamAggr()
}
}
func newRemoteWriteCtxs(at *auth.Token, urls []string) []*remoteWriteCtx {
if len(urls) == 0 {
logger.Panicf("BUG: urls must be non-empty")
@@ -225,17 +253,44 @@ func newRemoteWriteCtxs(at *auth.Token, urls []string) []*remoteWriteCtx {
}
rwctxs[i] = newRemoteWriteCtx(i, at, remoteWriteURL, maxInmemoryBlocks, sanitizedURL)
}
if !*keepDanglingQueues {
// Remove dangling queues, if any.
// This is required for the case when the number of queues has been changed or URL have been changed.
// See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4014
existingQueues := make(map[string]struct{}, len(rwctxs))
for _, rwctx := range rwctxs {
existingQueues[rwctx.fq.Dirname()] = struct{}{}
}
queuesDir := filepath.Join(*tmpDataPath, persistentQueueDirname)
files := fs.MustReadDir(queuesDir)
removed := 0
for _, f := range files {
dirname := f.Name()
if _, ok := existingQueues[dirname]; !ok {
logger.Infof("removing dangling queue %q", dirname)
fullPath := filepath.Join(queuesDir, dirname)
fs.MustRemoveAll(fullPath)
removed++
}
}
if removed > 0 {
logger.Infof("removed %d dangling queues from %q, active queues: %d", removed, *tmpDataPath, len(rwctxs))
}
}
return rwctxs
}
var stopCh = make(chan struct{})
var configReloaderStopCh = make(chan struct{})
var configReloaderWG sync.WaitGroup
// Stop stops remotewrite.
//
// It is expected that nobody calls Push during and after the call to this func.
func Stop() {
close(stopCh)
close(configReloaderStopCh)
configReloaderWG.Wait()
for _, rwctx := range rwctxsDefault {
@@ -450,7 +505,7 @@ type remoteWriteCtx struct {
fq *persistentqueue.FastQueue
c *client
sas *streamaggr.Aggregators
sas atomic.Pointer[streamaggr.Aggregators]
streamAggrKeepInput bool
pss []*pendingSeries
@@ -466,8 +521,13 @@ func newRemoteWriteCtx(argIdx int, at *auth.Token, remoteWriteURL *url.URL, maxI
pqURL.RawQuery = ""
pqURL.Fragment = ""
h := xxhash.Sum64([]byte(pqURL.String()))
queuePath := fmt.Sprintf("%s/persistent-queue/%d_%016X", *tmpDataPath, argIdx+1, h)
queuePath := filepath.Join(*tmpDataPath, persistentQueueDirname, fmt.Sprintf("%d_%016X", argIdx+1, h))
maxPendingBytes := maxPendingBytesPerURL.GetOptionalArgOrDefault(argIdx, 0)
if maxPendingBytes != 0 && maxPendingBytes < persistentqueue.DefaultChunkFileSize {
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4195
logger.Warnf("rounding the -remoteWrite.maxDiskUsagePerURL=%d to the minimum supported value: %d", maxPendingBytes, persistentqueue.DefaultChunkFileSize)
maxPendingBytes = persistentqueue.DefaultChunkFileSize
}
fq := persistentqueue.MustOpenFastQueue(queuePath, sanitizedURL, maxInmemoryBlocks, maxPendingBytes)
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_data_bytes{path=%q, url=%q}`, queuePath, sanitizedURL), func() float64 {
return float64(fq.GetPendingBytes())
@@ -515,16 +575,23 @@ func newRemoteWriteCtx(argIdx int, at *auth.Token, remoteWriteURL *url.URL, maxI
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(argIdx, 0)
sas, err := streamaggr.LoadFromFile(sasFile, rwctx.pushInternal, dedupInterval)
if err != nil {
logger.Fatalf("cannot initialize stream aggregators from -remoteWrite.streamAggrFile=%q: %s", sasFile, err)
logger.Fatalf("cannot initialize stream aggregators from -remoteWrite.streamAggr.config=%q: %s", sasFile, err)
}
rwctx.sas = sas
rwctx.sas.Store(sas)
rwctx.streamAggrKeepInput = streamAggrKeepInput.GetOptionalArg(argIdx)
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, sasFile)).Set(1)
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_success_timestamp_seconds{path=%q}`, sasFile)).Set(fasttime.UnixTimestamp())
}
return rwctx
}
func (rwctx *remoteWriteCtx) MustStop() {
// sas must be stopped before rwctx is closed
// because sas can write pending series to rwctx.pss if there are any
sas := rwctx.sas.Swap(nil)
sas.MustStop()
for _, ps := range rwctx.pss {
ps.MustStop()
}
@@ -533,8 +600,7 @@ func (rwctx *remoteWriteCtx) MustStop() {
rwctx.fq.UnblockAllReaders()
rwctx.c.MustStop()
rwctx.c = nil
rwctx.sas.MustStop()
rwctx.sas = nil
rwctx.fq.MustClose()
rwctx.fq = nil
@@ -565,8 +631,9 @@ func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
rwctx.rowsPushedAfterRelabel.Add(rowsCount)
// Apply stream aggregation if any
rwctx.sas.Push(tss)
if rwctx.sas == nil || rwctx.streamAggrKeepInput {
sas := rwctx.sas.Load()
sas.Push(tss)
if sas == nil || rwctx.streamAggrKeepInput {
// Push samples to the remote storage
rwctx.pushInternal(tss)
}
@@ -585,6 +652,36 @@ func (rwctx *remoteWriteCtx) pushInternal(tss []prompbmarshal.TimeSeries) {
pss[idx].Push(tss)
}
func (rwctx *remoteWriteCtx) reinitStreamAggr() {
sas := rwctx.sas.Load()
if sas == nil {
// There is no stream aggregation for rwctx
return
}
sasFile := streamAggrConfig.GetOptionalArg(rwctx.idx)
logger.Infof("reloading stream aggregation configs pointed by -remoteWrite.streamAggr.config=%q", sasFile)
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_total{path=%q}`, sasFile)).Inc()
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(rwctx.idx, 0)
sasNew, err := streamaggr.LoadFromFile(sasFile, rwctx.pushInternal, dedupInterval)
if err != nil {
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_errors_total{path=%q}`, sasFile)).Inc()
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, sasFile)).Set(0)
logger.Errorf("cannot reload stream aggregation config from -remoteWrite.streamAggr.config=%q; continue using the previously loaded config; error: %s", sasFile, err)
return
}
if !sasNew.Equal(sas) {
sasOld := rwctx.sas.Swap(sasNew)
sasOld.MustStop()
logger.Infof("successfully reloaded stream aggregation configs at -remoteWrite.streamAggr.config=%q", sasFile)
} else {
sasNew.MustStop()
logger.Infof("the config at -remoteWrite.streamAggr.config=%q wasn't changed", sasFile)
}
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, sasFile)).Set(1)
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_success_timestamp_seconds{path=%q}`, sasFile)).Set(fasttime.UnixTimestamp())
}
var tssRelabelPool = &sync.Pool{
New: func() interface{} {
a := []prompbmarshal.TimeSeries{}
@@ -599,3 +696,20 @@ func getRowsCount(tss []prompbmarshal.TimeSeries) int {
}
return rowsCount
}
// CheckStreamAggrConfigs checks configs pointed by -remoteWrite.streamAggr.config
func CheckStreamAggrConfigs() error {
pushNoop := func(tss []prompbmarshal.TimeSeries) {}
for idx, sasFile := range *streamAggrConfig {
if sasFile == "" {
continue
}
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(idx, 0)
sas, err := streamaggr.LoadFromFile(sasFile, pushNoop, dedupInterval)
if err != nil {
return fmt.Errorf("cannot load -remoteWrite.streamAggr.config=%q: %w", sasFile, err)
}
sas.MustStop()
}
return nil
}

View File

@@ -73,6 +73,7 @@ test-vmalert:
go test -v -race -cover ./app/vmalert/notifier
go test -v -race -cover ./app/vmalert/config
go test -v -race -cover ./app/vmalert/remotewrite
go test -v -race -cover ./app/vmalert/utils
run-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/config/testdata/rules/rules2-good.rules \

View File

@@ -14,7 +14,7 @@ or [cluster version](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.ht
of VictoriaMetrics are capable of proxying requests to vmalert via `-vmalert.proxyURL` command-line flag.
Use this feature for the following cases:
* for proxying requests from [Grafana Alerting UI](https://grafana.com/docs/grafana/latest/alerting/);
* for accessing vmalert's UI through VictoriaMetrics Web interface.
* for accessing vmalerts UI through VictoriaMetrics Web interface.
## Features
@@ -23,13 +23,14 @@ Use this feature for the following cases:
support and expressions validation;
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
support;
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager) starting from [Alertmanager v0.16.0-aplha](https://github.com/prometheus/alertmanager/releases/tag/v0.16.0-alpha.0);
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager) starting from [Alertmanager v0.16.0-alpha](https://github.com/prometheus/alertmanager/releases/tag/v0.16.0-alpha.0);
* Keeps the alerts [state on restarts](#alerts-state-on-restarts);
* Graphite datasource can be used for alerting and recording rules. See [these docs](#graphite);
* Recording and Alerting rules backfilling (aka `replay`). See [these docs](#rules-backfilling);
* Lightweight and without extra dependencies.
* Supports [reusable templates](#reusable-templates) for annotations;
* Load of recording and alerting rules from local filesystem, GCS and S3.
* Load of recording and alerting rules from local filesystem, URL, GCS and S3;
* Detect alerting rules which [don't match any series](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039).
## Limitations
@@ -144,6 +145,15 @@ params:
headers:
[ <string>, ...]
# Optional list of HTTP headers in form `header-name: value`
# applied for all alert notifications sent to notifiers
# generated by rules of this group.
# For example:
# notifier_headers:
# - "TenantID: foo"
notifier_headers:
[ <string>, ...]
# Optional list of labels added to every rule within a group.
# It has priority over the external labels.
# Labels are commonly used for adding environment
@@ -261,7 +271,7 @@ Additionally, `vmalert` provides some extra templating functions listed [here](#
query at `-datasource.url` and returns the first result.
- `queryEscape` - escapes the input string, so it can be safely put inside [query arg](https://en.wikipedia.org/wiki/Percent-encoding) part of URL.
- `quotesEscape` - escapes the input string, so it can be safely embedded into JSON string.
- `reReplaceAll regex repl` - replaces all the occurences of the `regex` in input string with the `repl`.
- `reReplaceAll regex repl` - replaces all the occurrences of the `regex` in input string with the `repl`.
- `safeHtml` - marks the input string as safe to use in HTML context without the need to html-escape it.
- `sortByLabel name` - sorts the input query results by the label with the given `name`.
- `stripDomain` - leaves the first part of the domain. For example, `foo.bar.baz` is converted to `foo`.
@@ -473,7 +483,7 @@ Cluster mode could have multiple `vminsert` and `vmselect` components.
<img alt="vmalert cluster" src="vmalert_cluster.png">
In case when you want to spread the load on these components - add balancers before them and configure
`vmalert` with balancer's addresses. Please, see more about VM's cluster architecture
`vmalert` with balancer addresses. Please, see more about VM's cluster architecture
[here](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#architecture-overview).
#### HA vmalert
@@ -498,7 +508,7 @@ Alertmanagers.
To avoid recording rules results and alerts state duplication in VictoriaMetrics server
don't forget to configure [deduplication](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#deduplication).
The recommended value for `-dedup.minScrapeInterval` must be greater or equal to vmalert's `evaluation_interval`.
The recommended value for `-dedup.minScrapeInterval` must be greater or equal to vmalert `evaluation_interval`.
If you observe inconsistent or "jumping" values in series produced by vmalert, try disabling `-datasource.queryTimeAlignment`
command line flag. Because of alignment, two or more vmalert HA pairs will produce results with the same timestamps.
But due of backfilling (data delivered to the datasource with some delay) values of such results may differ,
@@ -508,7 +518,7 @@ Alertmanager will automatically deduplicate alerts with identical labels, so ens
all `vmalert`s are having the same config.
Don't forget to configure [cluster mode](https://prometheus.io/docs/alerting/latest/alertmanager/)
for Alertmanagers for better reliability. List all Alertmanager URLs in vmalert's `-notifier.url`
for Alertmanagers for better reliability. List all Alertmanager URLs in vmalert `-notifier.url`
to ensure [high availability](https://github.com/prometheus/alertmanager#high-availability).
This example uses single-node VM server for the sake of simplicity.
@@ -600,7 +610,7 @@ or time series modification via [relabeling](https://docs.victoriametrics.com/vm
`vmalert` web UI can be accessed from [single-node version of VictoriaMetrics](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html)
and from [cluster version of VictoriaMetrics](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html).
This may be used for better integraion with Grafana unified alerting system. See the following docs for details:
This may be used for better integration with Grafana unified alerting system. See the following docs for details:
* [How to query vmalert from single-node VictoriaMetrics](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#vmalert)
* [How to query vmalert from VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#vmalert)
@@ -752,11 +762,11 @@ Try the following recommendations to reduce the chance of hitting the data delay
[time series resolution](https://docs.victoriametrics.com/keyConcepts.html#time-series-resolution). For example,
if expression is `rate(my_metric[2m]) > 0` then ensure that `my_metric` resolution is at least `1m` or better `30s`.
If you use VictoriaMetrics as datasource, `[duration]` can be omitted and VictoriaMetrics will adjust it automatically.
* If you know in advance, that data in datasource is delayed - try changing vmalert's `-datasource.lookback`
* If you know in advance, that data in datasource is delayed - try changing vmalerts `-datasource.lookback`
command-line flag to add a time shift for evaluations. Or extend `[duration]` to tolerate the delay.
For example, `max_over_time(errors_total[10m]) > 0` will be active even if there is no data in datasource for last `9m`.
* If [time series resolution](https://docs.victoriametrics.com/keyConcepts.html#time-series-resolution)
in datasource is inconsistent or `>=5min` - try changing vmalert's `-datasource.queryStep` command-line flag to specify
in datasource is inconsistent or `>=5min` - try changing vmalerts `-datasource.queryStep` command-line flag to specify
how far search query can lookback for the recent datapoint. The recommendation is to have the step
at least two times bigger than the resolution.
@@ -803,6 +813,22 @@ and vmalert will start printing additional log messages:
2022-09-15T13:36:56.153Z DEBUG rule "TestGroup":"Conns" (2601299393013563564) at 2022-09-15T15:36:56+02:00: alert 10705778000901301787 {alertgroup="TestGroup",alertname="Conns",cluster="east-1",instance="localhost:8429",replica="a"} PENDING => FIRING: 1m0s since becoming active at 2022-09-15 15:35:56.126006 +0200 CEST m=+39.384575417
```
### Never-firing alerts
vmalert can detect if alert's expression doesn't match any time series in runtime. This problem usually happens
when alerting expression selects time series which aren't present in the datasource (i.e. wrong `job` label)
or there is a typo in the series selector (i.e. `env=rpod`). Such alerting rules will be marked with special icon in
vmalerts UI and exposed via `vmalert_alerting_rules_last_evaluation_series_fetched` metric. The metric value will
show how many time series were matched before the filtering by rule's expression. If metric value is `-1`, then
this feature is not supported by the datasource (old versions of VictoriaMetrics). The following expression can be
used to detect rules matching no series:
```
max(vmalert_alerting_rules_last_evaluation_series_fetched) by(group, alertname) == 0
```
See more details [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039).
This feature is available only if vmalert is using VictoriaMetrics v1.90 or higher as a datasource.
## Profiling
@@ -846,7 +872,7 @@ The shortlist of configuration flags is the following:
-clusterMode
If clusterMode is enabled, then vmalert automatically adds the tenant specified in config groups to -datasource.url, -remoteWrite.url and -remoteRead.url. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
-configCheckInterval duration
Interval for checking for changes in '-rule' or '-notifier.config' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes.
Interval for checking for changes in '-rule' or '-notifier.config' files. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes.
-datasource.appendTypePrefix
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.
-datasource.basicAuth.password string
@@ -880,7 +906,7 @@ The shortlist of configuration flags is the following:
-datasource.queryStep duration
How far a value can fallback to when evaluating queries. For example, if -datasource.queryStep=15s then param "step" with value "15s" will be added to every query. If set to 0, rule's evaluation interval will be used instead. (default 5m0s)
-datasource.queryTimeAlignment
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite of number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
-datasource.roundDigits int
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
-datasource.showURL
@@ -906,7 +932,7 @@ The shortlist of configuration flags is the following:
-dryRun
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
-envflag.prefix string
@@ -916,20 +942,20 @@ The shortlist of configuration flags is the following:
-evaluationInterval duration
How often to evaluate the rules (default 1m0s)
-external.alert.source string
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service. Supports templating - see https://docs.victoriametrics.com/vmalert.html#templating . For example, link to Grafana: -external.alert.source='explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr":{{$expr|jsonEscape|queryEscape}} },{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]' . If empty 'vmalert/alert?group_id={{.GroupID}}&alert_id={{.AlertID}}' is used.
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service. Supports templating - see https://docs.victoriametrics.com/vmalert.html#templating . For example, link to Grafana: -external.alert.source='explore?orgId=1&left={"datasource":"VictoriaMetrics","queries":[{"expr":{{$expr|jsonEscape|queryEscape}},"refId":"A"}],"range":{"from":"now-1h","to":"now"}}'. Link to VMUI: -external.alert.source='vmui/#/?g0.expr={{.Expr|queryEscape}}'. If empty 'vmalert/alert?group_id={{.GroupID}}&alert_id={{.AlertID}}' is used.
-external.label array
Optional label in the form 'Name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
Supports an array of values separated by comma or specified via multiple flags.
-external.url string
External URL is used as alert's source for sent alerts to the notifier
External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -1010,7 +1036,7 @@ The shortlist of configuration flags is the following:
-notifier.suppressDuplicateTargetErrors
Whether to suppress 'duplicate target' errors during discovery
-notifier.tlsCAFile array
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default, system CA is used
Supports an array of values separated by comma or specified via multiple flags.
-notifier.tlsCertFile array
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
@@ -1022,7 +1048,7 @@ The shortlist of configuration flags is the following:
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.tlsServerName array
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
Optional TLS server name to use for connections to -notifier.url. By default, the server name from -notifier.url is used
Supports an array of values separated by comma or specified via multiple flags.
-notifier.url array
Prometheus Alertmanager URL, e.g. http://127.0.0.1:9093. List all Alertmanager URLs if it runs in the cluster mode to ensure high availability.
@@ -1045,7 +1071,7 @@ The shortlist of configuration flags is the following:
-pushmetrics.interval duration
Interval for pushing metrics to -pushmetrics.url (default 10s)
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default metrics exposed at /metrics page aren't pushed to any remote storage
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-remoteRead.basicAuth.password string
Optional basic auth password for -remoteRead.url
@@ -1078,7 +1104,7 @@ The shortlist of configuration flags is the following:
-remoteRead.showURL
Whether to show -remoteRead.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
-remoteRead.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default, system CA is used
-remoteRead.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
-remoteRead.tlsInsecureSkipVerify
@@ -1086,7 +1112,7 @@ The shortlist of configuration flags is the following:
-remoteRead.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
-remoteRead.tlsServerName string
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
Optional TLS server name to use for connections to -remoteRead.url. By default, the server name from -remoteRead.url is used
-remoteRead.url vmalert
Optional URL to datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect.Remote read is used to restore alerts state.This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428. See also '-remoteRead.disablePathAppend', '-remoteRead.showURL'.
-remoteWrite.basicAuth.password string
@@ -1108,7 +1134,7 @@ The shortlist of configuration flags is the following:
-remoteWrite.headers string
Optional HTTP headers to send with each request to the corresponding -remoteWrite.url. For example, -remoteWrite.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -remoteWrite.url. Multiple headers must be delimited by '^^': -remoteWrite.headers='header1:value1^^header2:value2'
-remoteWrite.maxBatchSize int
Defines defines max number of timeseries to be flushed at once (default 1000)
Defines max number of timeseries to be flushed at once (default 1000)
-remoteWrite.maxQueueSize int
Defines the max number of pending datapoints to remote write endpoint (default 100000)
-remoteWrite.oauth2.clientID string
@@ -1121,12 +1147,16 @@ The shortlist of configuration flags is the following:
Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'.
-remoteWrite.oauth2.tokenUrl string
Optional OAuth2 tokenURL to use for -notifier.url.
-remoteWrite.retryMaxTime duration
The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval (default 30s)
-remoteWrite.retryMinInterval duration
The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval (default 1s)
-remoteWrite.sendTimeout duration
Timeout for sending data to the configured -remoteWrite.url. (default 30s)
-remoteWrite.showURL
Whether to show -remoteWrite.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
-remoteWrite.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default, system CA is used
-remoteWrite.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
-remoteWrite.tlsInsecureSkipVerify
@@ -1134,7 +1164,7 @@ The shortlist of configuration flags is the following:
-remoteWrite.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
-remoteWrite.tlsServerName string
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
Optional TLS server name to use for connections to -remoteWrite.url. By default, the server name from -remoteWrite.url is used
-remoteWrite.url string
Optional URL to VictoriaMetrics or vminsert where to persist alerts state and recording rules results in form of timeseries. For example, if -remoteWrite.url=http://127.0.0.1:8428 is specified, then the alerts state will be written to http://127.0.0.1:8428/api/v1/write . See also -remoteWrite.disablePathAppend, '-remoteWrite.showURL'.
-replay.disableProgressBar
@@ -1144,7 +1174,7 @@ The shortlist of configuration flags is the following:
-replay.ruleRetryAttempts int
Defines how many retries to make before giving up on rule if request for it returns an error. (default 5)
-replay.rulesDelay duration
Delay between rules evaluation within the group. Could be important if there are chained rules inside of the groupand processing need to wait for previous rule results to be persisted by remote storage before evaluating the next rule.Keep it equal or bigger than -remoteWrite.flushInterval. (default 1s)
Delay between rules evaluation within the group. Could be important if there are chained rules inside the group and processing need to wait for previous rule results to be persisted by remote storage before evaluating the next rule.Keep it equal or bigger than -remoteWrite.flushInterval. (default 1s)
-replay.timeFrom string
The time filter in RFC3339 format to select time series with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'
-replay.timeTo string
@@ -1153,8 +1183,10 @@ The shortlist of configuration flags is the following:
Path to the files with alerting and/or recording rules.
Supports hierarchical patterns and regexpes.
Examples:
-rule="/path/to/file". Path to a single file with alerting rules
-rule="dir/*.yaml" -rule="/*.yaml" -rule="gcs://vmalert-rules/tenant_%{TENANT_ID}/prod".
-rule="/path/to/file". Path to a single file with alerting rules.
-rule="http://<some-server-addr>/path/to/rules". HTTP URL to a page with alerting rules.
-rule="dir/*.yaml" -rule="/*.yaml" -rule="gcs://vmalert-rules/tenant_%{TENANT_ID}/prod".
-rule="dir/**/*.yaml". Includes all the .yaml files in "dir" subfolders recursively.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
Enterprise version of vmalert supports S3 and GCS paths to rules.
@@ -1165,7 +1197,7 @@ The shortlist of configuration flags is the following:
Supports an array of values separated by comma or specified via multiple flags.
-rule.configCheckInterval duration
Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead
Interval for checking for changes in '-rule' files. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead
-rule.maxResolveDuration duration
Limits the maximum duration for automatic alert expiration, which by default is 4 times evaluationInterval of the parent group.
-rule.resendDelay duration
@@ -1177,9 +1209,10 @@ The shortlist of configuration flags is the following:
-rule.templates="/path/to/file". Path to a single file with go templates
-rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder,
absolute path to all .tpl files in root.
-rule.templates="dir/**/*.tpl". Includes all the .tpl files in "dir" subfolders recursively.
Supports an array of values separated by comma or specified via multiple flags.
-rule.updateEntriesLimit int
Defines the max number of rule's state updates stored in-memory. Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overriden per rule via update_entries_limit param. (default 20)
Defines the max number of rule's state updates stored in-memory. Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param. (default 20)
-rule.validateExpressions
Whether to validate rules expressions via MetricsQL engine (default true)
-rule.validateTemplates

View File

@@ -47,10 +47,11 @@ type AlertingRule struct {
}
type alertingRuleMetrics struct {
errors *utils.Gauge
pending *utils.Gauge
active *utils.Gauge
samples *utils.Gauge
errors *utils.Gauge
pending *utils.Gauge
active *utils.Gauge
samples *utils.Gauge
seriesFetched *utils.Gauge
}
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
@@ -121,6 +122,21 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
e := ar.state.getLast()
return float64(e.samples)
})
ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels),
func() float64 {
e := ar.state.getLast()
if e.seriesFetched == nil {
// means seriesFetched is unsupported
return -1
}
seriesFetched := float64(*e.seriesFetched)
if seriesFetched == 0 && e.samples > 0 {
// `alert: 0.95` will fetch no series
// but will get one time series in response.
seriesFetched = float64(e.samples)
}
return seriesFetched
})
return ar
}
@@ -130,6 +146,7 @@ func (ar *AlertingRule) Close() {
ar.metrics.pending.Unregister()
ar.metrics.errors.Unregister()
ar.metrics.samples.Unregister()
ar.metrics.seriesFetched.Unregister()
}
// String implements Stringer interface
@@ -234,7 +251,7 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
// to get time series for backfilling.
// It returns ALERT and ALERT_FOR_STATE time series as result.
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
series, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
if err != nil {
return nil, err
}
@@ -242,7 +259,7 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
qFn := func(query string) ([]datasource.Metric, error) {
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
}
for _, s := range series {
for _, s := range res.Data {
a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
if err != nil {
return nil, fmt.Errorf("failed to create alert: %s", err)
@@ -282,14 +299,15 @@ const resolvedRetention = 15 * time.Minute
// Based on the Querier results AlertingRule maintains notifier.Alerts
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
qMetrics, req, err := ar.q.Query(ctx, ar.Expr, ts)
res, req, err := ar.q.Query(ctx, ar.Expr, ts)
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(qMetrics),
err: err,
curl: requestToCurl(req),
time: start,
at: ts,
duration: time.Since(start),
samples: len(res.Data),
seriesFetched: res.SeriesFetched,
err: err,
curl: requestToCurl(req),
}
defer func() {
@@ -315,11 +333,11 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
qFn := func(query string) ([]datasource.Metric, error) {
res, _, err := ar.q.Query(ctx, query, ts)
return res, err
return res.Data, err
}
updated := make(map[uint64]struct{})
// update list of active alerts
for _, m := range qMetrics {
for _, m := range res.Data {
ls, err := ar.toLabels(m, qFn)
if err != nil {
curState.err = fmt.Errorf("failed to expand labels: %s", err)
@@ -485,22 +503,23 @@ func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
func (ar *AlertingRule) ToAPI() APIRule {
lastState := ar.state.getLast()
r := APIRule{
Type: "alerting",
DatasourceType: ar.Type.String(),
Name: ar.Name,
Query: ar.Expr,
Duration: ar.For.Seconds(),
Labels: ar.Labels,
Annotations: ar.Annotations,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
State: "inactive",
Alerts: ar.AlertsToAPI(),
LastSamples: lastState.samples,
MaxUpdates: ar.state.size(),
Updates: ar.state.getAll(),
Debug: ar.Debug,
Type: "alerting",
DatasourceType: ar.Type.String(),
Name: ar.Name,
Query: ar.Expr,
Duration: ar.For.Seconds(),
Labels: ar.Labels,
Annotations: ar.Annotations,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
State: "inactive",
Alerts: ar.AlertsToAPI(),
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: ar.state.size(),
Updates: ar.state.getAll(),
Debug: ar.Debug,
// encode as strings to avoid rounding in JSON
ID: fmt.Sprintf("%d", ar.ID()),
@@ -637,11 +656,12 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts ti
ar.logDebugf(ts, nil, "restoring alert state via query %q", expr)
qMetrics, _, err := q.Query(ctx, expr, ts)
res, _, err := q.Query(ctx, expr, ts)
if err != nil {
return err
}
qMetrics := res.Data
if len(qMetrics) < 1 {
ar.logDebugf(ts, nil, "no response was received from restore query")
continue

View File

@@ -10,9 +10,9 @@ import (
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config/log"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
@@ -36,7 +36,8 @@ type Group struct {
Params url.Values `yaml:"params"`
// Headers contains optional HTTP headers added to each rule request
Headers []Header `yaml:"headers,omitempty"`
// NotifierHeaders contains optional HTTP headers sent to notifiers for generated notifications
NotifierHeaders []Header `yaml:"notifier_headers,omitempty"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
@@ -199,9 +200,17 @@ func (r *Rule) Validate() error {
// ValidateTplFn must validate the given annotations
type ValidateTplFn func(annotations map[string]string) error
// cLogger is a logger with support of logs suppressing.
// it is used when logs emitted by config package needs
// to be suppressed.
var cLogger = &log.Logger{}
// ParseSilent parses rule configs from given file patterns without emitting logs
func ParseSilent(pathPatterns []string, validateTplFn ValidateTplFn, validateExpressions bool) ([]Group, error) {
files, err := readFromFS(pathPatterns, true)
cLogger.Suppress(true)
defer cLogger.Suppress(false)
files, err := readFromFS(pathPatterns)
if err != nil {
return nil, fmt.Errorf("failed to read from the config: %s", err)
}
@@ -210,7 +219,7 @@ func ParseSilent(pathPatterns []string, validateTplFn ValidateTplFn, validateExp
// Parse parses rule configs from given file patterns
func Parse(pathPatterns []string, validateTplFn ValidateTplFn, validateExpressions bool) ([]Group, error) {
files, err := readFromFS(pathPatterns, false)
files, err := readFromFS(pathPatterns)
if err != nil {
return nil, fmt.Errorf("failed to read from the config: %s", err)
}
@@ -219,7 +228,7 @@ func Parse(pathPatterns []string, validateTplFn ValidateTplFn, validateExpressio
return nil, fmt.Errorf("failed to parse %s: %s", pathPatterns, err)
}
if len(groups) < 1 {
logger.Warnf("no groups found in %s", strings.Join(pathPatterns, ";"))
cLogger.Warnf("no groups found in %s", strings.Join(pathPatterns, ";"))
}
return groups, nil
}

View File

@@ -1,6 +1,8 @@
package config
import (
"net/http"
"net/http/httptest"
"net/url"
"os"
"strings"
@@ -27,6 +29,40 @@ func TestParseGood(t *testing.T) {
}
}
func TestParseFromURL(t *testing.T) {
mux := http.NewServeMux()
mux.HandleFunc("/bad", func(w http.ResponseWriter, _ *http.Request) {
w.Write([]byte("foo bar"))
})
mux.HandleFunc("/good-alert", func(w http.ResponseWriter, _ *http.Request) {
w.Write([]byte(`
groups:
- name: TestGroup
rules:
- alert: Conns
expr: vm_tcplistener_conns > 0`))
})
mux.HandleFunc("/good-rr", func(w http.ResponseWriter, _ *http.Request) {
w.Write([]byte(`
groups:
- name: TestGroup
rules:
- record: conns
expr: max(vm_tcplistener_conns)`))
})
srv := httptest.NewServer(mux)
defer srv.Close()
if _, err := Parse([]string{srv.URL + "/good-alert", srv.URL + "/good-rr"}, notifier.ValidateTemplates, true); err != nil {
t.Errorf("error parsing URLs %s", err)
}
if _, err := Parse([]string{srv.URL + "/bad"}, notifier.ValidateTemplates, true); err == nil {
t.Errorf("expected parsing error: %s", err)
}
}
func TestParseBad(t *testing.T) {
testCases := []struct {
path []string
@@ -64,6 +100,10 @@ func TestParseBad(t *testing.T) {
[]string{"testdata/dir/rules6-bad.rules"},
"missing ':' in header",
},
{
[]string{"http://unreachable-url"},
"failed to read",
},
}
for _, tc := range testCases {
_, err := Parse(tc.path, notifier.ValidateTemplates, true)
@@ -102,7 +142,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "group name must be set",
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{
Record: "record",
@@ -113,7 +154,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "",
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{
Record: "record",
@@ -125,7 +167,8 @@ func TestGroup_Validate(t *testing.T) {
validateExpressions: true,
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{
Alert: "alert",
@@ -139,7 +182,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "",
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{
Alert: "alert",
@@ -156,7 +200,8 @@ func TestGroup_Validate(t *testing.T) {
validateAnnotations: true,
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{
Alert: "alert",
@@ -171,7 +216,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "duplicate",
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
@@ -184,7 +230,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "duplicate",
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{Record: "record", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
@@ -197,7 +244,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "duplicate",
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
@@ -210,7 +258,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "",
},
{
group: &Group{Name: "test",
group: &Group{
Name: "test",
Rules: []Rule{
{Record: "alert", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
@@ -223,7 +272,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "",
},
{
group: &Group{Name: "test thanos",
group: &Group{
Name: "test thanos",
Type: NewRawType("thanos"),
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
@@ -235,7 +285,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "unknown datasource type",
},
{
group: &Group{Name: "test graphite",
group: &Group{
Name: "test graphite",
Type: NewGraphiteType(),
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
@@ -247,7 +298,8 @@ func TestGroup_Validate(t *testing.T) {
expErr: "",
},
{
group: &Group{Name: "test prometheus",
group: &Group{
Name: "test prometheus",
Type: NewPrometheusType(),
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
@@ -538,6 +590,24 @@ rules:
`)
})
t.Run("`notifier_headers` change", func(t *testing.T) {
f(t, `
name: TestGroup
notifier_headers:
- "TenantID: foo"
rules:
- alert: foo
expr: sum by(job) (up == 1)
`, `
name: TestGroup
notifier_headers:
- "TenantID: bar"
rules:
- alert: foo
expr: sum by(job) (up == 1)
`)
})
t.Run("`debug` change", func(t *testing.T) {
f(t, `
name: TestGroup

View File

@@ -4,9 +4,10 @@ import (
"fmt"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config/fslocal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config/fsurl"
)
// FS represent a file system abstract for reading files.
@@ -32,18 +33,16 @@ var (
)
// readFromFS parses the given path list and inits FS for each item.
// Once inited, readFromFS will try to read and return files from each FS.
// Once initialed, readFromFS will try to read and return files from each FS.
// readFromFS returns an error if at least one FS failed to init.
// The function can be called multiple times but each unique path
// will be inited only once.
// If silent == true, readFromFS will not emit any logs.
// will be initialed only once.
//
// It is allowed to mix different FS types in path list.
func readFromFS(paths []string, silent bool) (map[string][]byte, error) {
func readFromFS(paths []string) (map[string][]byte, error) {
var err error
result := make(map[string][]byte)
for _, path := range paths {
fsRegistryMu.Lock()
fs, ok := fsRegistry[path]
if !ok {
@@ -65,18 +64,19 @@ func readFromFS(paths []string, silent bool) (map[string][]byte, error) {
return nil, fmt.Errorf("failed to list files from %q", fs)
}
if !silent {
logger.Infof("found %d files to read from %q", len(list), fs)
}
cLogger.Infof("found %d files to read from %q", len(list), fs)
if len(list) < 1 {
continue
}
ts := time.Now()
files, err := fs.Read(list)
if err != nil {
return nil, fmt.Errorf("error while reading files from %q: %w", fs, err)
}
cLogger.Infof("finished reading %d files in %v from %q", len(list), time.Since(ts), fs)
for k, v := range files {
if _, ok := result[k]; ok {
return nil, fmt.Errorf("duplicate found for file name %q: file names must be unique", k)
@@ -89,8 +89,9 @@ func readFromFS(paths []string, silent bool) (map[string][]byte, error) {
// newFS creates FS based on the give path.
// Supported file systems are: fs
func newFS(path string) (FS, error) {
func newFS(originPath string) (FS, error) {
scheme := "fs"
path := originPath
n := strings.Index(path, "://")
if n >= 0 {
scheme = path[:n]
@@ -102,6 +103,8 @@ func newFS(path string) (FS, error) {
switch scheme {
case "fs":
return &fslocal.FS{Pattern: path}, nil
case "http", "https":
return &fsurl.FS{Path: originPath}, nil
default:
return nil, fmt.Errorf("unsupported scheme %q", scheme)
}

View File

@@ -3,7 +3,8 @@ package fslocal
import (
"fmt"
"os"
"path/filepath"
"github.com/bmatcuk/doublestar/v4"
)
// FS represents a local file system
@@ -16,7 +17,7 @@ type FS struct {
// Init verifies that configured Pattern is correct
func (fs *FS) Init() error {
_, err := filepath.Glob(fs.Pattern)
_, err := doublestar.FilepathGlob(fs.Pattern)
return err
}
@@ -27,7 +28,7 @@ func (fs *FS) String() string {
// List returns the list of file names which will be read via Read fn
func (fs *FS) List() ([]string, error) {
matches, err := filepath.Glob(fs.Pattern)
matches, err := doublestar.FilepathGlob(fs.Pattern)
if err != nil {
return nil, fmt.Errorf("error while matching files via pattern %s: %w", fs.Pattern, err)
}

View File

@@ -0,0 +1,57 @@
package fsurl
import (
"fmt"
"io"
"net/http"
"net/url"
)
// FS represents a struct which can read content from URL Path
type FS struct {
// Path defines the URL to read the data from
Path string
}
// Init verifies that configured Path is correct
func (fs *FS) Init() error {
_, err := url.Parse(fs.Path)
return err
}
// String implements Stringer interface
func (fs *FS) String() string {
return fmt.Sprintf("URL {Path: %q}", fs.Path)
}
// List returns the list of file names which will be read via Read fn
// List isn't supported by FS and reads from Path only
func (fs *FS) List() ([]string, error) {
return []string{fs.Path}, nil
}
// Read returns a map of read files where
// key is the file name and value is file's content.
func (fs *FS) Read(files []string) (map[string][]byte, error) {
result := make(map[string][]byte)
for _, path := range files {
resp, err := http.Get(path)
if err != nil {
return nil, fmt.Errorf("failed to read from %q: %w", path, err)
}
data, err := io.ReadAll(resp.Body)
_ = resp.Body.Close()
if resp.StatusCode != http.StatusOK {
if len(data) > 4*1024 {
data = data[:4*1024]
}
return nil, fmt.Errorf("unexpected status code when fetching %q: %d, expecting %d; response: %q",
path, resp.StatusCode, http.StatusOK, data)
}
if err != nil {
return nil, fmt.Errorf("cannot read %q: %s", path, err)
}
result[path] = data
}
return result, nil
}

View File

@@ -0,0 +1,59 @@
package log
import (
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// Logger is using lib/logger for logging
// but can be suppressed via Suppress method
type Logger struct {
mu sync.RWMutex
disabled bool
}
// Suppress whether to ignore message logging.
// Once suppressed, logging continues to be ignored
// until logger is un-suppressed.
func (l *Logger) Suppress(v bool) {
l.mu.Lock()
l.disabled = v
l.mu.Unlock()
}
func (l *Logger) isDisabled() bool {
l.mu.RLock()
defer l.mu.RUnlock()
return l.disabled
}
// Errorf logs error message.
func (l *Logger) Errorf(format string, args ...interface{}) {
if l.isDisabled() {
return
}
logger.Errorf(format, args...)
}
// Warnf logs warning message.
func (l *Logger) Warnf(format string, args ...interface{}) {
if l.isDisabled() {
return
}
logger.Warnf(format, args...)
}
// Infof logs info message.
func (l *Logger) Infof(format string, args ...interface{}) {
if l.isDisabled() {
return
}
logger.Infof(format, args...)
}
// Panicf logs panic message and panics.
// Panicf can't be suppressed
func (l *Logger) Panicf(format string, args ...interface{}) {
logger.Panicf(format, args...)
}

View File

@@ -0,0 +1,54 @@
package log
import (
"bytes"
"fmt"
"strings"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
func TestOutput(t *testing.T) {
testOutput := &bytes.Buffer{}
logger.SetOutputForTests(testOutput)
defer logger.ResetOutputForTest()
log := &Logger{}
mustMatch := func(exp string) {
t.Helper()
if exp == "" {
if testOutput.String() != "" {
t.Errorf("expected output to be empty; got %q", testOutput.String())
return
}
}
if !strings.Contains(testOutput.String(), exp) {
t.Errorf("output %q should contain %q", testOutput.String(), exp)
}
fmt.Println(testOutput.String())
testOutput.Reset()
}
log.Warnf("foo")
mustMatch("foo")
log.Infof("info %d", 2)
mustMatch("info 2")
log.Errorf("error %s %d", "baz", 5)
mustMatch("error baz 5")
log.Suppress(true)
log.Warnf("foo")
mustMatch("")
log.Infof("info %d", 2)
mustMatch("")
log.Errorf("error %q %d", "baz", 5)
mustMatch("")
}

View File

@@ -5,6 +5,8 @@ groups:
limit: 1000
headers:
- "MyHeader: foo"
notifier_headers:
- "MyHeader: foo"
params:
denyPartialResponse: ["true"]
rules:

View File

@@ -13,11 +13,22 @@ type Querier interface {
// It returns list of Metric in response, the http.Request used for sending query
// and error if any. Returned http.Request can't be reused and its body is already read.
// Query should stop once ctx is cancelled.
Query(ctx context.Context, query string, ts time.Time) ([]Metric, *http.Request, error)
Query(ctx context.Context, query string, ts time.Time) (Result, *http.Request, error)
// QueryRange executes range request with the given query on the given time range.
// It returns list of Metric in response and error if any.
// QueryRange should stop once ctx is cancelled.
QueryRange(ctx context.Context, query string, from, to time.Time) ([]Metric, error)
QueryRange(ctx context.Context, query string, from, to time.Time) (Result, error)
}
// Result represents expected response from the datasource
type Result struct {
// Data contains list of received Metric
Data []Metric
// SeriesFetched contains amount of time series processed by datasource
// during query evaluation.
// If nil, then this feature is not supported by the datasource.
// SeriesFetched is supported by VictoriaMetrics since v1.90.
SeriesFetched *int
}
// QuerierBuilder builds Querier with given params.

View File

@@ -47,7 +47,7 @@ var (
"For example, if -datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query. "+
"If set to 0, rule's evaluation interval will be used instead.")
queryTimeAlignment = flag.Bool("datasource.queryTimeAlignment", true, `Whether to align "time" parameter with evaluation interval.`+
"Alignment supposed to produce deterministic results despite of number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
"Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
disableKeepAlive = flag.Bool("datasource.disableKeepAlive", false, `Whether to disable long-lived connections to the datasource. `+
`If true, disables HTTP keep-alives and will only use the connection to the server for a single HTTP request.`)

View File

@@ -2,6 +2,7 @@ package datasource
import (
"context"
"errors"
"fmt"
"io"
"net/http"
@@ -28,6 +29,7 @@ func toDatasourceType(s string) datasourceType {
}
// VMStorage represents vmstorage entity with ability to read and write metrics
// WARN: when adding a new field, remember to update Clone() method.
type VMStorage struct {
c *http.Client
authCfg *promauth.Config
@@ -53,29 +55,54 @@ type keyValue struct {
// Clone makes clone of VMStorage, shares http client.
func (s *VMStorage) Clone() *VMStorage {
return &VMStorage{
ns := &VMStorage{
c: s.c,
authCfg: s.authCfg,
datasourceURL: s.datasourceURL,
appendTypePrefix: s.appendTypePrefix,
lookBack: s.lookBack,
queryStep: s.queryStep,
appendTypePrefix: s.appendTypePrefix,
dataSourceType: s.dataSourceType,
dataSourceType: s.dataSourceType,
evaluationInterval: s.evaluationInterval,
// init map so it can be populated below
extraParams: url.Values{},
debug: s.debug,
}
if len(s.extraHeaders) > 0 {
ns.extraHeaders = make([]keyValue, len(s.extraHeaders))
copy(ns.extraHeaders, s.extraHeaders)
}
for k, v := range s.extraParams {
ns.extraParams[k] = v
}
return ns
}
// ApplyParams - changes given querier params.
func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage {
s.dataSourceType = toDatasourceType(params.DataSourceType)
s.evaluationInterval = params.EvaluationInterval
s.extraParams = params.QueryParams
s.debug = params.Debug
if params.QueryParams != nil {
if s.extraParams == nil {
s.extraParams = url.Values{}
}
for k, vl := range params.QueryParams {
for _, v := range vl { // custom query params are prior to default ones
s.extraParams.Set(k, v)
}
}
}
if params.Headers != nil {
for key, value := range params.Headers {
kv := keyValue{key: key, value: value}
s.extraHeaders = append(s.extraHeaders, kv)
}
}
s.debug = params.Debug
return s
}
@@ -94,14 +121,15 @@ func NewVMStorage(baseURL string, authCfg *promauth.Config, lookBack time.Durati
lookBack: lookBack,
queryStep: queryStep,
dataSourceType: datasourcePrometheus,
extraParams: url.Values{},
}
}
// Query executes the given query and returns parsed response
func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) ([]Metric, *http.Request, error) {
func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) (Result, *http.Request, error) {
req, err := s.newRequestPOST()
if err != nil {
return nil, nil, err
return Result{}, nil, err
}
switch s.dataSourceType {
@@ -110,12 +138,12 @@ func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) ([]Me
case datasourceGraphite:
s.setGraphiteReqParams(req, query, ts)
default:
return nil, nil, fmt.Errorf("engine not found: %q", s.dataSourceType)
return Result{}, nil, fmt.Errorf("engine not found: %q", s.dataSourceType)
}
resp, err := s.do(ctx, req)
if err != nil {
return nil, req, err
return Result{}, req, err
}
defer func() {
_ = resp.Body.Close()
@@ -132,24 +160,24 @@ func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) ([]Me
// QueryRange executes the given query on the given time range.
// For Prometheus type see https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries
// Graphite type isn't supported.
func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end time.Time) ([]Metric, error) {
func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end time.Time) (res Result, err error) {
if s.dataSourceType != datasourcePrometheus {
return nil, fmt.Errorf("%q is not supported for QueryRange", s.dataSourceType)
return res, fmt.Errorf("%q is not supported for QueryRange", s.dataSourceType)
}
req, err := s.newRequestPOST()
if err != nil {
return nil, err
return res, err
}
if start.IsZero() {
return nil, fmt.Errorf("start param is missing")
return res, fmt.Errorf("start param is missing")
}
if end.IsZero() {
return nil, fmt.Errorf("end param is missing")
return res, fmt.Errorf("end param is missing")
}
s.setPrometheusRangeReqParams(req, query, start, end)
resp, err := s.do(ctx, req)
if err != nil {
return nil, err
return res, err
}
defer func() {
_ = resp.Body.Close()
@@ -162,6 +190,11 @@ func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response,
logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, req.URL.RawQuery)
}
resp, err := s.c.Do(req.WithContext(ctx))
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
// something in the middle between client and datasource might be closing
// the connection. So we do a one more attempt in hope request will succeed.
resp, err = s.c.Do(req.WithContext(ctx))
}
if err != nil {
return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err)
}

View File

@@ -35,12 +35,12 @@ func (r graphiteResponse) metrics() []Metric {
return ms
}
func parseGraphiteResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
func parseGraphiteResponse(req *http.Request, resp *http.Response) (Result, error) {
r := &graphiteResponse{}
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
return nil, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL.Redacted(), err)
return Result{}, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL.Redacted(), err)
}
return r.metrics(), nil
return Result{Data: r.metrics()}, nil
}
const (

View File

@@ -22,6 +22,10 @@ type promResponse struct {
ResultType string `json:"resultType"`
Result json.RawMessage `json:"result"`
} `json:"data"`
// Stats supported by VictoriaMetrics since v1.90
Stats struct {
SeriesFetched *string `json:"seriesFetched,omitempty"`
} `json:"stats,omitempty"`
}
type promInstant struct {
@@ -96,39 +100,54 @@ const (
rtVector, rtMatrix, rScalar = "vector", "matrix", "scalar"
)
func parsePrometheusResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
func parsePrometheusResponse(req *http.Request, resp *http.Response) (res Result, err error) {
r := &promResponse{}
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
return nil, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL.Redacted(), err)
if err = json.NewDecoder(resp.Body).Decode(r); err != nil {
return res, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL.Redacted(), err)
}
if r.Status == statusError {
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL.Redacted(), r.ErrorType, r.Error)
return res, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL.Redacted(), r.ErrorType, r.Error)
}
if r.Status != statusSuccess {
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
return res, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
}
var parseFn func() ([]Metric, error)
switch r.Data.ResultType {
case rtVector:
var pi promInstant
if err := json.Unmarshal(r.Data.Result, &pi.Result); err != nil {
return nil, fmt.Errorf("umarshal err %s; \n %#v", err, string(r.Data.Result))
return res, fmt.Errorf("umarshal err %s; \n %#v", err, string(r.Data.Result))
}
return pi.metrics()
parseFn = pi.metrics
case rtMatrix:
var pr promRange
if err := json.Unmarshal(r.Data.Result, &pr.Result); err != nil {
return nil, err
return res, err
}
return pr.metrics()
parseFn = pr.metrics
case rScalar:
var ps promScalar
if err := json.Unmarshal(r.Data.Result, &ps); err != nil {
return nil, err
return res, err
}
return ps.metrics()
parseFn = ps.metrics
default:
return nil, fmt.Errorf("unknown result type %q", r.Data.ResultType)
return res, fmt.Errorf("unknown result type %q", r.Data.ResultType)
}
ms, err := parseFn()
if err != nil {
return res, err
}
res = Result{Data: ms}
if r.Stats.SeriesFetched != nil {
intV, err := strconv.Atoi(*r.Stats.SeriesFetched)
if err != nil {
return res, fmt.Errorf("failed to convert stats.seriesFetched to int: %w", err)
}
res.SeriesFetched = &intV
}
return res, nil
}
func (s *VMStorage) setPrometheusInstantReqParams(r *http.Request, query string, timestamp time.Time) {

View File

@@ -35,13 +35,6 @@ func TestVMInstantQuery(t *testing.T) {
t.Errorf("should not be called")
})
c := -1
mux.HandleFunc("/render", func(w http.ResponseWriter, request *http.Request) {
c++
switch c {
case 8:
w.Write([]byte(`[{"target":"constantLine(10)","tags":{"name":"constantLine(10)"},"datapoints":[[10,1611758343],[10,1611758373],[10,1611758403]]}]`))
}
})
mux.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
c++
if r.Method != http.MethodPost {
@@ -62,22 +55,28 @@ func TestVMInstantQuery(t *testing.T) {
}
switch c {
case 0:
conn, _, _ := w.(http.Hijacker).Hijack()
_ = conn.Close()
case 1:
w.WriteHeader(500)
case 2:
case 1:
w.Write([]byte("[]"))
case 3:
case 2:
w.Write([]byte(`{"status":"error", "errorType":"type:", "error":"some error msg"}`))
case 4:
case 3:
w.Write([]byte(`{"status":"unknown"}`))
case 5:
case 4:
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix"}}`))
case 6:
case 5:
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"vm_rows","foo":"bar"},"value":[1583786142,"13763"]},{"metric":{"__name__":"vm_requests","foo":"baz"},"value":[1583786140,"2000"]}]}}`))
case 7:
case 6:
w.Write([]byte(`{"status":"success","data":{"resultType":"scalar","result":[1583786142, "1"]}}`))
case 7:
w.Write([]byte(`{"status":"success","data":{"resultType":"scalar","result":[1583786142, "1"]},"stats":{"seriesFetched": "42"}}`))
}
})
mux.HandleFunc("/render", func(w http.ResponseWriter, request *http.Request) {
c++
switch c {
case 8:
w.Write([]byte(`[{"target":"constantLine(10)","tags":{"name":"constantLine(10)"},"datapoints":[[10,1611758343],[10,1611758373],[10,1611758403]]}]`))
}
})
@@ -95,24 +94,27 @@ func TestVMInstantQuery(t *testing.T) {
ts := time.Now()
expErr := func(err string) {
if _, _, err := pq.Query(ctx, query, ts); err == nil {
_, _, gotErr := pq.Query(ctx, query, ts)
if gotErr == nil {
t.Fatalf("expected %q got nil", err)
}
if !strings.Contains(gotErr.Error(), err) {
t.Fatalf("expected err %q; got %q", err, gotErr)
}
}
expErr("connection error") // 0
expErr("invalid response status error") // 1
expErr("response body error") // 2
expErr("error status") // 3
expErr("unknown status") // 4
expErr("non-vector resultType error") // 5
expErr("500") // 0
expErr("error parsing prometheus metrics") // 1
expErr("response error") // 2
expErr("unknown status") // 3
expErr("unexpected end of JSON input") // 4
m, _, err := pq.Query(ctx, query, ts) // 6 - vector
res, _, err := pq.Query(ctx, query, ts) // 5 - vector
if err != nil {
t.Fatalf("unexpected %s", err)
}
if len(m) != 2 {
t.Fatalf("expected 2 metrics got %d in %+v", len(m), m)
if len(res.Data) != 2 {
t.Fatalf("expected 2 metrics got %d in %+v", len(res.Data), res.Data)
}
expected := []Metric{
{
@@ -126,17 +128,17 @@ func TestVMInstantQuery(t *testing.T) {
Values: []float64{2000},
},
}
metricsEqual(t, m, expected)
metricsEqual(t, res.Data, expected)
m, req, err := pq.Query(ctx, query, ts) // 7 - scalar
res, req, err := pq.Query(ctx, query, ts) // 6 - scalar
if err != nil {
t.Fatalf("unexpected %s", err)
}
if req == nil {
t.Fatalf("expected request to be non-nil")
}
if len(m) != 1 {
t.Fatalf("expected 1 metrics got %d in %+v", len(m), m)
if len(res.Data) != 1 {
t.Fatalf("expected 1 metrics got %d in %+v", len(res.Data), res.Data)
}
expected = []Metric{
{
@@ -144,18 +146,44 @@ func TestVMInstantQuery(t *testing.T) {
Values: []float64{1},
},
}
if !reflect.DeepEqual(m, expected) {
t.Fatalf("unexpected metric %+v want %+v", m, expected)
if !reflect.DeepEqual(res.Data, expected) {
t.Fatalf("unexpected metric %+v want %+v", res.Data, expected)
}
if res.SeriesFetched != nil {
t.Fatalf("expected `seriesFetched` field to be nil when it is missing in datasource response; got %v instead",
res.SeriesFetched)
}
res, _, err = pq.Query(ctx, query, ts) // 7 - scalar with stats
if err != nil {
t.Fatalf("unexpected %s", err)
}
if len(res.Data) != 1 {
t.Fatalf("expected 1 metrics got %d in %+v", len(res.Data), res)
}
expected = []Metric{
{
Timestamps: []int64{1583786142},
Values: []float64{1},
},
}
if !reflect.DeepEqual(res.Data, expected) {
t.Fatalf("unexpected metric %+v want %+v", res.Data, expected)
}
if *res.SeriesFetched != 42 {
t.Fatalf("expected `seriesFetched` field to be 42; got %d instead",
*res.SeriesFetched)
}
gq := s.BuildWithParams(QuerierParams{DataSourceType: string(datasourceGraphite)})
m, _, err = gq.Query(ctx, queryRender, ts) // 8 - graphite
res, _, err = gq.Query(ctx, queryRender, ts) // 8 - graphite
if err != nil {
t.Fatalf("unexpected %s", err)
}
if len(m) != 1 {
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
if len(res.Data) != 1 {
t.Fatalf("expected 1 metric got %d in %+v", len(res.Data), res.Data)
}
exp := []Metric{
{
@@ -164,7 +192,77 @@ func TestVMInstantQuery(t *testing.T) {
Values: []float64{10},
},
}
metricsEqual(t, m, exp)
metricsEqual(t, res.Data, exp)
}
func TestVMInstantQueryWithRetry(t *testing.T) {
mux := http.NewServeMux()
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
t.Errorf("should not be called")
})
c := -1
mux.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
c++
if r.URL.Query().Get("query") != query {
t.Errorf("expected %s in query param, got %s", query, r.URL.Query().Get("query"))
}
switch c {
case 0:
w.Write([]byte(`{"status":"success","data":{"resultType":"scalar","result":[1583786142, "1"]}}`))
case 1:
conn, _, _ := w.(http.Hijacker).Hijack()
_ = conn.Close()
case 2:
w.Write([]byte(`{"status":"success","data":{"resultType":"scalar","result":[1583786142, "2"]}}`))
case 3:
conn, _, _ := w.(http.Hijacker).Hijack()
_ = conn.Close()
case 4:
conn, _, _ := w.(http.Hijacker).Hijack()
_ = conn.Close()
}
})
srv := httptest.NewServer(mux)
defer srv.Close()
s := NewVMStorage(srv.URL, nil, time.Minute, 0, false, srv.Client())
pq := s.BuildWithParams(QuerierParams{DataSourceType: string(datasourcePrometheus)})
expErr := func(err string) {
_, _, gotErr := pq.Query(ctx, query, time.Now())
if gotErr == nil {
t.Fatalf("expected %q got nil", err)
}
if !strings.Contains(gotErr.Error(), err) {
t.Fatalf("expected err %q; got %q", err, gotErr)
}
}
expValue := func(v float64) {
res, _, err := pq.Query(ctx, query, time.Now())
if err != nil {
t.Fatalf("unexpected %s", err)
}
m := res.Data
if len(m) != 1 {
t.Fatalf("expected 1 metrics got %d in %+v", len(m), m)
}
expected := []Metric{
{
Timestamps: []int64{1583786142},
Values: []float64{v},
},
}
if !reflect.DeepEqual(m, expected) {
t.Fatalf("unexpected metric %+v want %+v", m, expected)
}
}
expValue(1) // 0
expValue(2) // 1 - fail, 2 - retry
expErr("EOF") // 3, 4 - retries
}
func metricsEqual(t *testing.T, gotM, expectedM []Metric) {
@@ -250,10 +348,11 @@ func TestVMRangeQuery(t *testing.T) {
start, end := time.Now().Add(-time.Minute), time.Now()
m, err := pq.QueryRange(ctx, query, start, end)
res, err := pq.QueryRange(ctx, query, start, end)
if err != nil {
t.Fatalf("unexpected %s", err)
}
m := res.Data
if len(m) != 1 {
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
}
@@ -279,6 +378,9 @@ func TestRequestParams(t *testing.T) {
}
query := "up"
timestamp := time.Date(2001, 2, 3, 4, 5, 6, 0, time.UTC)
storage := VMStorage{
extraParams: url.Values{"round_digits": {"10"}},
}
testCases := []struct {
name string
queryRange bool
@@ -475,6 +577,17 @@ func TestRequestParams(t *testing.T) {
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"custom params overrides the original params",
false,
storage.Clone().ApplyParams(QuerierParams{
QueryParams: url.Values{"round_digits": {"2"}},
}),
func(t *testing.T, r *http.Request) {
exp := fmt.Sprintf("query=%s&round_digits=2&time=%d", query, timestamp.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"graphite extra params",
false,

View File

@@ -36,9 +36,10 @@ type Group struct {
Checksum string
LastEvaluation time.Time
Labels map[string]string
Params url.Values
Headers map[string]string
Labels map[string]string
Params url.Values
Headers map[string]string
NotifierHeaders map[string]string
doneCh chan struct{}
finishedCh chan struct{}
@@ -93,16 +94,17 @@ func mergeLabels(groupName, ruleName string, set1, set2 map[string]string) map[s
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
g := &Group{
Type: cfg.Type,
Name: cfg.Name,
File: cfg.File,
Interval: cfg.Interval.Duration(),
Limit: cfg.Limit,
Concurrency: cfg.Concurrency,
Checksum: cfg.Checksum,
Params: cfg.Params,
Headers: make(map[string]string),
Labels: cfg.Labels,
Type: cfg.Type,
Name: cfg.Name,
File: cfg.File,
Interval: cfg.Interval.Duration(),
Limit: cfg.Limit,
Concurrency: cfg.Concurrency,
Checksum: cfg.Checksum,
Params: cfg.Params,
Headers: make(map[string]string),
NotifierHeaders: make(map[string]string),
Labels: cfg.Labels,
doneCh: make(chan struct{}),
finishedCh: make(chan struct{}),
@@ -117,6 +119,9 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
for _, h := range cfg.Headers {
g.Headers[h.Key] = h.Value
}
for _, h := range cfg.NotifierHeaders {
g.NotifierHeaders[h.Key] = h.Value
}
g.metrics = newGroupMetrics(g)
rules := make([]Rule, len(cfg.Rules))
for i, r := range cfg.Rules {
@@ -230,6 +235,7 @@ func (g *Group) updateWith(newGroup *Group) error {
g.Concurrency = newGroup.Concurrency
g.Params = newGroup.Params
g.Headers = newGroup.Headers
g.NotifierHeaders = newGroup.NotifierHeaders
g.Labels = newGroup.Labels
g.Limit = newGroup.Limit
g.Checksum = newGroup.Checksum
@@ -294,7 +300,9 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
e := &executor{
rw: rw,
notifiers: nts,
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label)}
notifierHeaders: g.NotifierHeaders,
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
evalTS := time.Now()
@@ -370,6 +378,8 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
// ensure that staleness is tracked or existing rules only
e.purgeStaleSeries(g.Rules)
e.notifierHeaders = g.NotifierHeaders
if g.Interval != ng.Interval {
g.Interval = ng.Interval
t.Stop()
@@ -403,8 +413,10 @@ func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Du
}
type executor struct {
notifiers func() []notifier.Notifier
rw *remotewrite.Client
notifiers func() []notifier.Notifier
notifierHeaders map[string]string
rw *remotewrite.Client
previouslySentSeriesToRWMu sync.Mutex
// previouslySentSeriesToRW stores series sent to RW on previous iteration
@@ -504,7 +516,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
for _, nt := range e.notifiers() {
wg.Add(1)
go func(nt notifier.Notifier) {
if err := nt.Send(ctx, alerts); err != nil {
if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil {
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", rule, nt.Addr(), err))
}
wg.Done()

View File

@@ -44,21 +44,21 @@ func (fq *fakeQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Qu
return fq
}
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) ([]datasource.Metric, error) {
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
req, _, err := fq.Query(ctx, q, time.Now())
return req, err
}
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ time.Time) ([]datasource.Metric, *http.Request, error) {
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ time.Time) (datasource.Result, *http.Request, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {
return nil, nil, fq.err
return datasource.Result{}, nil, fq.err
}
cp := make([]datasource.Metric, len(fq.metrics))
copy(cp, fq.metrics)
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
return cp, req, nil
return datasource.Result{Data: cp}, req, nil
}
type fakeQuerierWithRegistry struct {
@@ -85,23 +85,23 @@ func (fqr *fakeQuerierWithRegistry) BuildWithParams(_ datasource.QuerierParams)
return fqr
}
func (fqr *fakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) ([]datasource.Metric, error) {
func (fqr *fakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
req, _, err := fqr.Query(ctx, q, time.Now())
return req, err
}
func (fqr *fakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) ([]datasource.Metric, *http.Request, error) {
func (fqr *fakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (datasource.Result, *http.Request, error) {
fqr.Lock()
defer fqr.Unlock()
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
metrics, ok := fqr.registry[expr]
if !ok {
return nil, req, nil
return datasource.Result{}, req, nil
}
cp := make([]datasource.Metric, len(metrics))
copy(cp, metrics)
return cp, req, nil
return datasource.Result{Data: cp}, req, nil
}
type fakeQuerierWithDelay struct {
@@ -109,7 +109,7 @@ type fakeQuerierWithDelay struct {
delay time.Duration
}
func (fqd *fakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) ([]datasource.Metric, *http.Request, error) {
func (fqd *fakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (datasource.Result, *http.Request, error) {
timer := time.NewTimer(fqd.delay)
select {
case <-ctx.Done():
@@ -131,7 +131,7 @@ type fakeNotifier struct {
func (*fakeNotifier) Close() {}
func (*fakeNotifier) Addr() string { return "" }
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert, _ map[string]string) error {
fn.Lock()
defer fn.Unlock()
fn.counter += len(alerts)
@@ -155,7 +155,7 @@ type faultyNotifier struct {
fakeNotifier
}
func (fn *faultyNotifier) Send(ctx context.Context, _ []notifier.Alert) error {
func (fn *faultyNotifier) Send(ctx context.Context, _ []notifier.Alert, _ map[string]string) error {
d, ok := ctx.Deadline()
if ok {
time.Sleep(time.Until(d))

View File

@@ -10,6 +10,8 @@ import (
"strings"
"time"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
@@ -24,15 +26,16 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics"
"github.com/VictoriaMetrics/metrics"
)
var (
rulePath = flagutil.NewArrayString("rule", `Path to the files with alerting and/or recording rules.
rulePath = flagutil.NewArrayString("rule", `Path to the files or http url with alerting and/or recording rules.
Supports hierarchical patterns and regexpes.
Examples:
-rule="/path/to/file". Path to a single file with alerting rules
-rule="/path/to/file". Path to a single file with alerting rules.
-rule="http://<some-server-addr>/path/to/rules". HTTP URL to a page with alerting rules.
-rule="dir/*.yaml" -rule="/*.yaml" -rule="gcs://vmalert-rules/tenant_%{TENANT_ID}/prod".
-rule="dir/**/*.yaml". Includes all the .yaml files in "dir" subfolders recursively.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
Enterprise version of vmalert supports S3 and GCS paths to rules.
@@ -47,13 +50,15 @@ See https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-stor
Examples:
-rule.templates="/path/to/file". Path to a single file with go templates
-rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder,
absolute path to all .tpl files in root.`)
absolute path to all .tpl files in root.
-rule.templates="dir/**/*.tpl". Includes all the .tpl files in "dir" subfolders recursively.
`)
rulesCheckInterval = flag.Duration("rule.configCheckInterval", 0, "Interval for checking for changes in '-rule' files. "+
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead")
"By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead")
configCheckInterval = flag.Duration("configCheckInterval", 0, "Interval for checking for changes in '-rule' or '-notifier.config' files. "+
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes.")
"By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes.")
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections. See also -httpListenAddr.useProxyProtocol")
useProxyProtocol = flag.Bool("httpListenAddr.useProxyProtocol", false, "Whether to use proxy protocol for connections accepted at -httpListenAddr . "+
@@ -67,13 +72,14 @@ absolute path to all .tpl files in root.`)
"which by default is 4 times evaluationInterval of the parent group.")
resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overriden per rule via update_entries_limit param.")
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager `+
`for cases where you want to build a custom link to Grafana, Prometheus or any other service. `+
`Supports templating - see https://docs.victoriametrics.com/vmalert.html#templating . `+
`For example, link to Grafana: -external.alert.source='explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr":{{$expr|jsonEscape|queryEscape}} },{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]' . `+
`For example, link to Grafana: -external.alert.source='explore?orgId=1&left={"datasource":"VictoriaMetrics","queries":[{"expr":{{$expr|jsonEscape|queryEscape}},"refId":"A"}],"range":{"from":"now-1h","to":"now"}}'. `+
`Link to VMUI: -external.alert.source='vmui/#/?g0.expr={{.Expr|queryEscape}}'. `+
`If empty 'vmalert/alert?group_id={{.GroupID}}&alert_id={{.AlertID}}' is used.`)
externalLabels = flagutil.NewArrayString("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+
"Pass multiple -label flags in order to add multiple label sets.")
@@ -281,7 +287,10 @@ func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, vali
"tpl": externalAlertSource,
}
return func(alert notifier.Alert) string {
templated, err := alert.ExecTemplate(nil, alert.Labels, m)
qFn := func(query string) ([]datasource.Metric, error) {
return nil, fmt.Errorf("`query` template isn't supported for alert source template")
}
templated, err := alert.ExecTemplate(qFn, alert.Labels, m)
if err != nil {
logger.Errorf("can not exec source template %s", err)
}
@@ -319,6 +328,7 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
// init reload metrics with positive values to improve alerting conditions
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
parseFn := config.Parse
for {
select {
case <-ctx.Done():
@@ -330,7 +340,11 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
}
logger.Infof("SIGHUP received. Going to reload rules %q %s...", *rulePath, tmplMsg)
configReloads.Inc()
// allow logs emitting during manual config reload
parseFn = config.Parse
case <-configCheckCh:
// disable logs emitting during per-interval config reload
parseFn = config.ParseSilent
}
if err := notifier.Reload(); err != nil {
configReloadErrors.Inc()
@@ -345,7 +359,7 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
logger.Errorf("failed to load new templates: %s", err)
continue
}
newGroupsCfg, err := config.ParseSilent(*rulePath, validateTplFn, *validateExpressions)
newGroupsCfg, err := parseFn(*rulePath, validateTplFn, *validateExpressions)
if err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)

View File

@@ -176,16 +176,19 @@ func (g *Group) toAPI() APIGroup {
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.Seconds(),
LastEvaluation: g.LastEvaluation,
Concurrency: g.Concurrency,
Params: urlValuesToStrings(g.Params),
Headers: headersToStrings(g.Headers),
Labels: g.Labels,
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.Seconds(),
LastEvaluation: g.LastEvaluation,
Concurrency: g.Concurrency,
Params: urlValuesToStrings(g.Params),
Headers: headersToStrings(g.Headers),
NotifierHeaders: headersToStrings(g.NotifierHeaders),
Labels: g.Labels,
}
ag.Rules = make([]APIRule, 0)
for _, r := range g.Rules {
ag.Rules = append(ag.Rules, r.ToAPI())
}

View File

@@ -111,11 +111,7 @@ func (a *Alert) ExecTemplate(q templates.QueryFn, labels, annotations map[string
ActiveAt: a.ActiveAt,
For: a.For,
}
tmpl, err := templates.GetWithFuncs(templates.FuncsWithQuery(q))
if err != nil {
return nil, fmt.Errorf("error getting a template: %w", err)
}
return templateAnnotations(annotations, tplData, tmpl, true)
return ExecTemplate(q, annotations, tplData)
}
// ExecTemplate executes the given template for given annotations map.
@@ -174,6 +170,8 @@ func templateAnnotation(dst io.Writer, text string, data tplData, tmpl *textTpl.
if err != nil {
return fmt.Errorf("error cloning template before parse annotation: %w", err)
}
// Clone() doesn't copy tpl Options, so we set them manually
tpl = tpl.Option("missingkey=zero")
tpl, err = tpl.Parse(text)
if err != nil {
return fmt.Errorf("error parsing annotation template: %w", err)

View File

@@ -200,6 +200,9 @@ func TestAlert_ExecTemplate(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
if err := ValidateTemplates(tc.annotations); err != nil {
t.Fatal(err)
}
tpl, err := tc.alert.ExecTemplate(qFn, tc.alert.Labels, tc.annotations)
if err != nil {
t.Fatal(err)

View File

@@ -51,16 +51,16 @@ func (am *AlertManager) Close() {
func (am AlertManager) Addr() string { return am.addr }
// Send an alert or resolve message
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
func (am *AlertManager) Send(ctx context.Context, alerts []Alert, headers map[string]string) error {
am.metrics.alertsSent.Add(len(alerts))
err := am.send(ctx, alerts)
err := am.send(ctx, alerts, headers)
if err != nil {
am.metrics.alertsSendErrors.Add(len(alerts))
}
return err
}
func (am *AlertManager) send(ctx context.Context, alerts []Alert) error {
func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[string]string) error {
b := &bytes.Buffer{}
writeamRequest(b, alerts, am.argFunc, am.relabelConfigs)
@@ -69,6 +69,9 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert) error {
return err
}
req.Header.Set("Content-Type", "application/json")
for key, value := range headers {
req.Header.Set(key, value)
}
if am.timeout > 0 {
var cancel context.CancelFunc
@@ -105,7 +108,8 @@ const alertManagerPath = "/api/v2/alerts"
// NewAlertManager is a constructor for AlertManager
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, authCfg promauth.HTTPClientConfig,
relabelCfg *promrelabel.ParsedConfigs, timeout time.Duration) (*AlertManager, error) {
relabelCfg *promrelabel.ParsedConfigs, timeout time.Duration,
) (*AlertManager, error) {
tls := &promauth.TLSConfig{}
if authCfg.TLSConfig != nil {
tls = authCfg.TLSConfig

View File

@@ -25,6 +25,7 @@ func TestAlertManager_Addr(t *testing.T) {
func TestAlertManager_Send(t *testing.T) {
const baUser, baPass = "foo", "bar"
const headerKey, headerValue = "TenantID", "foo"
mux := http.NewServeMux()
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
t.Errorf("should not be called")
@@ -73,6 +74,11 @@ func TestAlertManager_Send(t *testing.T) {
if a[0].EndAt.IsZero() {
t.Errorf("expected non-zero end time")
}
case 3:
if r.Header.Get(headerKey) != headerValue {
t.Errorf("expected header %q to be set to %q; got %q instead",
headerKey, headerValue, r.Header.Get(headerKey))
}
}
})
srv := httptest.NewServer(mux)
@@ -90,10 +96,10 @@ func TestAlertManager_Send(t *testing.T) {
if err != nil {
t.Errorf("unexpected error: %s", err)
}
if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {
if err := am.Send(context.Background(), []Alert{{}, {}}, nil); err == nil {
t.Error("expected connection error got nil")
}
if err := am.Send(context.Background(), []Alert{}); err == nil {
if err := am.Send(context.Background(), []Alert{}, nil); err == nil {
t.Error("expected wrong http code error got nil")
}
if err := am.Send(context.Background(), []Alert{{
@@ -102,10 +108,13 @@ func TestAlertManager_Send(t *testing.T) {
Start: time.Now().UTC(),
End: time.Now().UTC(),
Annotations: map[string]string{"a": "b", "c": "d", "e": "f"},
}}); err != nil {
}}, nil); err != nil {
t.Errorf("unexpected error %s", err)
}
if c != 2 {
t.Errorf("expected 2 calls(count from zero) to server got %d", c)
}
if err := am.Send(context.Background(), nil, map[string]string{headerKey: headerValue}); err != nil {
t.Errorf("unexpected error %s", err)
}
}

View File

@@ -31,9 +31,9 @@ var (
tlsCertFile = flagutil.NewArrayString("notifier.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
tlsKeyFile = flagutil.NewArrayString("notifier.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -notifier.url")
tlsCAFile = flagutil.NewArrayString("notifier.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -notifier.url. "+
"By default system CA is used")
"By default, system CA is used")
tlsServerName = flagutil.NewArrayString("notifier.tlsServerName", "Optional TLS server name to use for connections to -notifier.url. "+
"By default the server name from -notifier.url is used")
"By default, the server name from -notifier.url is used")
oauth2ClientID = flagutil.NewArrayString("notifier.oauth2.clientID", "Optional OAuth2 clientID to use for -notifier.url. "+
"If multiple args are set, then they are applied independently for the corresponding -notifier.url")
@@ -81,10 +81,6 @@ var (
//
// Init returns an error if both mods are used.
func Init(gen AlertURLGenerator, extLabels map[string]string, extURL string) (func() []Notifier, error) {
if externalLabels != nil || externalURL != "" {
return nil, fmt.Errorf("BUG: notifier.Init was called multiple times")
}
externalURL = extURL
externalLabels = extLabels
eu, err := url.Parse(externalURL)

View File

@@ -0,0 +1,37 @@
package notifier
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"testing"
)
func TestInit(t *testing.T) {
oldAddrs := *addrs
defer func() { *addrs = oldAddrs }()
*addrs = flagutil.ArrayString{"127.0.0.1", "127.0.0.2"}
fn, err := Init(nil, nil, "")
if err != nil {
t.Fatalf("%s", err)
}
nfs := fn()
if len(nfs) != 2 {
t.Fatalf("expected to get 2 notifiers; got %d", len(nfs))
}
targets := GetTargets()
if targets == nil || targets[TargetStatic] == nil {
t.Fatalf("expected to get static targets in response")
}
nf1 := targets[TargetStatic][0]
if nf1.Addr() != "127.0.0.1/api/v2/alerts" {
t.Fatalf("expected to get \"127.0.0.1/api/v2/alerts\"; got %q instead", nf1.Addr())
}
nf2 := targets[TargetStatic][1]
if nf2.Addr() != "127.0.0.2/api/v2/alerts" {
t.Fatalf("expected to get \"127.0.0.2/api/v2/alerts\"; got %q instead", nf2.Addr())
}
}

View File

@@ -7,7 +7,7 @@ type Notifier interface {
// Send sends the given list of alerts.
// Returns an error if fails to send the alerts.
// Must unblock if the given ctx is cancelled.
Send(ctx context.Context, alerts []Alert) error
Send(ctx context.Context, alerts []Alert, notifierHeaders map[string]string) error
// Addr returns address where alerts are sent.
Addr() string
// Close is a destructor for the Notifier

View File

@@ -99,13 +99,13 @@ func (rr *RecordingRule) Close() {
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
series, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
res, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
if err != nil {
return nil, err
}
duplicates := make(map[string]struct{}, len(series))
duplicates := make(map[string]struct{}, len(res.Data))
var tss []prompbmarshal.TimeSeries
for _, s := range series {
for _, s := range res.Data {
ts := rr.toTimeSeries(s)
key := stringifyLabels(ts)
if _, ok := duplicates[key]; ok {
@@ -120,13 +120,14 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
qMetrics, req, err := rr.q.Query(ctx, rr.Expr, ts)
res, req, err := rr.q.Query(ctx, rr.Expr, ts)
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(qMetrics),
curl: requestToCurl(req),
time: start,
at: ts,
duration: time.Since(start),
samples: len(res.Data),
seriesFetched: res.SeriesFetched,
curl: requestToCurl(req),
}
defer func() {
@@ -138,6 +139,7 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
return nil, curState.err
}
qMetrics := res.Data
numSeries := len(qMetrics)
if limit > 0 && numSeries > limit {
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
@@ -208,17 +210,18 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
func (rr *RecordingRule) ToAPI() APIRule {
lastState := rr.state.getLast()
r := APIRule{
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
LastSamples: lastState.samples,
MaxUpdates: rr.state.size(),
Updates: rr.state.getAll(),
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: rr.state.size(),
Updates: rr.state.getAll(),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),

View File

@@ -34,9 +34,9 @@ var (
tlsCertFile = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
tlsKeyFile = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
tlsCAFile = flag.String("remoteRead.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. "+
"By default system CA is used")
"By default, system CA is used")
tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
"By default the server name from -remoteRead.url is used")
"By default, the server name from -remoteRead.url is used")
oauth2ClientID = flag.String("remoteRead.oauth2.clientID", "", "Optional OAuth2 clientID to use for -remoteRead.url.")
oauth2ClientSecret = flag.String("remoteRead.oauth2.clientSecret", "", "Optional OAuth2 clientSecret to use for -remoteRead.url.")

View File

@@ -29,7 +29,7 @@ var (
bearerTokenFile = flag.String("remoteWrite.bearerTokenFile", "", "Optional path to bearer token file to use for -remoteWrite.url.")
maxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines defines max number of timeseries to be flushed at once")
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines max number of timeseries to be flushed at once")
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")
@@ -37,9 +37,9 @@ var (
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
"By default system CA is used")
"By default, system CA is used")
tlsServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
"By default the server name from -remoteWrite.url is used")
"By default, the server name from -remoteWrite.url is used")
oauth2ClientID = flag.String("remoteWrite.oauth2.clientID", "", "Optional OAuth2 clientID to use for -remoteWrite.url.")
oauth2ClientSecret = flag.String("remoteWrite.oauth2.clientSecret", "", "Optional OAuth2 clientSecret to use for -remoteWrite.url.")

View File

@@ -0,0 +1,20 @@
package remotewrite
import (
"context"
"testing"
)
func TestInit(t *testing.T) {
oldAddr := *addr
defer func() { *addr = oldAddr }()
*addr = "http://localhost:8428"
cl, err := Init(context.Background())
if err != nil {
t.Fatal(err)
}
if err := cl.Close(); err != nil {
t.Fatal(err)
}
}

View File

@@ -23,6 +23,8 @@ import (
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
@@ -147,6 +149,7 @@ func (c *Client) run(ctx context.Context) {
wr.Timeseries = append(wr.Timeseries, ts)
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
c.flush(lastCtx, wr)
cancel()
}
@@ -180,13 +183,18 @@ func (c *Client) run(ctx context.Context) {
var (
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote write endpoint. Flush performs limited amount of retries
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
@@ -201,9 +209,16 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
return
}
const attempts = 5
b := snappy.Encode(nil, data)
for i := 0; i < attempts; i++ {
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
defer sendDuration.Add(time.Since(timeStart).Seconds())
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
@@ -211,16 +226,41 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
return
}
logger.Warnf("attempt %d to send request failed: %s", i+1, err)
// sleeping to avoid remote db hammering
time.Sleep(time.Second)
continue
_, isNotRetriable := err.(*nonRetriableError)
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
break L
default:
}
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time.Sleep(retryInterval)
retryInterval *= 2
}
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("all %d attempts to send request failed - dropping %d time series",
attempts, len(wr.Timeseries))
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
len(wr.Timeseries))
}
func (c *Client) send(ctx context.Context, data []byte) error {
@@ -249,10 +289,32 @@ func (c *Client) send(ctx context.Context, data []byte) error {
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
body, _ := io.ReadAll(resp.Body)
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp.StatusCode / 100 {
case 2:
// respond with a HTTP 2xx status code when the write is successful.
return nil
case 4:
if resp.StatusCode != http.StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)}
}
fallthrough
default:
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}
return nil
}
type nonRetriableError struct {
err error
}
func (e *nonRetriableError) Error() string {
return e.err.Error()
}

View File

@@ -7,6 +7,7 @@ import (
"math/rand"
"net/http"
"net/http/httptest"
"sync"
"sync/atomic"
"testing"
"time"
@@ -18,15 +19,30 @@ import (
)
func TestClient_Push(t *testing.T) {
oldMinInterval := *retryMinInterval
*retryMinInterval = time.Millisecond * 10
defer func() {
*retryMinInterval = oldMinInterval
}()
testSrv := newRWServer()
cfg := Config{
client, err := NewClient(context.Background(), Config{
Addr: testSrv.URL,
MaxBatchSize: 100,
}
client, err := NewClient(context.Background(), cfg)
})
if err != nil {
t.Fatalf("failed to create client: %s", err)
}
faultySrv := newFaultyRWServer()
faultyClient, err := NewClient(context.Background(), Config{
Addr: faultySrv.URL,
MaxBatchSize: 50,
})
if err != nil {
t.Fatalf("failed to create faulty client: %s", err)
}
r := rand.New(rand.NewSource(1))
const rowsN = 1e4
var sent int
@@ -38,9 +54,16 @@ func TestClient_Push(t *testing.T) {
}},
}
err := client.Push(s)
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
if err == nil {
sent++
}
err = faultyClient.Push(s)
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
}
if sent == 0 {
t.Fatalf("0 series sent")
@@ -48,10 +71,17 @@ func TestClient_Push(t *testing.T) {
if err := client.Close(); err != nil {
t.Fatalf("failed to close client: %s", err)
}
if err := faultyClient.Close(); err != nil {
t.Fatalf("failed to close faulty client: %s", err)
}
got := testSrv.accepted()
if got != sent {
t.Fatalf("expected to have %d series; got %d", sent, got)
}
got = faultySrv.accepted()
if got != sent {
t.Fatalf("expected to have %d series for faulty client; got %d", sent, got)
}
}
func newRWServer() *rwServer {
@@ -117,3 +147,42 @@ func (rw *rwServer) handler(w http.ResponseWriter, r *http.Request) {
atomic.AddUint64(&rw.acceptedRows, uint64(len(wr.Timeseries)))
w.WriteHeader(http.StatusNoContent)
}
// faultyRWServer sometimes respond with 5XX status code
// or just closes the connection. Is used for testing retries.
type faultyRWServer struct {
*rwServer
reqsMu sync.Mutex
reqs int
}
func newFaultyRWServer() *faultyRWServer {
rw := &faultyRWServer{
rwServer: &rwServer{},
}
rw.Server = httptest.NewServer(http.HandlerFunc(rw.handler))
return rw
}
func (frw *faultyRWServer) handler(w http.ResponseWriter, r *http.Request) {
frw.reqsMu.Lock()
reqs := frw.reqs
frw.reqs++
if frw.reqs > 5 {
frw.reqs = 0
}
frw.reqsMu.Unlock()
switch reqs {
case 0, 1, 2, 3:
frw.rwServer.handler(w, r)
case 4:
hj, _ := w.(http.Hijacker)
conn, _, _ := hj.Hijack()
conn.Close()
case 5:
w.WriteHeader(http.StatusInternalServerError)
w.Write([]byte("server overloaded"))
}
}

View File

@@ -22,7 +22,7 @@ var (
replayTo = flag.String("replay.timeTo", "",
"The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'")
replayRulesDelay = flag.Duration("replay.rulesDelay", time.Second,
"Delay between rules evaluation within the group. Could be important if there are chained rules inside of the group"+
"Delay between rules evaluation within the group. Could be important if there are chained rules inside the group "+
"and processing need to wait for previous rule results to be persisted by remote storage before evaluating the next rule."+
"Keep it equal or bigger than -remoteWrite.flushInterval.")
replayMaxDatapoints = flag.Int("replay.maxDatapointsPerQuery", 1e3,

View File

@@ -20,21 +20,21 @@ func (fr *fakeReplayQuerier) BuildWithParams(_ datasource.QuerierParams) datasou
return fr
}
func (fr *fakeReplayQuerier) QueryRange(_ context.Context, q string, from, to time.Time) ([]datasource.Metric, error) {
func (fr *fakeReplayQuerier) QueryRange(_ context.Context, q string, from, to time.Time) (res datasource.Result, err error) {
key := fmt.Sprintf("%s+%s", from.Format("15:04:05"), to.Format("15:04:05"))
dps, ok := fr.registry[q]
if !ok {
return nil, fmt.Errorf("unexpected query received: %q", q)
return res, fmt.Errorf("unexpected query received: %q", q)
}
_, ok = dps[key]
if !ok {
return nil, fmt.Errorf("unexpected time range received: %q", key)
return res, fmt.Errorf("unexpected time range received: %q", key)
}
delete(dps, key)
if len(fr.registry[q]) < 1 {
delete(fr.registry, q)
}
return nil, nil
return res, nil
}
func TestReplay(t *testing.T) {

View File

@@ -53,6 +53,12 @@ type ruleStateEntry struct {
// stores the number of samples returned during
// the last evaluation
samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If seriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
seriesFetched *int
// stores the curl command reflecting the HTTP request used during rule.Exec
curl string
}

View File

@@ -0,0 +1,39 @@
function expandAll() {
$('.collapse').addClass('show');
}
function collapseAll() {
$('.collapse').removeClass('show');
}
function toggleByID(id) {
let el = $("#" + id);
if (el.length > 0) {
el.click();
}
}
$(document).ready(function () {
$(".group-heading a").click(function (e) {
e.stopPropagation(); // prevent collapse logic on link click
let target = $(this).attr('href');
if (target.length > 0) {
toggleByID(target.substr(1));
}
});
$(".group-heading").click(function (e) {
let target = $(this).attr('data-bs-target');
let el = $("#" + target);
new bootstrap.Collapse(el, {
toggle: true
});
});
let hash = window.location.hash.substr(1);
toggleByID(hash);
});
$(document).ready(function () {
$('[data-bs-toggle="tooltip"]').tooltip();
});

View File

@@ -21,7 +21,6 @@ import (
"math"
"net"
"net/url"
"path/filepath"
"regexp"
"sort"
"strconv"
@@ -30,6 +29,8 @@ import (
textTpl "text/template"
"time"
"github.com/bmatcuk/doublestar/v4"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/formatutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
@@ -59,12 +60,12 @@ func Load(pathPatterns []string, overwrite bool) error {
var err error
tmpl := newTemplate()
for _, tp := range pathPatterns {
p, err := filepath.Glob(tp)
p, err := doublestar.FilepathGlob(tp)
if err != nil {
return fmt.Errorf("failed to retrieve a template glob %q: %w", tp, err)
}
if len(p) > 0 {
tmpl, err = tmpl.ParseGlob(tp)
tmpl, err = tmpl.ParseFiles(p...)
if err != nil {
return fmt.Errorf("failed to parse template glob %q: %w", tp, err)
}
@@ -182,6 +183,10 @@ func Get() (*textTpl.Template, error) {
func FuncsWithQuery(query QueryFn) textTpl.FuncMap {
return textTpl.FuncMap{
"query": func(q string) ([]metric, error) {
if query == nil {
return nil, fmt.Errorf("cannot execute query %q: query is not available in this context", q)
}
result, err := query(q)
if err != nil {
return nil, err

View File

@@ -76,6 +76,20 @@ func TestTemplateFuncs(t *testing.T) {
formatting("humanize1024", float64(146521335255970361638912), "124.1Zi")
formatting("humanize1024", float64(150037847302113650318245888), "124.1Yi")
formatting("humanize1024", float64(153638755637364377925883789312), "1.271e+05Yi")
formatting("humanize", float64(127087), "127.1k")
formatting("humanize", float64(136458627186688), "136.5T")
formatting("humanizeDuration", 1, "1s")
formatting("humanizeDuration", 0.2, "200ms")
formatting("humanizeDuration", 42000, "11h 40m 0s")
formatting("humanizeDuration", 16790555, "194d 8h 2m 35s")
formatting("humanizePercentage", 1, "100%")
formatting("humanizePercentage", 0.8, "80%")
formatting("humanizePercentage", 0.015, "1.5%")
formatting("humanizeTimestamp", 1679055557, "2023-03-17 12:19:17 +0000 UTC")
}
func mkTemplate(current, replacement interface{}) textTemplate {

View File

@@ -10,35 +10,7 @@
</main>
<script src="{%s prefix %}static/js/jquery-3.6.0.min.js" type="text/javascript"></script>
<script src="{%s prefix %}static/js/bootstrap.bundle.min.js" type="text/javascript"></script>
<script type="text/javascript">
function expandAll() {
$('.collapse').addClass('show');
}
function collapseAll() {
$('.collapse').removeClass('show');
}
$(document).ready(function() {
// prevent collapse logic on link click
$(".group-heading a").click(function(e) {
e.stopPropagation();
});
$(".group-heading").click(function(e) {
let target = $(this).attr('data-bs-target');
let el = $("#"+target);
new bootstrap.Collapse(el, {
toggle: true
});
});
var hash = window.location.hash.substr(1);
let group = $("#"+hash);
if (group.length > 0) {
group.click();
}
});
</script>
<script src="{%s prefix %}static/js/custom.js" type="text/javascript"></script>
</body>
</html>
{% endfunc %}

View File

@@ -45,63 +45,39 @@ func StreamFooter(qw422016 *qt422016.Writer, r *http.Request) {
qw422016.E().S(prefix)
//line app/vmalert/tpl/footer.qtpl:12
qw422016.N().S(`static/js/bootstrap.bundle.min.js" type="text/javascript"></script>
<script type="text/javascript">
function expandAll() {
$('.collapse').addClass('show');
}
function collapseAll() {
$('.collapse').removeClass('show');
}
$(document).ready(function() {
// prevent collapse logic on link click
$(".group-heading a").click(function(e) {
e.stopPropagation();
});
$(".group-heading").click(function(e) {
let target = $(this).attr('data-bs-target');
let el = $("#"+target);
new bootstrap.Collapse(el, {
toggle: true
});
});
var hash = window.location.hash.substr(1);
let group = $("#"+hash);
if (group.length > 0) {
group.click();
}
});
</script>
<script src="`)
//line app/vmalert/tpl/footer.qtpl:13
qw422016.E().S(prefix)
//line app/vmalert/tpl/footer.qtpl:13
qw422016.N().S(`static/js/custom.js" type="text/javascript"></script>
</body>
</html>
`)
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
}
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
func WriteFooter(qq422016 qtio422016.Writer, r *http.Request) {
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
StreamFooter(qw422016, r)
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
qt422016.ReleaseWriter(qw422016)
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
}
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
func Footer(r *http.Request) string {
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
WriteFooter(qb422016, r)
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
qs422016 := string(qb422016.B)
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
return qs422016
//line app/vmalert/tpl/footer.qtpl:44
//line app/vmalert/tpl/footer.qtpl:16
}

View File

@@ -1,5 +1,4 @@
{% import (
"strings"
"net/http"
"path"
"net/url"
@@ -85,10 +84,7 @@ type NavItem struct {
{% func printNavItems(r *http.Request, current string, items []NavItem) %}
{%code
prefix := "/vmalert/"
if strings.HasPrefix(r.URL.Path, prefix) {
prefix = ""
}
prefix := utils.Prefix(r.URL.Path)
%}
<nav class="navbar navbar-expand-md navbar-dark fixed-top bg-dark">
<div class="container-fluid">

View File

@@ -9,52 +9,51 @@ import (
"net/http"
"net/url"
"path"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
//line app/vmalert/tpl/header.qtpl:10
//line app/vmalert/tpl/header.qtpl:9
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vmalert/tpl/header.qtpl:10
//line app/vmalert/tpl/header.qtpl:9
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
//line app/vmalert/tpl/header.qtpl:10
//line app/vmalert/tpl/header.qtpl:9
func StreamHeader(qw422016 *qt422016.Writer, r *http.Request, navItems []NavItem, title string) {
//line app/vmalert/tpl/header.qtpl:10
//line app/vmalert/tpl/header.qtpl:9
qw422016.N().S(`
`)
//line app/vmalert/tpl/header.qtpl:11
//line app/vmalert/tpl/header.qtpl:10
prefix := utils.Prefix(r.URL.Path)
//line app/vmalert/tpl/header.qtpl:11
//line app/vmalert/tpl/header.qtpl:10
qw422016.N().S(`
<!DOCTYPE html>
<html lang="en">
<head>
<title>vmalert`)
//line app/vmalert/tpl/header.qtpl:15
//line app/vmalert/tpl/header.qtpl:14
if title != "" {
//line app/vmalert/tpl/header.qtpl:15
//line app/vmalert/tpl/header.qtpl:14
qw422016.N().S(` - `)
//line app/vmalert/tpl/header.qtpl:15
//line app/vmalert/tpl/header.qtpl:14
qw422016.E().S(title)
//line app/vmalert/tpl/header.qtpl:15
//line app/vmalert/tpl/header.qtpl:14
}
//line app/vmalert/tpl/header.qtpl:15
//line app/vmalert/tpl/header.qtpl:14
qw422016.N().S(`</title>
<link href="`)
//line app/vmalert/tpl/header.qtpl:16
//line app/vmalert/tpl/header.qtpl:15
qw422016.E().S(prefix)
//line app/vmalert/tpl/header.qtpl:16
//line app/vmalert/tpl/header.qtpl:15
qw422016.N().S(`static/css/bootstrap.min.css" rel="stylesheet" />
<style>
body{
@@ -114,139 +113,136 @@ func StreamHeader(qw422016 *qt422016.Writer, r *http.Request, navItems []NavItem
</head>
<body>
`)
//line app/vmalert/tpl/header.qtpl:74
//line app/vmalert/tpl/header.qtpl:73
streamprintNavItems(qw422016, r, title, navItems)
//line app/vmalert/tpl/header.qtpl:74
//line app/vmalert/tpl/header.qtpl:73
qw422016.N().S(`
<main class="px-2">
`)
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
}
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
func WriteHeader(qq422016 qtio422016.Writer, r *http.Request, navItems []NavItem, title string) {
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
StreamHeader(qw422016, r, navItems, title)
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
qt422016.ReleaseWriter(qw422016)
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
}
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
func Header(r *http.Request, navItems []NavItem, title string) string {
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
WriteHeader(qb422016, r, navItems, title)
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
qs422016 := string(qb422016.B)
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
return qs422016
//line app/vmalert/tpl/header.qtpl:76
//line app/vmalert/tpl/header.qtpl:75
}
//line app/vmalert/tpl/header.qtpl:80
//line app/vmalert/tpl/header.qtpl:79
type NavItem struct {
Name string
Url string
}
//line app/vmalert/tpl/header.qtpl:86
//line app/vmalert/tpl/header.qtpl:85
func streamprintNavItems(qw422016 *qt422016.Writer, r *http.Request, current string, items []NavItem) {
//line app/vmalert/tpl/header.qtpl:86
//line app/vmalert/tpl/header.qtpl:85
qw422016.N().S(`
`)
//line app/vmalert/tpl/header.qtpl:88
prefix := "/vmalert/"
if strings.HasPrefix(r.URL.Path, prefix) {
prefix = ""
}
//line app/vmalert/tpl/header.qtpl:87
prefix := utils.Prefix(r.URL.Path)
//line app/vmalert/tpl/header.qtpl:92
//line app/vmalert/tpl/header.qtpl:88
qw422016.N().S(`
<nav class="navbar navbar-expand-md navbar-dark fixed-top bg-dark">
<div class="container-fluid">
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav me-auto mb-2 mb-md-0">
`)
//line app/vmalert/tpl/header.qtpl:97
//line app/vmalert/tpl/header.qtpl:93
for _, item := range items {
//line app/vmalert/tpl/header.qtpl:97
//line app/vmalert/tpl/header.qtpl:93
qw422016.N().S(`
<li class="nav-item">
`)
//line app/vmalert/tpl/header.qtpl:100
//line app/vmalert/tpl/header.qtpl:96
u, _ := url.Parse(item.Url)
//line app/vmalert/tpl/header.qtpl:101
//line app/vmalert/tpl/header.qtpl:97
qw422016.N().S(`
<a class="nav-link`)
//line app/vmalert/tpl/header.qtpl:102
//line app/vmalert/tpl/header.qtpl:98
if current == item.Name {
//line app/vmalert/tpl/header.qtpl:102
//line app/vmalert/tpl/header.qtpl:98
qw422016.N().S(` active`)
//line app/vmalert/tpl/header.qtpl:102
//line app/vmalert/tpl/header.qtpl:98
}
//line app/vmalert/tpl/header.qtpl:102
//line app/vmalert/tpl/header.qtpl:98
qw422016.N().S(`"
href="`)
//line app/vmalert/tpl/header.qtpl:103
//line app/vmalert/tpl/header.qtpl:99
if u.IsAbs() {
//line app/vmalert/tpl/header.qtpl:103
//line app/vmalert/tpl/header.qtpl:99
qw422016.E().S(item.Url)
//line app/vmalert/tpl/header.qtpl:103
//line app/vmalert/tpl/header.qtpl:99
} else {
//line app/vmalert/tpl/header.qtpl:103
//line app/vmalert/tpl/header.qtpl:99
qw422016.E().S(path.Join(prefix, item.Url))
//line app/vmalert/tpl/header.qtpl:103
//line app/vmalert/tpl/header.qtpl:99
}
//line app/vmalert/tpl/header.qtpl:103
//line app/vmalert/tpl/header.qtpl:99
qw422016.N().S(`">
`)
//line app/vmalert/tpl/header.qtpl:104
//line app/vmalert/tpl/header.qtpl:100
qw422016.E().S(item.Name)
//line app/vmalert/tpl/header.qtpl:104
//line app/vmalert/tpl/header.qtpl:100
qw422016.N().S(`
</a>
</li>
`)
//line app/vmalert/tpl/header.qtpl:107
//line app/vmalert/tpl/header.qtpl:103
}
//line app/vmalert/tpl/header.qtpl:107
//line app/vmalert/tpl/header.qtpl:103
qw422016.N().S(`
</ul>
</div>
</nav>
`)
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
}
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
func writeprintNavItems(qq422016 qtio422016.Writer, r *http.Request, current string, items []NavItem) {
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
streamprintNavItems(qw422016, r, current, items)
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
qt422016.ReleaseWriter(qw422016)
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
}
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
func printNavItems(r *http.Request, current string, items []NavItem) string {
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
writeprintNavItems(qb422016, r, current, items)
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
qs422016 := string(qb422016.B)
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
return qs422016
//line app/vmalert/tpl/header.qtpl:111
//line app/vmalert/tpl/header.qtpl:107
}

View File

@@ -1,13 +1,24 @@
package utils
import "strings"
import (
"net/url"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
)
const prefix = "/vmalert/"
// Prefix returns "/vmalert/" prefix if it is missing in the path.
func Prefix(path string) string {
pp := httpserver.GetPathPrefix()
path = strings.TrimLeft(path, pp)
if strings.HasPrefix(path, prefix) {
return ""
return pp
}
return prefix
res, err := url.JoinPath(pp, prefix)
if err != nil {
return path
}
return res
}

View File

@@ -211,7 +211,7 @@ func (rh *requestHandler) groups() []APIGroup {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
var groups []APIGroup
groups := make([]APIGroup, 0)
for _, g := range rh.m.groups {
groups = append(groups, g.toAPI())
}
@@ -276,6 +276,7 @@ func (rh *requestHandler) listAlerts() ([]byte, error) {
defer rh.m.groupsMu.RUnlock()
lr := listAlertsResponse{Status: "success"}
lr.Data.Alerts = make([]*APIAlert, 0)
for _, g := range rh.m.groups {
for _, r := range g.Rules {
a, ok := r.(*AlertingRule)

View File

@@ -30,102 +30,132 @@
{%= tpl.Footer(r) %}
{% endfunc %}
{% func ListGroups(r *http.Request, groups []APIGroup) %}
{% func buttonActive(filter, expValue string) %}
{% if filter != expValue %}
btn-secondary
{% else %}
btn-primary
{% endif %}
{% endfunc %}
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Groups") %}
{% if len(groups) > 0 %}
{%code
filter := r.URL.Query().Get("filter")
rOk := make(map[string]int)
rNotOk := make(map[string]int)
for _, g := range groups {
rNoMatch := make(map[string]int)
var groups []APIGroup
for _, g := range originGroups {
var rules []APIRule
for _, r := range g.Rules {
if r.LastError != "" {
rNotOk[g.ID]++
} else {
rOk[g.ID]++
}
if isNoMatch(r) {
rNoMatch[g.ID]++
}
if (filter == "unhealthy" && r.LastError == "") ||
(filter == "noMatch" && !isNoMatch(r)) {
continue
}
rules = append(rules, r)
}
if len(rules) > 0 {
g.Rules = rules
groups = append(groups, g)
}
}
%}
<a class="btn {%= buttonActive(filter, "") %}" role="button" onclick="window.location = window.location.pathname">All</a>
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
{% for _, g := range groups %}
<div class="group-heading{% if rNotOk[g.ID] > 0 %} alert-danger{% endif %}" data-bs-target="rules-{%s g.ID %}">
<span class="anchor" id="group-{%s g.ID %}"></span>
<a href="#group-{%s g.ID %}">{%s g.Name %}{% if g.Type != "prometheus" %} ({%s g.Type %}){% endif %} (every {%f.0 g.Interval %}s)</a>
{% if rNotOk[g.ID] > 0 %}<span class="badge bg-danger" title="Number of rules with status Error">{%d rNotOk[g.ID] %}</span> {% endif %}
<span class="badge bg-success" title="Number of rules withs status Ok">{%d rOk[g.ID] %}</span>
<p class="fs-6 fw-lighter">{%s g.File %}</p>
{% if len(g.Params) > 0 %}
<div class="fs-6 fw-lighter">Extra params
{% for _, param := range g.Params %}
<span class="float-left badge bg-primary">{%s param %}</span>
{% endfor %}
</div>
{% endif %}
{% if len(g.Headers) > 0 %}
<div class="fs-6 fw-lighter">Extra headers
{% for _, header := range g.Headers %}
<span class="float-left badge bg-primary">{%s header %}</span>
{% endfor %}
</div>
{% endif %}
</div>
<div class="collapse" id="rules-{%s g.ID %}">
<table class="table table-striped table-hover table-sm">
<thead>
<tr>
<th scope="col" style="width: 60%">Rule</th>
<th scope="col" style="width: 20%" class="text-center" title="How many samples were produced by the rule">Samples</th>
<th scope="col" style="width: 20%" class="text-center" title="How many seconds ago rule was executed">Updated</th>
</tr>
</thead>
<tbody>
{% for _, r := range g.Rules %}
<tr{% if r.LastError != "" %} class="alert-danger"{% endif %}>
<td>
<div class="row">
<div class="col-12 mb-2">
{% if r.Type == "alerting" %}
<b>alert:</b> {%s r.Name %} (for: {%v r.Duration %} seconds)
{% else %}
<b>record:</b> {%s r.Name %}
{% endif %}
| <span><a target="_blank" href="{%s prefix+r.WebLink() %}">Details</a></span>
</div>
<div class="col-12">
<code><pre>{%s r.Query %}</pre></code>
</div>
<div class="col-12 mb-2">
{% if len(r.Labels) > 0 %} <b>Labels:</b>{% endif %}
{% for k, v := range r.Labels %}
<span class="ms-1 badge bg-primary">{%s k %}={%s v %}</span>
{% endfor %}
</div>
{% if r.LastError != "" %}
<div class="col-12">
<b>Error:</b>
<div class="error-cell">
{%s r.LastError %}
<a class="btn {%= buttonActive(filter, "unhealthy") %}" role="button" onclick="location.href='?filter=unhealthy'" title="Show only rules with errors">Unhealthy</a>
<a class="btn {%= buttonActive(filter, "noMatch") %}" role="button" onclick="location.href='?filter=noMatch'" title="Show only rules matching no time series during last evaluation">NoMatch</a>
{% if len(groups) > 0 %}
{% for _, g := range groups %}
<div
class="group-heading{% if rNotOk[g.ID] > 0 %} alert-danger{%endif%}" data-bs-target="rules-{%s g.ID %}">
<span class="anchor" id="group-{%s g.ID %}"></span>
<a href="#group-{%s g.ID %}">{%s g.Name %}{% if g.Type != "prometheus" %} ({%s g.Type %}){% endif %} (every {%f.0 g.Interval %}s) #</a>
{% if rNotOk[g.ID] > 0 %}<span class="badge bg-danger" title="Number of rules with status Error">{%d rNotOk[g.ID] %}</span> {% endif %}
{% if rNoMatch[g.ID] > 0 %}<span class="badge bg-warning" title="Number of rules with status NoMatch">{%d rNoMatch[g.ID] %}</span> {% endif %}
<span class="badge bg-success" title="Number of rules withs status Ok">{%d rOk[g.ID] %}</span>
<p class="fs-6 fw-lighter">{%s g.File %}</p>
{% if len(g.Params) > 0 %}
<div class="fs-6 fw-lighter">Extra params
{% for _, param := range g.Params %}
<span class="float-left badge bg-primary">{%s param %}</span>
{% endfor %}
</div>
{% endif %}
{% if len(g.Headers) > 0 %}
<div class="fs-6 fw-lighter">Extra headers
{% for _, header := range g.Headers %}
<span class="float-left badge bg-primary">{%s header %}</span>
{% endfor %}
</div>
{% endif %}
</div>
<div class="collapse" id="rules-{%s g.ID %}">
<table class="table table-striped table-hover table-sm">
<thead>
<tr>
<th scope="col" style="width: 60%">Rule</th>
<th scope="col" style="width: 20%" class="text-center" title="How many samples were produced by the rule">Samples</th>
<th scope="col" style="width: 20%" class="text-center" title="How many seconds ago rule was executed">Updated</th>
</tr>
</thead>
<tbody>
{% for _, r := range g.Rules %}
<tr{% if r.LastError != "" %} class="alert-danger"{% endif %}>
<td>
<div class="row">
<div class="col-12 mb-2">
{% if r.Type == "alerting" %}
<b>alert:</b> {%s r.Name %} (for: {%v r.Duration %} seconds)
{% else %}
<b>record:</b> {%s r.Name %}
{% endif %}
|
{%= seriesFetchedWarn(r) %}
<span><a target="_blank" href="{%s prefix+r.WebLink() %}">Details</a></span>
</div>
<div class="col-12">
<code><pre>{%s r.Query %}</pre></code>
</div>
<div class="col-12 mb-2">
{% if len(r.Labels) > 0 %} <b>Labels:</b>{% endif %}
{% for k, v := range r.Labels %}
<span class="ms-1 badge bg-primary">{%s k %}={%s v %}</span>
{% endfor %}
</div>
{% if r.LastError != "" %}
<div class="col-12">
<b>Error:</b>
<div class="error-cell">
{%s r.LastError %}
</div>
</div>
{% endif %}
</div>
{% endif %}
</div>
</td>
<td class="text-center">{%d r.LastSamples %}</td>
<td class="text-center">{%f.3 time.Since(r.LastEvaluation).Seconds() %}s ago</td>
</tr>
{% endfor %}
</tbody>
</table>
</td>
<td class="text-center">{%d r.LastSamples %}</td>
<td class="text-center">{%f.3 time.Since(r.LastEvaluation).Seconds() %}s ago</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% endfor %}
{% else %}
<div>
<p>No groups...</p>
</div>
{% endfor %}
{% else %}
<div>
<p>No groups...</p>
</div>
{% endif %}
{% endif %}
{%= tpl.Footer(r) %}
@@ -237,9 +267,9 @@
{%code typeK, ns := keys[i], targets[notifier.TargetType(keys[i])]
count := len(ns)
%}
<div class="group-heading data-bs-target="rules-{%s typeK %}">
<span class="anchor" id="notifiers-{%s typeK %}"></span>
<a href="#notifiers-{%s typeK %}">{%s typeK %} ({%d count %})</a>
<div class="group-heading" data-bs-target="notifiers-{%s typeK %}">
<span class="anchor" id="group-{%s typeK %}"></span>
<a href="#group-{%s typeK %}">{%s typeK %} ({%d count %})</a>
</div>
<div class="collapse show" id="notifiers-{%s typeK %}">
<table class="table table-striped table-hover table-sm">
@@ -377,8 +407,20 @@
annotationKeys = append(annotationKeys, k)
}
sort.Strings(annotationKeys)
var seriesFetchedEnabled bool
var seriesFetchedWarning bool
for _, u := range rule.Updates {
if u.seriesFetched != nil {
seriesFetchedEnabled = true
if *u.seriesFetched == 0 && u.samples == 0{
seriesFetchedWarning = true
}
}
}
%}
<div class="display-6 pb-3 mb-3">Rule: {%s rule.Name %}<span class="ms-2 badge {% if rule.Health!="ok" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s rule.Health %}</span></div>
<div class="display-6 pb-3 mb-3">Rule: {%s rule.Name %}<span class="ms-2 badge {% if rule.Health!="ok" %}bg-danger{% else %} bg-success text-dark{% endif %}">{%s rule.Health %}</span></div>
<div class="container border-bottom p-2">
<div class="row">
<div class="col-2">
@@ -450,12 +492,26 @@
</div>
<br>
{% if seriesFetchedWarning %}
<div class="alert alert-warning" role="alert">
<strong>Warning:</strong> some of updates have "Series fetched" equal to 0.<br>
It might be that either this data is missing in the datasource or there is a typo in rule's expression.
For example, <strong>foo{label="bar"} > 0</strong> could never trigger because <strong>foo{label="bar"}</strong>
metric doesn't exist.
<br>
Rule's expressions without time series selector, like <strong>expr: 42</strong> or <strong>expr: time()</strong>
aren't fetching time series from datasource, so they could have "Series fetched" equal to 0 and this won't be a problem.
<br>
See more details about this detection <a target="_blank" href="https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039">here</a>.
</div>
{% endif %}
<div class="display-6 pb-3">Last {%d len(rule.Updates) %}/{%d rule.MaxUpdates %} updates</span>:</div>
<table class="table table-striped table-hover table-sm">
<thead>
<tr>
<th scope="col" title="The time when event was created">Updated at</th>
<th scope="col" style="width: 10%" class="text-center" title="How many samples were returned">Samples</th>
{% if seriesFetchedEnabled %}<th scope="col" style="width: 10%" class="text-center" title="How many series were scanned by datasource during the evaluation">Series fetched</th>{% endif %}
<th scope="col" style="width: 10%" class="text-center" title="How many seconds request took">Duration</th>
<th scope="col" class="text-center" title="Time used for rule execution">Executed at</th>
<th scope="col" class="text-center" title="cURL command with request example">cURL</th>
@@ -468,7 +524,8 @@
<td>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.time.Format(time.RFC3339) %}</span>
</td>
<td class="text-center" wi>{%d u.samples %}</td>
<td class="text-center">{%d u.samples %}</td>
{% if seriesFetchedEnabled %}<td class="text-center">{% if u.seriesFetched != nil %}{%d *u.seriesFetched %}{% endif %}</td>{% endif %}
<td class="text-center">{%f.3 u.duration.Seconds() %}s</td>
<td class="text-center">{%s u.at.Format(time.RFC3339) %}</td>
<td>
@@ -478,7 +535,7 @@
</li>
{% if u.err != nil %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
<td colspan="5">
<td colspan="{% if seriesFetchedEnabled %}6{%else%}5{%endif%}">
<span class="alert-danger">{%v u.err %}</span>
</td>
</tr>
@@ -503,3 +560,22 @@
{% func badgeRestored() %}
<span class="badge bg-warning text-dark" title="Alert state was restored after the service restart from remote storage">restored</span>
{% endfunc %}
{% func seriesFetchedWarn(r APIRule) %}
{% if isNoMatch(r) %}
<svg xmlns="http://www.w3.org/2000/svg"
data-bs-toggle="tooltip"
title="No match! This rule's last evaluation hasn't selected any time series from the datasource.
It might be that either this data is missing in the datasource or there is a typo in rule's expression.
See more in Details."
width="18" height="18" fill="currentColor" class="bi bi-exclamation-triangle-fill flex-shrink-0 me-2" viewBox="0 0 16 16" role="img" aria-label="Warning:">
<path d="M8 16A8 8 0 1 0 8 0a8 8 0 0 0 0 16zm.93-9.412-1 4.705c-.07.34.029.533.304.533.194 0 .487-.07.686-.246l-.088.416c-.287.346-.92.598-1.465.598-.703 0-1.002-.422-.808-1.319l.738-3.468c.064-.293.006-.399-.287-.47l-.451-.081.082-.381 2.29-.287zM8 5.5a1 1 0 1 1 0-2 1 1 0 0 1 0 2z"/>
</svg>
{% endif %}
{% endfunc %}
{%code
func isNoMatch (r APIRule) bool {
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
}
%}

File diff suppressed because it is too large Load Diff

View File

@@ -7,6 +7,7 @@ import (
"net/http/httptest"
"reflect"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
@@ -19,9 +20,18 @@ func TestHandler(t *testing.T) {
},
state: newRuleState(10),
}
ar.state.add(ruleStateEntry{
time: time.Now(),
at: time.Now(),
samples: 10,
})
rr := &RecordingRule{
Name: "record",
state: newRuleState(10),
}
g := &Group{
Name: "group",
Rules: []Rule{ar},
Rules: []Rule{ar, rr},
}
m := &manager{groups: make(map[uint64]*Group)}
m.groups[0] = g
@@ -62,6 +72,14 @@ func TestHandler(t *testing.T) {
t.Run("/vmalert/rule", func(t *testing.T) {
a := ar.ToAPI()
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
r := rr.ToAPI()
getResp(ts.URL+"/vmalert/"+r.WebLink(), nil, 200)
})
t.Run("/vmalert/alert", func(t *testing.T) {
alerts := ar.AlertsToAPI()
for _, a := range alerts {
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
}
})
t.Run("/vmalert/rule?badParam", func(t *testing.T) {
params := fmt.Sprintf("?%s=0&%s=1", paramGroupID, paramRuleID)
@@ -144,5 +162,81 @@ func TestHandler(t *testing.T) {
t.Run("/api/v1/1/0/status", func(t *testing.T) {
getResp(ts.URL+"/api/v1/1/0/status", nil, 404)
})
}
func TestEmptyResponse(t *testing.T) {
rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*Group)}}
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithNoGroups.handler(w, r) }))
defer ts.Close()
getResp := func(url string, to interface{}, code int) {
t.Helper()
resp, err := http.Get(url)
if err != nil {
t.Fatalf("unexpected err %s", err)
}
if code != resp.StatusCode {
t.Errorf("unexpected status code %d want %d", resp.StatusCode, code)
}
defer func() {
if err := resp.Body.Close(); err != nil {
t.Errorf("err closing body %s", err)
}
}()
if to != nil {
if err = json.NewDecoder(resp.Body).Decode(to); err != nil {
t.Errorf("unexpected err %s", err)
}
}
}
t.Run("no groups /api/v1/alerts", func(t *testing.T) {
lr := listAlertsResponse{}
getResp(ts.URL+"/api/v1/alerts", &lr, 200)
if lr.Data.Alerts == nil {
t.Errorf("expected /api/v1/alerts response to have non-nil data")
}
lr = listAlertsResponse{}
getResp(ts.URL+"/vmalert/api/v1/alerts", &lr, 200)
if lr.Data.Alerts == nil {
t.Errorf("expected /api/v1/alerts response to have non-nil data")
}
})
t.Run("no groups /api/v1/rules", func(t *testing.T) {
lr := listGroupsResponse{}
getResp(ts.URL+"/api/v1/rules", &lr, 200)
if lr.Data.Groups == nil {
t.Errorf("expected /api/v1/rules response to have non-nil data")
}
lr = listGroupsResponse{}
getResp(ts.URL+"/vmalert/api/v1/rules", &lr, 200)
if lr.Data.Groups == nil {
t.Errorf("expected /api/v1/rules response to have non-nil data")
}
})
rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*Group{0: {Name: "test"}}}}
ts.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithEmptyGroup.handler(w, r) })
t.Run("empty group /api/v1/rules", func(t *testing.T) {
lr := listGroupsResponse{}
getResp(ts.URL+"/api/v1/rules", &lr, 200)
if lr.Data.Groups == nil {
t.Fatalf("expected /api/v1/rules response to have non-nil data")
}
lr = listGroupsResponse{}
getResp(ts.URL+"/vmalert/api/v1/rules", &lr, 200)
if lr.Data.Groups == nil {
t.Fatalf("expected /api/v1/rules response to have non-nil data")
}
group := lr.Data.Groups[0]
if group.Rules == nil {
t.Fatalf("expected /api/v1/rules response to have non-nil rules for group")
}
})
}

View File

@@ -72,6 +72,8 @@ type APIGroup struct {
Params []string `json:"params,omitempty"`
// Headers contains HTTP headers added to each Rule's request
Headers []string `json:"headers,omitempty"`
// NotifierHeaders contains HTTP headers added to each alert request which will send to notifier
NotifierHeaders []string `json:"notifier_headers,omitempty"`
// Labels is a set of label value pairs, that will be added to every rule.
Labels map[string]string `json:"labels,omitempty"`
}
@@ -115,7 +117,12 @@ type APIRule struct {
// DatasourceType of the rule: prometheus or graphite
DatasourceType string `json:"datasourceType"`
LastSamples int `json:"lastSamples"`
// LastSamples stores the amount of data samples received on last evaluation
LastSamples int `json:"lastSamples"`
// LastSeriesFetched stores the amount of time series fetched by datasource
// during the last evaluation
LastSeriesFetched *int `json:"lastSeriesFetched,omitempty"`
// ID is a unique Alert's ID within a group
ID string `json:"id"`
// GroupID is an unique Group's ID

View File

@@ -17,7 +17,12 @@ and pass the following flag to `vmauth` binary in order to start authorizing and
After that `vmauth` starts accepting HTTP requests on port `8427` and routing them according to the provided [-auth.config](#auth-config).
The port can be modified via `-httpListenAddr` command-line flag.
The auth config can be reloaded either by passing `SIGHUP` signal to `vmauth` or by querying `/-/reload` http endpoint.
The auth config can be reloaded via the following ways:
- By passing `SIGHUP` signal to `vmauth`.
- By querying `/-/reload` http endpoint. This endpoint can be protected with `-reloadAuthKey` command-line flag. See [security docs](#security) for more details.
- By specifying `-configCheckInterval` command-line flag to the interval between config re-reads. For example, `-configCheckInterval=5s` will re-read the config
and apply new changes every 5 seconds.
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
@@ -57,8 +62,40 @@ The following [metrics](#monitoring) related to concurrency limits are exposed b
- `vmauth_user_concurrent_requests_current{username="..."}` - the current number of concurrent requests for the given `username`.
- `vmauth_user_concurrent_requests_limit_reached_total{username="foo"}` - the number of requests rejected with `429 Too Many Requests` error
because of the concurrency limit has been reached for the given `username`.
- `vmauth_unauthorized_user_concurrent_requests_capacity` - the limit on the number of concurrent requests for unauthorized users (if `unauthorized_user` section is used).
- `vmauth_unauthorized_user_concurrent_requests_current` - the current number of concurrent requests for unauthorized users (if `unauthorized_user` section is used).
- `vmauth_unauthorized_user_concurrent_requests_limit_reached_total` - the number of requests rejected with `429 Too Many Requests` error
because of the concurrency limit has been reached for unauthorized users (if `unauthorized_user` section is used).
## IP filters
[Enterprise version](https://docs.victoriametrics.com/enterprise.html) of `vmauth` can be configured to allow / deny incoming requests via global and per-user IP filters.
For example, the following config allows requests to `vmauth` from `10.0.0.0/24` network and from `1.2.3.4` IP address, while denying requests from `10.0.0.42` IP address:
```yml
users:
# User configs here
ip_filters:
allow_list:
- 10.0.0.0/24
- 1.2.3.4
deny_list: [10.0.0.42]
```
The following config allows requests for the user 'foobar' only from the ip `127.0.0.1`:
```yml
users:
- username: "foobar"
password: "***"
url_prefix: "http://localhost:8428"
ip_filters:
allow_list: [127.0.0.1]
```
## Auth config
`-auth.config` is represented in the following simple `yml` format:
@@ -126,14 +163,22 @@ users:
- "http://vminsert2:8480/insert/42/prometheus"
# A single user for querying and inserting data:
#
# - Requests to http://vmauth:8427/api/v1/query, http://vmauth:8427/api/v1/query_range
# and http://vmauth:8427/api/v1/label/<label_name>/values are proxied to the following urls in a round-robin manner:
# - http://vmselect1:8481/select/42/prometheus
# - http://vmselect2:8481/select/42/prometheus
# For example, http://vmauth:8427/api/v1/query is proxied to http://vmselect1:8480/select/42/prometheus/api/v1/query
# or to http://vmselect2:8480/select/42/prometheus/api/v1/query .
#
# - Requests to http://vmauth:8427/api/v1/write are proxied to http://vminsert:8480/insert/42/prometheus/api/v1/write .
# The "X-Scope-OrgID: abc" http header is added to these requests.
#
# Request which do not match `src_paths` from the `url_map` are proxied to the urls from `default_url`
# in a round-robin manner. The original request path is passed in `request_path` query arg.
# For example, request to http://vmauth:8427/non/existing/path are proxied:
# - to http://default1:8888/unsupported_url_handler?request_path=/non/existing/path
# - or http://default2:8888/unsupported_url_handler?request_path=/non/existing/path
- username: "foobar"
url_map:
- src_paths:
@@ -147,6 +192,28 @@ users:
url_prefix: "http://vminsert:8480/insert/42/prometheus"
headers:
- "X-Scope-OrgID: abc"
ip_filters:
deny_list: [127.0.0.1]
default_url:
- "http://default1:8888/unsupported_url_handler"
- "http://default2:8888/unsupported_url_handler"
# Requests without Authorization header are routed according to `unauthorized_user` section.
unauthorized_user:
url_map:
- src_paths:
- /api/v1/query
- /api/v1/query_range
url_prefix:
- http://vmselect1:8481/select/0/prometheus
- http://vmselect2:8481/select/0/prometheus
ip_filters:
allow_list: [8.8.8.8]
ip_filters:
allow_list: ["1.2.3.0/24", "127.0.0.1"]
deny_list:
- 10.1.0.1
```
The config may contain `%{ENV_VAR}` placeholders, which are substituted by the corresponding `ENV_VAR` environment variable values.
@@ -169,12 +236,14 @@ Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable
Alternatively, [https termination proxy](https://en.wikipedia.org/wiki/TLS_termination_proxy) may be put in front of `vmauth`.
It is recommended protecting following endpoints with authKeys:
It is recommended protecting the following endpoints with authKeys:
* `/-/reload` with `-reloadAuthKey` command-line flag, so external users couldn't trigger config reload.
* `/flags` with `-flagsAuthkey` command-line flag, so unauthorized users couldn't get application command-line flags.
* `/metrics` with `metricsAuthkey` command-line flag, so unauthorized users couldn't get access to [vmauth metrics](#monitoring).
* `/debug/pprof` with `pprofAuthKey` command-line flag, so unauthorized users couldn't get access to [profiling information](#profiling).
`vmauth` also supports the ability to restict access by IP - see [these docs](#ip-filters). See also [concurrency limiting docs](#concurrency-limiting).
## Monitoring
`vmauth` exports various metrics in Prometheus exposition format at `http://vmauth-host:8427/metrics` page. It is recommended setting up regular scraping of this page
@@ -189,6 +258,8 @@ users:
# other config options here
```
For unauthorized users `vmauth` exports `vmauth_unauthorized_user_requests_total` metric without label (if `unauthorized_user` section of config is used).
## How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmauth` is located in `vmutils-*` archives there.
@@ -260,8 +331,10 @@ See the docs at https://docs.victoriametrics.com/vmauth.html .
-auth.config string
Path to auth config. It can point either to local file or to http url. See https://docs.victoriametrics.com/vmauth.html for details on the format of this auth config
-configCheckInterval duration
Interval for config file re-read. Zero value disables config re-reading. By default, refreshing is disabled, send SIGHUP for config refresh.
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
-envflag.prefix string
@@ -271,11 +344,11 @@ See the docs at https://docs.victoriametrics.com/vmauth.html .
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration

View File

@@ -11,6 +11,7 @@ import (
"strings"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
@@ -24,11 +25,14 @@ import (
var (
authConfigPath = flag.String("auth.config", "", "Path to auth config. It can point either to local file or to http url. "+
"See https://docs.victoriametrics.com/vmauth.html for details on the format of this auth config")
configCheckInterval = flag.Duration("configCheckInterval", 0, "interval for config file re-read. "+
"Zero value disables config re-reading. By default, refreshing is disabled, send SIGHUP for config refresh.")
)
// AuthConfig represents auth config.
type AuthConfig struct {
Users []UserInfo `yaml:"users,omitempty"`
Users []UserInfo `yaml:"users,omitempty"`
UnauthorizedUser *UserInfo `yaml:"unauthorized_user,omitempty"`
}
// UserInfo is user information read from authConfigPath
@@ -41,6 +45,7 @@ type UserInfo struct {
URLMaps []URLMap `yaml:"url_map,omitempty"`
Headers []Header `yaml:"headers,omitempty"`
MaxConcurrentRequests int `yaml:"max_concurrent_requests,omitempty"`
DefaultURL *URLPrefix `yaml:"default_url,omitempty"`
concurrencyLimitCh chan struct{}
concurrencyLimitReached *metrics.Counter
@@ -286,11 +291,11 @@ func initAuthConfig() {
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
sighupCh := procutil.NewSighupChan()
m, err := readAuthConfig(*authConfigPath)
err := loadAuthConfig()
if err != nil {
logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
logger.Fatalf("cannot load auth config: %s", err)
}
authConfig.Store(m)
stopCh = make(chan struct{})
authConfigWG.Add(1)
go func() {
@@ -305,53 +310,90 @@ func stopAuthConfig() {
}
func authConfigReloader(sighupCh <-chan os.Signal) {
var refreshCh <-chan time.Time
// initialize auth refresh interval
if *configCheckInterval > 0 {
ticker := time.NewTicker(*configCheckInterval)
defer ticker.Stop()
refreshCh = ticker.C
}
for {
select {
case <-stopCh:
return
case <-refreshCh:
procutil.SelfSIGHUP()
case <-sighupCh:
logger.Infof("SIGHUP received; loading -auth.config=%q", *authConfigPath)
m, err := readAuthConfig(*authConfigPath)
err := loadAuthConfig()
if err != nil {
logger.Errorf("failed to load -auth.config=%q; using the last successfully loaded config; error: %s", *authConfigPath, err)
logger.Errorf("failed to load auth config; using the last successfully loaded config; error: %s", err)
continue
}
authConfig.Store(m)
logger.Infof("Successfully reloaded -auth.config=%q", *authConfigPath)
}
}
}
var authConfig atomic.Value
var authConfig atomic.Pointer[AuthConfig]
var authUsers atomic.Pointer[map[string]*UserInfo]
var authConfigWG sync.WaitGroup
var stopCh chan struct{}
func readAuthConfig(path string) (map[string]*UserInfo, error) {
func loadAuthConfig() error {
ac, err := readAuthConfig(*authConfigPath)
if err != nil {
return fmt.Errorf("failed to load -auth.config=%q: %s", *authConfigPath, err)
}
m, err := parseAuthConfigUsers(ac)
if err != nil {
return fmt.Errorf("failed to parse users from -auth.config=%q: %s", *authConfigPath, err)
}
logger.Infof("loaded information about %d users from -auth.config=%q", len(m), *authConfigPath)
authConfig.Store(ac)
authUsers.Store(&m)
return nil
}
func readAuthConfig(path string) (*AuthConfig, error) {
data, err := fs.ReadFileOrHTTP(path)
if err != nil {
return nil, err
}
m, err := parseAuthConfig(data)
if err != nil {
return nil, fmt.Errorf("cannot parse %q: %w", path, err)
}
logger.Infof("Loaded information about %d users from %q", len(m), path)
return m, nil
return parseAuthConfig(data)
}
func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
var err error
data, err = envtemplate.ReplaceBytes(data)
func parseAuthConfig(data []byte) (*AuthConfig, error) {
data, err := envtemplate.ReplaceBytes(data)
if err != nil {
return nil, fmt.Errorf("cannot expand environment vars: %w", err)
}
var ac AuthConfig
if err := yaml.UnmarshalStrict(data, &ac); err != nil {
if err = yaml.UnmarshalStrict(data, &ac); err != nil {
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %w", err)
}
ui := ac.UnauthorizedUser
if ui != nil {
ui.requests = metrics.GetOrCreateCounter(`vmauth_unauthorized_user_requests_total`)
ui.concurrencyLimitCh = make(chan struct{}, ui.getMaxConcurrentRequests())
ui.concurrencyLimitReached = metrics.GetOrCreateCounter(`vmauth_unauthorized_user_concurrent_requests_limit_reached_total`)
_ = metrics.GetOrCreateGauge(`vmauth_unauthorized_user_concurrent_requests_capacity`, func() float64 {
return float64(cap(ui.concurrencyLimitCh))
})
_ = metrics.GetOrCreateGauge(`vmauth_unauthorized_user_concurrent_requests_current`, func() float64 {
return float64(len(ui.concurrencyLimitCh))
})
}
return &ac, nil
}
func parseAuthConfigUsers(ac *AuthConfig) (map[string]*UserInfo, error) {
uis := ac.Users
if len(uis) == 0 {
return nil, fmt.Errorf("`users` section cannot be empty in AuthConfig")
if len(uis) == 0 && ac.UnauthorizedUser == nil {
return nil, fmt.Errorf("Missing `users` or `unauthorized_user` sections")
}
byAuthToken := make(map[string]*UserInfo, len(uis))
for i := range uis {
@@ -374,6 +416,11 @@ func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
return nil, err
}
}
if ui.DefaultURL != nil {
if err := ui.DefaultURL.sanitize(); err != nil {
return nil, err
}
}
for _, e := range ui.URLMaps {
if len(e.SrcPaths) == 0 {
return nil, fmt.Errorf("missing `src_paths` in `url_map`")

View File

@@ -13,7 +13,11 @@ import (
func TestParseAuthConfigFailure(t *testing.T) {
f := func(s string) {
t.Helper()
_, err := parseAuthConfig([]byte(s))
ac, err := parseAuthConfig([]byte(s))
if err != nil {
return
}
_, err = parseAuthConfigUsers(ac)
if err == nil {
t.Fatalf("expecting non-nil error")
}
@@ -202,7 +206,11 @@ users:
func TestParseAuthConfigSuccess(t *testing.T) {
f := func(s string, expectedAuthConfig map[string]*UserInfo) {
t.Helper()
m, err := parseAuthConfig([]byte(s))
ac, err := parseAuthConfig([]byte(s))
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
m, err := parseAuthConfigUsers(ac)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
@@ -352,6 +360,83 @@ users:
URLPrefix: mustParseURL("https://bar/x"),
},
})
// with default url
f(`
users:
- bearer_token: foo
url_map:
- src_paths: ["/api/v1/query","/api/v1/query_range","/api/v1/label/[^./]+/.+"]
url_prefix: http://vmselect/select/0/prometheus
- src_paths: ["/api/v1/write"]
url_prefix: ["http://vminsert1/insert/0/prometheus","http://vminsert2/insert/0/prometheus"]
headers:
- "foo: bar"
- "xxx: y"
default_url:
- http://default1/select/0/prometheus
- http://default2/select/0/prometheus
`, map[string]*UserInfo{
getAuthToken("foo", "", ""): {
BearerToken: "foo",
URLMaps: []URLMap{
{
SrcPaths: getSrcPaths([]string{"/api/v1/query", "/api/v1/query_range", "/api/v1/label/[^./]+/.+"}),
URLPrefix: mustParseURL("http://vmselect/select/0/prometheus"),
},
{
SrcPaths: getSrcPaths([]string{"/api/v1/write"}),
URLPrefix: mustParseURLs([]string{
"http://vminsert1/insert/0/prometheus",
"http://vminsert2/insert/0/prometheus",
}),
Headers: []Header{
{
Name: "foo",
Value: "bar",
},
{
Name: "xxx",
Value: "y",
},
},
},
},
DefaultURL: mustParseURLs([]string{
"http://default1/select/0/prometheus",
"http://default2/select/0/prometheus",
}),
},
getAuthToken("", "foo", ""): {
BearerToken: "foo",
URLMaps: []URLMap{
{
SrcPaths: getSrcPaths([]string{"/api/v1/query", "/api/v1/query_range", "/api/v1/label/[^./]+/.+"}),
URLPrefix: mustParseURL("http://vmselect/select/0/prometheus"),
},
{
SrcPaths: getSrcPaths([]string{"/api/v1/write"}),
URLPrefix: mustParseURLs([]string{
"http://vminsert1/insert/0/prometheus",
"http://vminsert2/insert/0/prometheus",
}),
Headers: []Header{
{
Name: "foo",
Value: "bar",
},
{
Name: "xxx",
Value: "y",
},
},
},
},
DefaultURL: mustParseURLs([]string{
"http://default1/select/0/prometheus",
"http://default2/select/0/prometheus",
}),
},
})
}

View File

@@ -60,14 +60,22 @@ users:
- "http://vminsert2:8480/insert/42/prometheus"
# A single user for querying and inserting data:
#
# - Requests to http://vmauth:8427/api/v1/query, http://vmauth:8427/api/v1/query_range
# and http://vmauth:8427/api/v1/label/<label_name>/values are proxied to the following urls in a round-robin manner:
# - http://vmselect1:8481/select/42/prometheus
# - http://vmselect2:8481/select/42/prometheus
# For example, http://vmauth:8427/api/v1/query is proxied to http://vmselect1:8480/select/42/prometheus/api/v1/query
# or to http://vmselect2:8480/select/42/prometheus/api/v1/query .
#
# - Requests to http://vmauth:8427/api/v1/write are proxied to http://vminsert:8480/insert/42/prometheus/api/v1/write .
# The "X-Scope-OrgID: abc" http header is added to these requests.
#
# Request which do not match `src_paths` from the `url_map` are proxied to the urls from `default_url`
# in a round-robin manner. The original request path is passed in `request_path` query arg.
# For example, request to http://vmauth:8427/non/existing/path are proxied:
# - to http://default1:8888/unsupported_url_handler?request_path=/non/existing/path
# - or http://default2:8888/unsupported_url_handler?request_path=/non/existing/path
- username: "foobar"
url_map:
- src_paths:
@@ -81,3 +89,25 @@ users:
url_prefix: "http://vminsert:8480/insert/42/prometheus"
headers:
- "X-Scope-OrgID: abc"
ip_filters:
deny_list: [127.0.0.1]
default_url:
- "http://default1:8888/unsupported_url_handler"
- "http://default2:8888/unsupported_url_handler"
# Requests without Authorization header are routed according to `unauthorized_user` section.
unauthorized_user:
url_map:
- src_paths:
- /api/v1/query
- /api/v1/query_range
url_prefix:
- http://vmselect1:8481/select/0/prometheus
- http://vmselect2:8481/select/0/prometheus
ip_filters:
allow_list: [8.8.8.8]
ip_filters:
allow_list: ["1.2.3.0/24", "127.0.0.1"]
deny_list:
- 10.1.0.1

View File

@@ -84,6 +84,13 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
}
authToken := r.Header.Get("Authorization")
if authToken == "" {
// Process requests for unauthorized users
ui := authConfig.Load().UnauthorizedUser
if ui != nil {
processUserRequest(w, r, ui)
return true
}
w.Header().Set("WWW-Authenticate", `Basic realm="Restricted"`)
http.Error(w, "missing `Authorization` request header", http.StatusUnauthorized)
return true
@@ -94,22 +101,28 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
authToken = strings.Replace(authToken, "Token", "Bearer", 1)
}
ac := authConfig.Load().(map[string]*UserInfo)
ac := *authUsers.Load()
ui := ac[authToken]
if ui == nil {
invalidAuthTokenRequests.Inc()
err := fmt.Errorf("cannot find the provided auth token %q in config", authToken)
if *logInvalidAuthTokens {
err := fmt.Errorf("cannot find the provided auth token %q in config", authToken)
err = &httpserver.ErrorWithStatusCode{
Err: err,
StatusCode: http.StatusUnauthorized,
}
httpserver.Errorf(w, r, "%s", err)
} else {
http.Error(w, err.Error(), http.StatusUnauthorized)
http.Error(w, "Unauthorized", http.StatusUnauthorized)
}
return true
}
processUserRequest(w, r, ui)
return true
}
func processUserRequest(w http.ResponseWriter, r *http.Request, ui *UserInfo) {
ui.requests.Inc()
// Limit the concurrency of requests to backends
@@ -119,31 +132,48 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
if err := ui.beginConcurrencyLimit(); err != nil {
handleConcurrencyLimitError(w, r, err)
<-concurrencyLimitCh
return true
return
}
default:
concurrentRequestsLimitReached.Inc()
err := fmt.Errorf("cannot serve more than -maxConcurrentRequests=%d concurrent requests", cap(concurrencyLimitCh))
handleConcurrencyLimitError(w, r, err)
return true
return
}
processRequest(w, r, ui)
ui.endConcurrencyLimit()
<-concurrencyLimitCh
return true
}
func processRequest(w http.ResponseWriter, r *http.Request, ui *UserInfo) {
u := normalizeURL(r.URL)
up, headers, err := ui.getURLPrefixAndHeaders(u)
if err != nil {
httpserver.Errorf(w, r, "cannot determine targetURL: %s", err)
return
up, headers := ui.getURLPrefixAndHeaders(u)
isDefault := false
if up == nil {
missingRouteRequests.Inc()
if ui.DefaultURL == nil {
httpserver.Errorf(w, r, "missing route for %q", u.String())
return
}
up, headers = ui.DefaultURL, ui.Headers
isDefault = true
}
r.Body = &readTrackingBody{
r: r.Body,
}
maxAttempts := up.getBackendsCount()
for i := 0; i < maxAttempts; i++ {
bu := up.getLeastLoadedBackendURL()
targetURL := mergeURLs(bu.url, u)
targetURL := bu.url
// Don't change path and add request_path query param for default route.
if isDefault {
query := targetURL.Query()
query.Set("request_path", u.Path)
targetURL.RawQuery = query.Encode()
} else { // Update path for regular routes.
targetURL = mergeURLs(targetURL, u)
}
ok := tryProcessingRequest(w, r, targetURL, headers)
bu.put()
if ok {
@@ -151,7 +181,7 @@ func processRequest(w http.ResponseWriter, r *http.Request, ui *UserInfo) {
}
bu.setBroken()
}
err = &httpserver.ErrorWithStatusCode{
err := &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf("all the backends for the user %q are unavailable", ui.name()),
StatusCode: http.StatusServiceUnavailable,
}
@@ -168,11 +198,10 @@ func tryProcessingRequest(w http.ResponseWriter, r *http.Request, targetURL *url
transportOnce.Do(transportInit)
res, err := transport.RoundTrip(req)
if err != nil {
remoteAddr := httpserver.GetQuotedRemoteAddr(r)
requestURI := httpserver.GetRequestURI(r)
if r.Method == http.MethodPost || r.Method == http.MethodPut {
// It is impossible to retry POST and PUT requests,
// since we already proxied the request body to the backend.
rtb := req.Body.(*readTrackingBody)
if rtb.readStarted {
// Request body has been already read, so it is impossible to retry the request.
// Return the error to the client then.
err = &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf("cannot proxy the request to %q: %w", targetURL, err),
StatusCode: http.StatusServiceUnavailable,
@@ -180,7 +209,11 @@ func tryProcessingRequest(w http.ResponseWriter, r *http.Request, targetURL *url
httpserver.Errorf(w, r, "%s", err)
return true
}
logger.Warnf("remoteAddr: %s; requestURI: %s; error when proxying the request to %q: %s", remoteAddr, requestURI, targetURL, err)
// Retry the request if its body wasn't read yet. This usually means that the backend isn't reachable.
remoteAddr := httpserver.GetQuotedRemoteAddr(r)
// NOTE: do not use httpserver.GetRequestURI
// it explicitly reads request body and fails retries.
logger.Warnf("remoteAddr: %s; requestURI: %s; error when proxying the request to %q: %s", remoteAddr, req.URL, targetURL, err)
return false
}
removeHopHeaders(res.Header)
@@ -191,7 +224,6 @@ func tryProcessingRequest(w http.ResponseWriter, r *http.Request, targetURL *url
copyBuf.B = bytesutil.ResizeNoCopyNoOverallocate(copyBuf.B, 16*1024)
_, err = io.CopyBuffer(w, res.Body, copyBuf.B)
copyBufPool.Put(copyBuf)
_ = res.Body.Close()
if err != nil && !netutil.IsTrivialNetworkError(err) {
remoteAddr := httpserver.GetQuotedRemoteAddr(r)
requestURI := httpserver.GetRequestURI(r)
@@ -323,3 +355,28 @@ func handleConcurrencyLimitError(w http.ResponseWriter, r *http.Request, err err
}
httpserver.Errorf(w, r, "%s", err)
}
type readTrackingBody struct {
r io.ReadCloser
readStarted bool
}
// Read implements io.Reader interface
// tracks body reading requests
func (rtb *readTrackingBody) Read(p []byte) (int, error) {
if len(p) > 0 {
rtb.readStarted = true
}
return rtb.r.Read(p)
}
// Close implements io.Closer interface.
func (rtb *readTrackingBody) Close() error {
// Close rtb.r only if at least a single Read call was performed.
// http.Roundtrip performs body.Close call even without any Read calls
// so this hack allows us to reuse request body
if rtb.readStarted {
return rtb.r.Close()
}
return nil
}

View File

@@ -1,7 +1,6 @@
package main
import (
"fmt"
"net/url"
"path"
"strings"
@@ -30,19 +29,18 @@ func mergeURLs(uiURL, requestURI *url.URL) *url.URL {
return &targetURL
}
func (ui *UserInfo) getURLPrefixAndHeaders(u *url.URL) (*URLPrefix, []Header, error) {
func (ui *UserInfo) getURLPrefixAndHeaders(u *url.URL) (*URLPrefix, []Header) {
for _, e := range ui.URLMaps {
for _, sp := range e.SrcPaths {
if sp.match(u.Path) {
return e.URLPrefix, e.Headers, nil
return e.URLPrefix, e.Headers
}
}
}
if ui.URLPrefix != nil {
return ui.URLPrefix, ui.Headers, nil
return ui.URLPrefix, ui.Headers
}
missingRouteRequests.Inc()
return nil, nil, fmt.Errorf("missing route for %q", u.String())
return nil, nil
}
func normalizeURL(uOrig *url.URL) *url.URL {

View File

@@ -14,9 +14,9 @@ func TestCreateTargetURLSuccess(t *testing.T) {
t.Fatalf("cannot parse %q: %s", requestURI, err)
}
u = normalizeURL(u)
up, headers, err := ui.getURLPrefixAndHeaders(u)
if err != nil {
t.Fatalf("unexpected error: %s", err)
up, headers := ui.getURLPrefixAndHeaders(u)
if up == nil {
t.Fatalf("cannot determie backend: %s", err)
}
bu := up.getLeastLoadedBackendURL()
target := mergeURLs(bu.url, u)
@@ -124,10 +124,7 @@ func TestCreateTargetURLFailure(t *testing.T) {
t.Fatalf("cannot parse %q: %s", requestURI, err)
}
u = normalizeURL(u)
up, headers, err := ui.getURLPrefixAndHeaders(u)
if err == nil {
t.Fatalf("expecting non-nil error")
}
up, headers := ui.getURLPrefixAndHeaders(u)
if up != nil {
t.Fatalf("unexpected non-empty up=%#v", up)
}

View File

@@ -39,6 +39,9 @@ vmbackup-freebsd-amd64-prod:
vmbackup-openbsd-amd64-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-openbsd-amd64
vmbackup-windows-amd64-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-windows-amd64
package-vmbackup:
APP_NAME=vmbackup $(MAKE) package-via-docker
@@ -93,5 +96,8 @@ vmbackup-freebsd-amd64:
vmbackup-openbsd-amd64:
APP_NAME=vmbackup CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmbackup-windows-amd64:
GOARCH=amd64 APP_NAME=vmbackup $(MAKE) app-local-windows-goarch
vmbackup-pure:
APP_NAME=vmbackup $(MAKE) app-local-pure

View File

@@ -82,7 +82,7 @@ The command will upload only changed data to `gs://<bucket>/latest`.
Where `<daily-snapshot>` is the snapshot for the last day `<YYYYMMDD>`.
This apporach saves network bandwidth costs on hourly backups (since they are incremental) and allows recovering data from either the last hour (`latest` backup)
This approach saves network bandwidth costs on hourly backups (since they are incremental) and allows recovering data from either the last hour (`latest` backup)
or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when creating daily backup.
Do not forget to remove old backups when they are no longer needed in order to save storage costs.
@@ -103,7 +103,7 @@ The backup algorithm is the following:
7. Delete the created snapshot.
The algorithm splits source files into 1 GiB chunks in the backup. Each chunk is stored as a separate file in the backup.
Such splitting balances between the number of files in the backup and the amounts of data that needs to be re-transfered after temporary errors.
Such splitting balances between the number of files in the backup and the amounts of data that needs to be re-transferred after temporary errors.
`vmbackup` relies on [instant snapshot](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282) properties:
@@ -190,7 +190,7 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
Where to put the backup on the remote storage. Example: gs://bucket/path/to/backup, s3://bucket/path/to/backup, azblob://container/path/to/backup or fs:///path/to/local/backup/dir
-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
-envflag.prefix string
@@ -200,11 +200,11 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -255,7 +255,7 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
-pushmetrics.interval duration
Interval for pushing metrics to -pushmetrics.url (default 10s)
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default metrics exposed at /metrics page aren't pushed to any remote storage
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-s3ForcePathStyle
Prefixing endpoint with bucket name when set false, true by default. (default true)

View File

@@ -49,6 +49,12 @@ func main() {
logger.Init()
pushmetrics.Init()
// Storing snapshot delete function to be able to call it in case
// of error since logger.Fatal will exit the program without
// calling deferred functions.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2055
deleteSnapshot := func() {}
if len(*snapshotCreateURL) > 0 {
// create net/url object
createURL, err := url.Parse(*snapshotCreateURL)
@@ -80,32 +86,48 @@ func main() {
logger.Fatalf("cannot set snapshotName flag: %v", err)
}
defer func() {
deleteSnapshot = func() {
err := snapshot.Delete(deleteURL.String(), name)
if err != nil {
logger.Fatalf("cannot delete snapshot: %s", err)
}
}()
}
} else if len(*snapshotName) == 0 {
logger.Fatalf("`-snapshotName` or `-snapshot.createURL` must be provided")
}
if err := snapshot.Validate(*snapshotName); err != nil {
logger.Fatalf("invalid -snapshotName=%q: %s", *snapshotName, err)
}
go httpserver.Serve(*httpListenAddr, false, nil)
err := makeBackup()
deleteSnapshot()
if err != nil {
logger.Fatalf("cannot create backup: %s", err)
}
startTime := time.Now()
logger.Infof("gracefully shutting down http server for metrics at %q", *httpListenAddr)
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop http server for metrics: %s", err)
}
logger.Infof("successfully shut down http server for metrics in %.3f seconds", time.Since(startTime).Seconds())
}
func makeBackup() error {
if err := snapshot.Validate(*snapshotName); err != nil {
return fmt.Errorf("invalid -snapshotName=%q: %s", *snapshotName, err)
}
srcFS, err := newSrcFS()
if err != nil {
logger.Fatalf("%s", err)
return err
}
dstFS, err := newDstFS()
if err != nil {
logger.Fatalf("%s", err)
return err
}
originFS, err := newOriginFS()
if err != nil {
logger.Fatalf("%s", err)
return err
}
a := &actions.Backup{
Concurrency: *concurrency,
@@ -114,18 +136,12 @@ func main() {
Origin: originFS,
}
if err := a.Run(); err != nil {
logger.Fatalf("cannot create backup: %s", err)
return err
}
srcFS.MustStop()
dstFS.MustStop()
originFS.MustStop()
startTime := time.Now()
logger.Infof("gracefully shutting down http server for metrics at %q", *httpListenAddr)
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop http server for metrics: %s", err)
}
logger.Infof("successfully shut down http server for metrics in %.3f seconds", time.Since(startTime).Seconds())
return nil
}
func usage() {
@@ -139,7 +155,7 @@ See the docs at https://docs.victoriametrics.com/vmbackup.html .
}
func newSrcFS() (*fslocal.FS, error) {
snapshotPath := *storageDataPath + "/snapshots/" + *snapshotName
snapshotPath := filepath.Join(*storageDataPath, "snapshots", *snapshotName)
// Verify the snapshot exists.
f, err := os.Open(snapshotPath)

View File

@@ -1,4 +1,4 @@
## vmbackupmanager
# vmbackupmanager
***vmbackupmanager is a part of [enterprise package](https://docs.victoriametrics.com/enterprise.html). It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)***
@@ -48,7 +48,7 @@ Backup manager uploads only the data that has been changed or created since the
This reduces the consumed network traffic and the time needed for performing the backup.
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for details.
*Please take into account that the first backup upload could take a significant amount of time as it needs to upload all of the data.*
*Please take into account that the first backup upload could take a significant amount of time as it needs to upload all the data.*
There are two flags which could help with performance tuning:
@@ -104,11 +104,11 @@ The result on the GCS bucket
* The root folder
![root](vmbackupmanager_root_folder.png)
<img alt="root folder" src="vmbackupmanager_root_folder.png">
* The latest folder
![latest](vmbackupmanager_latest_folder.png)
<img alt="latest folder" src="vmbackupmanager_latest_folder.png">
## Backup Retention Policy
@@ -123,7 +123,7 @@ Backup retention policy is controlled by:
Lets assume we have a backup manager collecting daily backups for the past 10 days.
![daily](vmbackupmanager_rp_daily_1.png)
<img alt="retention policy daily before retention cycle" src="vmbackupmanager_rp_daily_1.png">
We enable backup retention policy for backup manager by using following configuration:
@@ -148,8 +148,32 @@ info app/vmbackupmanager/retention.go:106 daily backups to delete [daily/2
The result on the GCS bucket. We see only 3 daily backups:
![daily](vmbackupmanager_rp_daily_2.png)
<img alt="retention policy daily after retention cycle" src="vmbackupmanager_rp_daily_2.png">
### Protection backups against deletion by retention policy
You can protect any backup against deletion by retention policy with the `vmbackupmanager backups lock` command.
For instance:
```console
./vmbackupmanager backup lock daily/2021-02-13 -dst=<DST_PATH> -storageDataPath=/vmstorage-data -eula
```
After that the backup won't be deleted by retention policy.
You can view the `locked` attribute in backup list:
```console
./vmbackupmanager backup list -dst=<DST_PATH> -storageDataPath=/vmstorage-data -eula
```
To remove protection, you can use the command `vmbackupmanager backups unlock`.
For example:
```console
./vmbackupmanager backup unlock daily/2021-02-13 -dst=<DST_PATH> -storageDataPath=/vmstorage-data -eula
```
## API methods
@@ -158,7 +182,24 @@ The result on the GCS bucket. We see only 3 daily backups:
* GET `/api/v1/backups` - returns list of backups in remote storage.
Example output:
```json
[{"name":"daily/2022-11-30","size_bytes":26664689,"size":"25.429Mi"},{"name":"daily/2022-12-01","size_bytes":40160965,"size":"38.300Mi"},{"name":"hourly/2022-11-30:12","size_bytes":5846529,"size":"5.576Mi"},{"name":"hourly/2022-11-30:13","size_bytes":17651847,"size":"16.834Mi"},{"name":"hourly/2022-11-30:13:22","size_bytes":8797831,"size":"8.390Mi"},{"name":"hourly/2022-11-30:14","size_bytes":10680454,"size":"10.186Mi"}]
[{"name":"daily/2023-04-07","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:07+00:00"},{"name":"hourly/2023-04-07:11","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:06+00:00"},{"name":"latest","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:04+00:00"},{"name":"monthly/2023-04","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:10+00:00"},{"name":"weekly/2023-14","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:09+00:00"}]
```
> Note: `created_at` field is in RFC3339 format.
* GET `/api/v1/backups/<BACKUP_NAME>` - returns backup info by name.
Example output:
```json
{"name":"daily/2023-04-07","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:07+00:00","locked":true}
```
* PUT `/api/v1/backups/<BACKUP_NAME>` - update "locked" attribute for backup by name.
Example request body:
```json
{"locked":true}
```
Example response:
```json
{"name":"daily/2023-04-07","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:07+00:00", "locked": true}
```
* POST `/api/v1/restore` - saves backup name to restore when [performing restore](#restore-commands).
@@ -186,7 +227,13 @@ vmbackupmanager backup
vmbackupmanager backup list
List backups in remote storage
vmbackupmanager restore
vmbackupmanager backup lock
Locks backup in remote storage against deletion
vmbackupmanager backup unlock
Unlocks backup in remote storage for deletion
vmbackupmanager restore
Restore backup specified by restore mark if it exists
vmbackupmanager restore get
@@ -211,7 +258,7 @@ It can be changed by using flag:
`vmbackupmanager backup list` lists backups in remote storage:
```console
$ ./vmbackupmanager backup list
[{"name":"daily/2022-11-30","size_bytes":26664689,"size":"25.429Mi"},{"name":"daily/2022-12-01","size_bytes":40160965,"size":"38.300Mi"},{"name":"hourly/2022-11-30:12","size_bytes":5846529,"size":"5.576Mi"},{"name":"hourly/2022-11-30:13","size_bytes":17651847,"size":"16.834Mi"},{"name":"hourly/2022-11-30:13:22","size_bytes":8797831,"size":"8.390Mi"},{"name":"hourly/2022-11-30:14","size_bytes":10680454,"size":"10.186Mi"}]
[{"name":"daily/2023-04-07","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:07+00:00"},{"name":"hourly/2023-04-07:11","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:06+00:00"},{"name":"latest","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:04+00:00"},{"name":"monthly/2023-04","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:10+00:00"},{"name":"weekly/2023-14","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:09+00:00"}]
```
### Restore commands
@@ -249,16 +296,16 @@ If restore mark doesn't exist at `storageDataPath`(restore wasn't requested) `vm
1. Run `vmbackupmanager backup list` to get list of available backups:
```console
$ /vmbackupmanager-prod backup list
["daily/2022-10-06","daily/2022-10-10","hourly/2022-10-04:13","hourly/2022-10-06:12","hourly/2022-10-06:13","hourly/2022-10-10:14","hourly/2022-10-10:16","monthly/2022-10","weekly/2022-40","weekly/2022-41"]
[{"name":"daily/2023-04-07","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:07+00:00"},{"name":"hourly/2023-04-07:11","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:06+00:00"},{"name":"latest","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:04+00:00"},{"name":"monthly/2023-04","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:10+00:00"},{"name":"weekly/2023-14","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:09+00:00"}]
```
2. Run `vmbackupmanager restore create` to create restore mark:
- Use relative path to backup to restore from currently used remote storage:
```console
$ /vmbackupmanager-prod restore create daily/2022-10-06
$ /vmbackupmanager-prod restore create daily/2023-04-07
```
- Use full path to backup to restore from any remote storage:
```console
$ /vmbackupmanager-prod restore create azblob://test1/vmbackupmanager/daily/2022-10-06
$ /vmbackupmanager-prod restore create azblob://test1/vmbackupmanager/daily/2023-04-07
```
3. Stop `vmstorage` or `vmsingle` node
4. Run `vmbackupmanager restore` to restore backup:
@@ -275,23 +322,24 @@ If restore mark doesn't exist at `storageDataPath`(restore wasn't requested) `vm
```yaml
vmbackup:
restore:
onStart: "true"
onStart:
enabled: "true"
```
See operator `VMStorage` schema [here](https://docs.victoriametrics.com/operator/api.html#vmstorage) and `VMSingle` [here](https://docs.victoriametrics.com/operator/api.html#vmsinglespec).
2. Enter container running `vmbackupmanager`
2. Use `vmbackupmanager backup list` to get list of available backups:
```console
$ /vmbackupmanager-prod backup list
["daily/2022-10-06","daily/2022-10-10","hourly/2022-10-04:13","hourly/2022-10-06:12","hourly/2022-10-06:13","hourly/2022-10-10:14","hourly/2022-10-10:16","monthly/2022-10","weekly/2022-40","weekly/2022-41"]
[{"name":"daily/2023-04-07","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:07+00:00"},{"name":"hourly/2023-04-07:11","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:06+00:00"},{"name":"latest","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:04+00:00"},{"name":"monthly/2023-04","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:10+00:00"},{"name":"weekly/2023-14","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:09+00:00"}]
```
3. Use `vmbackupmanager restore create` to create restore mark:
- Use relative path to backup to restore from currently used remote storage:
```console
$ /vmbackupmanager-prod restore create daily/2022-10-06
$ /vmbackupmanager-prod restore create daily/2023-04-07
```
- Use full path to backup to restore from any remote storage:
```console
$ /vmbackupmanager-prod restore create azblob://test1/vmbackupmanager/daily/2022-10-06
$ /vmbackupmanager-prod restore create azblob://test1/vmbackupmanager/daily/2023-04-07
```
4. Restart pod
@@ -305,7 +353,8 @@ Clusters here are referred to as `source` and `destination`.
```yaml
vmbackup:
restore:
onStart: "true"
onStart:
enabled: "true"
```
Note: it is safe to leave this section in the cluster configuration, since it will be ignored if restore mark doesn't exist.
> Important! Use different `-dst` for *destination* cluster to avoid overwriting backup data of the *source* cluster.
@@ -313,13 +362,13 @@ Clusters here are referred to as `source` and `destination`.
2. Use `vmbackupmanager backup list` to get list of available backups:
```console
$ /vmbackupmanager-prod backup list
["daily/2022-10-06","daily/2022-10-10","hourly/2022-10-04:13","hourly/2022-10-06:12","hourly/2022-10-06:13","hourly/2022-10-10:14","hourly/2022-10-10:16","monthly/2022-10","weekly/2022-40","weekly/2022-41"]
[{"name":"daily/2023-04-07","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:07+00:00"},{"name":"hourly/2023-04-07:11","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:06+00:00"},{"name":"latest","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:04+00:00"},{"name":"monthly/2023-04","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:10+00:00"},{"name":"weekly/2023-14","size_bytes":318837,"size":"311.4ki","created_at":"2023-04-07T16:15:09+00:00"}]
```
3. Use `vmbackupmanager restore create` to create restore mark at each pod of the *destination* cluster.
Each pod in *destination* cluster should be restored from backup of respective pod in *source* cluster.
For example: `vmstorage-source-0` in *source* cluster should be restored from `vmstorage-destination-0` in *destination* cluster.
```console
$ /vmbackupmanager-prod restore create s3://source_cluster/vmstorage-source-0/daily/2022-10-06
$ /vmbackupmanager-prod restore create s3://source_cluster/vmstorage-source-0/daily/2023-04-07
```
## Monitoring
@@ -374,7 +423,7 @@ command-line flags:
-dst string
The root folder of Victoria Metrics backups. Example: gs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
-envflag.prefix string
@@ -384,11 +433,11 @@ command-line flags:
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -444,16 +493,16 @@ command-line flags:
-pushmetrics.interval duration
Interval for pushing metrics to -pushmetrics.url (default 10s)
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default metrics exposed at /metrics page aren't pushed to any remote storage
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-runOnStart
Upload backups immediately after start of the service. Otherwise the backup starts on new hour
Upload backups immediately after start of the service. Otherwise, the backup starts on new hour
-s3ForcePathStyle
Prefixing endpoint with bucket name when set false, true by default. (default true)
-snapshot.createURL string
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup.Example: http://victoriametrics:8428/snapshot/create
-snapshot.deleteURL string
VictoriaMetrics delete snapshot url. Optional. Will be generated from snapshot.createURL if not provided. All created snaphosts will be automatically deleted.Example: http://victoriametrics:8428/snapshot/delete
VictoriaMetrics delete snapshot url. Optional. Will be generated from snapshot.createURL if not provided. All created snapshots will be automatically deleted.Example: http://victoriametrics:8428/snapshot/delete
-storageDataPath string
Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage (default "victoria-metrics-data")
-tls

View File

@@ -93,7 +93,7 @@ OpenTSDB migration works like so:
- e.g. `curl -Ss "http://opentsdb:4242/api/search/lookup?m=system.load5&limit=1000000"`
Here `results` return field should not be empty. Otherwise it means that meta tables are absent and needs to be turned on previously.
Here `results` return field should not be empty. Otherwise, it means that meta tables are absent and needs to be turned on previously.
3. Download data for each series in chunks defined in the CLI switches
@@ -146,7 +146,7 @@ Retention strings essentially define the two levels of aggregation for our colle
First-order aggregation addresses how to aggregate any un-mentioned tags.
This is, conceptually, directly opposite to how PromQL deals with tags. In OpenTSDB, if a tag isn't explicitly mentioned, all values assocaited with that tag will be aggregated.
This is, conceptually, directly opposite to how PromQL deals with tags. In OpenTSDB, if a tag isn't explicitly mentioned, all values associated with that tag will be aggregated.
It is recommended to use `sum` for the first aggregation because it is relatively quick and should not cause any changes to the incoming data (because we collect each individual series).
@@ -154,9 +154,9 @@ It is recommended to use `sum` for the first aggregation because it is relativel
Second-order aggregation (`1m-avg` in our example) defines any windowing that should occur before returning the data
It is recommended to match the stat collection interval so we again avoid transforming incoming data.
It is recommended to match the stat collection interval, so we again avoid transforming incoming data.
We do not allow for defining the "null value" portion of the rollup window (e.g. in the aggreagtion, `1m-avg-none`, the user cannot change `none`), as the goal of this tool is to avoid modifying incoming data.
We do not allow for defining the "null value" portion of the rollup window (e.g. in the aggregation, `1m-avg-none`, the user cannot change `none`), as the goal of this tool is to avoid modifying incoming data.
#### Windows
@@ -173,9 +173,9 @@ The window `1h` means that each individual query to OpenTSDB should only span 1
It is important to ensure this window somewhat matches the row size in HBase to help improve query times.
For example, if the query is hitting a rollup table with a 4 hour row size, we should set a chunk size of a multiple of 4 hours (e.g. `4h`, `8h`, etc.) to avoid requesting data across row boundaries. Landing on row boundaries allows for more consistent request times to HBase.
For example, if the query is hitting a rollup table with a 4-hour row size, we should set a chunk size of a multiple of 4 hours (e.g. `4h`, `8h`, etc.) to avoid requesting data across row boundaries. Landing on row boundaries allows for more consistent request times to HBase.
The default table created in HBase for OpenTSDB has a 1 hour row size, so if you aren't sure on a correct row size to use, `1h` is a reasonable choice.
The default table created in HBase for OpenTSDB has a 1-hour row size, so if you aren't sure on a correct row size to use, `1h` is a reasonable choice.
##### Time range
@@ -197,7 +197,7 @@ Chunking the data like this means each individual query returns faster, so we ca
### Restarting OpenTSDB migrations
One important note for OpenTSDB migration: Queries/HBase scans can "get stuck" within OpenTSDB itself. This can cause instability and performance issues within an OpenTSDB cluster, so stopping the migrator to deal with it may be necessary. Because of this, we provide the timstamp we started collecting data from at thebeginning of the run. You can stop and restart the importer using this "hard timestamp" to ensure you collect data from the same time range over multiple runs.
One important note for OpenTSDB migration: Queries/HBase scans can "get stuck" within OpenTSDB itself. This can cause instability and performance issues within an OpenTSDB cluster, so stopping the migrator to deal with it may be necessary. Because of this, we provide the timestamp we started collecting data from at the beginning of the run. You can stop and restart the importer using this "hard timestamp" to ensure you collect data from the same time range over multiple runs.
## Migrating data from InfluxDB (1.x)
@@ -376,7 +376,7 @@ The configuration flags should contain self-explanatory descriptions.
The filtering consists of three parts: by timeseries and time.
Filtering by time may be configured via flags `--prom-filter-time-start` and `--prom-filter-time-end`
in in RFC3339 format. This filter applied twice: to drop blocks out of range and to filter timeseries in blocks with
in RFC3339 format. This filter applied twice: to drop blocks out of range and to filter timeseries in blocks with
overlapping time range.
Example of applying time filter:
@@ -403,7 +403,7 @@ since this is heavy operation and will be done during import process.
Filtering by timeseries is configured with following flags:
- `--prom-filter-label` - the label name, e.g. `__name__` or `instance`;
- `--prom-filter-label-value` - the regular expression to filter the label value. By default matches all `.*`
- `--prom-filter-label-value` - the regular expression to filter the label value. By default, matches all `.*`
For example:
@@ -659,7 +659,7 @@ requires an Authentication header like `X-Scope-OrgID`. You can define it via th
## Migrating data from Mimir
Mimir has similar implemintation as Cortex and also support of the Prometheus remote read protocol. That means
Mimir has similar implementation as Cortex and also support of the Prometheus remote read protocol. That means
`vmctl` in mode `remote-read` may also be used for Mimir historical data migration.
These instructions may vary based on the details of your Mimir configuration.
Please read carefully and verify as you go.
@@ -727,35 +727,32 @@ requires an Authentication header like `X-Scope-OrgID`. You can define it via th
## Migrating data from VictoriaMetrics
### Native protocol
The [native binary protocol](https://docs.victoriametrics.com/#how-to-export-data-in-native-format)
was introduced in [1.42.0 release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.42.0)
and provides the most efficient way to migrate data between VM instances: single to single, cluster to cluster,
single to cluster and vice versa. Please note that both instances (source and destination) should be of v1.42.0
or higher.
vmctl uses [native binary protocol](https://docs.victoriametrics.com/#how-to-export-data-in-native-format)
(available since [1.42.0 release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.42.0))
o migrate data between VM instances: single to single, cluster to cluster, single to cluster and vice versa.
See `./vmctl vm-native --help` for details and full list of flags.
Migration in `vm-native` mode takes two steps:
1. Explore the list of the metrics to migrate via `/api/v1/series` API;
1. Explore the list of the metrics to migrate via `api/v1/label/__name__/values` API;
2. Migrate explored metrics one-by-one.
```
./vmctl vm-native \
--vm-native-src-addr=http://127.0.0.1:8481/select/0/prometheus \
--vm-native-dst-addr=http://localhost:8428 \
--vm-native-filter-time-start='2022-11-20T00:00:00Z' \
--vm-native-filter-match='{__name__=~"vm_cache_.*"}'
--vm-native-src-addr=http://127.0.0.1:8481/select/0/prometheus \ # migrate from
--vm-native-dst-addr=http://localhost:8428 \ # migrate to
--vm-native-filter-time-start='2022-11-20T00:00:00Z' \ # starting from
--vm-native-filter-match='{__name__=~"vm_cache_.*"}' # only metrics matching the selector
VictoriaMetrics Native import mode
2023/03/02 09:22:02 Initing import process from "http://127.0.0.1:8481/select/0/prometheus/api/v1/export/native" to "http://localhost:8428/api/v1/import/native" with filter
2023/03/02 09:22:02 Initing import process from "http://127.0.0.1:8481/select/0/prometheus/api/v1/export/native"
to "http://localhost:8428/api/v1/import/native" with filter
filter: match[]={__name__=~"vm_cache_.*"}
start: 2022-11-20T00:00:00Z
2023/03/02 09:22:02 Exploring metrics...
Found 9 metrics to import. Continue? [Y/n]
2023/03/02 09:22:04 Requests to make: 9
Requests to make: 9 / 9 [███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████] 100.00%
Requests to make: 9 / 9 [█████████████████████████████████████████████████████████████████████████████] 100.00%
2023/03/02 09:22:06 Import finished!
2023/03/02 09:22:06 VictoriaMetrics importer stats:
time spent while importing: 3.632638875s;
@@ -766,39 +763,58 @@ Requests to make: 9 / 9 [██████████████████
2023/03/02 09:22:06 Total time: 3.633127625s
```
_To disable explore phase and switch to the old way of data migration via single connection use
`--vm-native-disable-retries` cmd-line flag. Please note, in this mode vmctl won't be able to retry failed requests._
Importing tips:
1. Migrating big volumes of data may result in reaching the safety limits on `src` side.
Please verify that `-search.maxExportDuration` and `-search.maxExportSeries` were set with
proper values for `src`. If hitting the limits, follow the recommendations [here](https://docs.victoriametrics.com/#how-to-export-data-in-native-format).
If hitting `the number of matching timeseries exceeds...` error, adjust filters to match less time series or update `-search.maxSeries` command-line flag on vmselect/vmsingle;
proper values for `src`. If hitting the limits, follow the recommendations
[here](https://docs.victoriametrics.com/#how-to-export-data-in-native-format).
If hitting `the number of matching timeseries exceeds...` error, adjust filters to match less time series or
update `-search.maxSeries` command-line flag on vmselect/vmsingle;
2. Migrating all the metrics from one VM to another may collide with existing application metrics
(prefixed with `vm_`) at destination and lead to confusion when using
[official Grafana dashboards](https://grafana.com/orgs/victoriametrics/dashboards).
To avoid such situation try to filter out VM process metrics via `--vm-native-filter-match` flag.
3. Migration is a backfilling process, so it is recommended to read
To avoid such situation try to filter out VM process metrics via `--vm-native-filter-match='{__name__!~"vm_.*"}'` flag.
3. Migrating data with overlapping time range or via unstable network can produce duplicates series at destination.
To avoid duplicates set `-dedup.minScrapeInterval=1ms` for `vmselect`/`vmstorage` at the destination.
This will instruct `vmselect`/`vmstorage` to ignore duplicates with identical timestamps.
4. When migrating large volumes of data use `--vm-native-step-interval` flag to split migration [into steps](#using-time-based-chunking-of-migration).
5. When migrating data from one VM cluster to another, consider using [cluster-to-cluster mode](#cluster-to-cluster-migration-mode).
Or manually specify addresses according to [URL format](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format):
```console
# Migrating from cluster specific tenantID to single
--vm-native-src-addr=http://<src-vmselect>:8481/select/0/prometheus
--vm-native-dst-addr=http://<dst-vmsingle>:8428
# Migrating from single to cluster specific tenantID
--vm-native-src-addr=http://<src-vmsingle>:8428
--vm-native-src-addr=http://<dst-vminsert>:8480/insert/0/prometheus
# Migrating single to single
--vm-native-src-addr=http://<src-vmsingle>:8428
--vm-native-dst-addr=http://<dst-vmsingle>:8428
# Migrating cluster to cluster for specific tenant ID
--vm-native-src-addr=http://<src-vmselect>:8481/select/0/prometheus
--vm-native-dst-addr=http://<dst-vminsert>:8480/insert/0/prometheus
```
6. Migration speed can be adjusted via `--vm-concurrency` cmd-line flag, which controls the number of concurrent
workers busy with processing. Please note, that each worker can load up to a single vCPU core on VictoriaMetrics.
So try to set it according to allocated CPU resources of your VictoriaMetrics destination installation.
7. Migration is a backfilling process, so it is recommended to read
[Backfilling tips](https://github.com/VictoriaMetrics/VictoriaMetrics#backfilling) section.
4. `vmctl` doesn't provide relabeling or other types of labels management in this mode.
8. `vmctl` doesn't provide relabeling or other types of labels management.
Instead, use [relabeling in VictoriaMetrics](https://github.com/VictoriaMetrics/vmctl/issues/4#issuecomment-683424375).
5. When importing in or from cluster version remember to use correct [URL format](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format)
and specify `accountID` param.
6. When migrating large volumes of data it might be useful to use `--vm-native-step-interval` flag to split single process into smaller steps.
7. `vmctl` supports `--vm-concurrency` which controls the number of concurrent workers that process the input from source query results.
Please note that each import request can load up to a single vCPU core on VictoriaMetrics. So try to set it according
to allocated CPU resources of your VictoriaMetrics installation.
8. `vmctl` supports `--vm-native-src-headers` and `--vm-native-dst-headers` which defines headers to send with each request
9. `vmctl` supports `--vm-native-src-headers` and `--vm-native-dst-headers` to define headers sent with each request
to the corresponding source address.
9. `vmctl` supports `--vm-native-disable-http-keep-alive` to allow `vmctl` to use non-persistent HTTP connections to avoid
10. `vmctl` supports `--vm-native-disable-http-keep-alive` to allow `vmctl` to use non-persistent HTTP connections to avoid
error `use of closed network connection` when run a longer export.
10. Migrating data with overlapping time range for destination data can produce duplicates series at destination.
To avoid duplicates on the destination set `-dedup.minScrapeInterval=1ms` for `vmselect` and `vmstorage`.
This will instruct `vmselect` and `vmstorage` to ignore duplicates with match timestamps.
In this mode `vmctl` acts as a proxy between two VM instances, where time series filtering is done by "source" (`src`)
and processing is done by "destination" (`dst`). So no extra memory or CPU resources required on `vmctl` side. Only
`src` and `dst` resource matter.
#### Using time-based chunking of migration
### Using time-based chunking of migration
It is possible split migration process into set of smaller batches based on time. This is especially useful when
migrating large volumes of data as this adds indication of progress and ability to restore process from certain point
@@ -843,7 +859,7 @@ Requests to make: 45 / 45 [█████████████████
2023/03/02 09:18:12 Total time: 7.112405875s
```
#### Cluster-to-cluster migration mode
### Cluster-to-cluster migration mode
Using cluster-to-cluster migration mode helps to migrate all tenants data in a single `vmctl` run.
@@ -891,7 +907,9 @@ Requests to make for tenant 1:0: 28 / 28 [████████████
## Verifying exported blocks from VictoriaMetrics
In this mode, `vmctl` allows verifying correctness and integrity of data exported via [native format](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-export-data-in-native-format) from VictoriaMetrics.
In this mode, `vmctl` allows verifying correctness and integrity of data exported via
[native format](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-export-data-in-native-format)
from VictoriaMetrics.
You can verify exported data at disk before uploading it by `vmctl verify-block` command:
```console
@@ -926,7 +944,7 @@ to number of free CPU cores.
The flag `--vm-concurrency` controls the number of concurrent workers that process the input from InfluxDB query results.
Please note that each import request can load up to a single vCPU core on VictoriaMetrics. So try to set it according
to allocated CPU resources of your VictoriMetrics installation.
to allocated CPU resources of your VictoriaMetrics installation.
The flag `--vm-batch-size` controls max amount of samples collected before sending the import request.
For example, if `--influx-chunk-size=500` and `--vm-batch-size=2000` then importer will process not more
@@ -971,7 +989,7 @@ according to [information theory](https://en.wikipedia.org/wiki/Information_theo
`vmctl` provides the following flags for improving data compression:
- `--vm-round-digits` flag for rounding processed values to the given number of decimal digits after the point.
For example, `--vm-round-digits=2` would round `1.2345` to `1.23`. By default the rounding is disabled.
For example, `--vm-round-digits=2` would round `1.2345` to `1.23`. By default, the rounding is disabled.
- `--vm-significant-figures` flag for limiting the number of significant figures in processed values. It takes no effect if set
to 0 (by default), but set `--vm-significant-figures=5` and `102.342305` will be rounded to `102.34`.
@@ -981,7 +999,7 @@ results such as `average`, `rate`, etc.
### Adding extra labels
`vmctl` allows to add extra labels to all imported series. It can be achived with flag `--vm-extra-label label=value`.
`vmctl` allows to add extra labels to all imported series. It can be achieved with flag `--vm-extra-label label=value`.
If multiple labels needs to be added, set flag for each label, for example, `--vm-extra-label label1=value1 --vm-extra-label label2=value2`.
If timeseries already have label, that must be added with `--vm-extra-label` flag, flag has priority and will override label value from timeseries.

View File

@@ -42,7 +42,6 @@ func New() *Backoff {
func (b *Backoff) Retry(ctx context.Context, cb retryableFunc) (uint64, error) {
var attempt uint64
for i := 0; i < b.retries; i++ {
// @TODO we should use context to cancel retries
err := cb()
if err == nil {
return attempt, nil
@@ -55,7 +54,19 @@ func (b *Backoff) Retry(ctx context.Context, cb retryableFunc) (uint64, error) {
backoff := float64(b.minDuration) * math.Pow(b.factor, float64(i))
dur := time.Duration(backoff)
logger.Errorf("got error: %s on attempt: %d; will retry in %v", err, attempt, dur)
time.Sleep(time.Duration(backoff))
t := time.NewTimer(dur)
select {
case <-t.C:
// duration elapsed, loop
case <-ctx.Done():
// context cancelled, kill the timer if it hasn't fired, and return
// the last error we got
if !t.Stop() {
<-t.C
}
return attempt, err
}
}
return attempt, fmt.Errorf("execution failed after %d retry attempts", b.retries)
}

View File

@@ -16,7 +16,7 @@ func TestRetry_Do(t *testing.T) {
backoffMinDuration time.Duration
retryableFunc retryableFunc
ctx context.Context
withCancel bool
cancelTimeout time.Duration
want uint64
wantErr bool
}{
@@ -79,10 +79,33 @@ func TestRetry_Do(t *testing.T) {
want: 5,
wantErr: true,
},
{
name: "cancel context",
backoffRetries: 5,
backoffFactor: 0.1,
backoffMinDuration: time.Millisecond * 10,
retryableFunc: func() error {
t := time.NewTicker(time.Millisecond * 5)
defer t.Stop()
for range t.C {
return fmt.Errorf("got some error")
}
return nil
},
ctx: context.Background(),
cancelTimeout: time.Second * 5,
want: 3,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
r := New()
if tt.cancelTimeout != 0 {
newCtx, cancelFn := context.WithTimeout(tt.ctx, tt.cancelTimeout)
tt.ctx = newCtx
defer cancelFn()
}
got, err := r.Retry(tt.ctx, tt.retryableFunc)
if (err != nil) != tt.wantErr {
t.Errorf("Retry() error = %v, wantErr %v", err, tt.wantErr)

View File

@@ -113,7 +113,7 @@ var (
&cli.Int64Flag{
Name: vmRateLimit,
Usage: "Optional data transfer rate limit in bytes per second.\n" +
"By default the rate limit is disabled. It can be useful for limiting load on configured via '--vmAddr' destination.",
"By default, the rate limit is disabled. It can be useful for limiting load on configured via '--vmAddr' destination.",
},
&cli.BoolFlag{
Name: vmDisableProgressBar,
@@ -326,6 +326,7 @@ const (
vmNativeStepInterval = "vm-native-step-interval"
vmNativeDisableHTTPKeepAlive = "vm-native-disable-http-keep-alive"
vmNativeDisableRetries = "vm-native-disable-retries"
vmNativeSrcAddr = "vm-native-src-addr"
vmNativeSrcUser = "vm-native-src-user"
@@ -351,16 +352,17 @@ var (
},
&cli.StringFlag{
Name: vmNativeFilterTimeStart,
Usage: "The time filter may contain either unix timestamp in seconds or RFC3339 values. E.g. '2020-01-01T20:07:00Z'",
Usage: "The time filter may contain different timestamp formats. See more details here https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#timestamp-formats",
Required: true,
},
&cli.StringFlag{
Name: vmNativeFilterTimeEnd,
Usage: "The time filter may contain either unix timestamp in seconds or RFC3339 values. E.g. '2020-01-01T20:07:00Z'",
Usage: "The time filter may contain different timestamp formats. See more details here https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#timestamp-formats",
},
&cli.StringFlag{
Name: vmNativeStepInterval,
Usage: fmt.Sprintf("Split export data into chunks. Requires setting --%s. Valid values are '%s','%s','%s','%s'.", vmNativeFilterTimeStart, stepper.StepMonth, stepper.StepDay, stepper.StepHour, stepper.StepMinute),
Value: stepper.StepMonth,
},
&cli.BoolFlag{
Name: vmNativeDisableHTTPKeepAlive,
@@ -430,7 +432,7 @@ var (
&cli.Int64Flag{
Name: vmRateLimit,
Usage: "Optional data transfer rate limit in bytes per second.\n" +
"By default the rate limit is disabled. It can be useful for limiting load on source or destination databases.",
"By default, the rate limit is disabled. It can be useful for limiting load on source or destination databases.",
},
&cli.BoolFlag{
Name: vmInterCluster,
@@ -443,6 +445,11 @@ var (
Usage: "Number of workers concurrently performing import requests to VM",
Value: 2,
},
&cli.BoolFlag{
Name: vmNativeDisableRetries,
Usage: "Defines whether to disable retries with backoff policy for migration process",
Value: false,
},
}
)
@@ -497,7 +504,7 @@ var (
},
&cli.BoolFlag{
Name: remoteReadUseStream,
Usage: "Defines whether to use SAMPLES or STREAMED_XOR_CHUNKS mode. By default is uses SAMPLES mode. See https://prometheus.io/docs/prometheus/latest/querying/remote_read_api/#streamed-chunks",
Usage: "Defines whether to use SAMPLES or STREAMED_XOR_CHUNKS mode. By default, is uses SAMPLES mode. See https://prometheus.io/docs/prometheus/latest/querying/remote_read_api/#streamed-chunks",
Value: false,
},
&cli.StringFlag{

View File

@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"strings"
@@ -15,6 +16,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/backoff"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/native"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/remoteread"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/terminal"
"github.com/urfave/cli/v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/influx"
@@ -71,7 +73,7 @@ func main() {
}
otsdbProcessor := newOtsdbProcessor(otsdbClient, importer, c.Int(otsdbConcurrency))
return otsdbProcessor.run(c.Bool(globalSilent), c.Bool(globalVerbose))
return otsdbProcessor.run(isNonInteractive(c), c.Bool(globalVerbose))
},
},
{
@@ -112,7 +114,7 @@ func main() {
c.String(influxMeasurementFieldSeparator),
c.Bool(influxSkipDatabaseLabel),
c.Bool(influxPrometheusMode))
return processor.run(c.Bool(globalSilent), c.Bool(globalVerbose))
return processor.run(isNonInteractive(c), c.Bool(globalVerbose))
},
},
{
@@ -152,7 +154,7 @@ func main() {
},
cc: c.Int(remoteReadConcurrency),
}
return rmp.run(ctx, c.Bool(globalSilent), c.Bool(globalVerbose))
return rmp.run(ctx, isNonInteractive(c), c.Bool(globalVerbose))
},
},
{
@@ -186,7 +188,7 @@ func main() {
im: importer,
cc: c.Int(promConcurrency),
}
return pp.run(c.Bool(globalSilent), c.Bool(globalVerbose))
return pp.run(isNonInteractive(c), c.Bool(globalVerbose))
},
},
{
@@ -200,6 +202,8 @@ func main() {
return fmt.Errorf("flag %q can't be empty", vmNativeFilterMatch)
}
disableKeepAlive := c.Bool(vmNativeDisableHTTPKeepAlive)
var srcExtraLabels []string
srcAddr := strings.Trim(c.String(vmNativeSrcAddr), "/")
srcAuthConfig, err := auth.Generate(
@@ -209,6 +213,7 @@ func main() {
if err != nil {
return fmt.Errorf("error initilize auth config for source: %s", srcAddr)
}
srcHTTPClient := &http.Client{Transport: &http.Transport{DisableKeepAlives: disableKeepAlive}}
dstAddr := strings.Trim(c.String(vmNativeDstAddr), "/")
dstExtraLabels := c.StringSlice(vmExtraLabel)
@@ -219,6 +224,7 @@ func main() {
if err != nil {
return fmt.Errorf("error initilize auth config for destination: %s", dstAddr)
}
dstHTTPClient := &http.Client{Transport: &http.Transport{DisableKeepAlives: disableKeepAlive}}
p := vmNativeProcessor{
rateLimit: c.Int64(vmRateLimit),
@@ -230,21 +236,22 @@ func main() {
Chunk: c.String(vmNativeStepInterval),
},
src: &native.Client{
AuthCfg: srcAuthConfig,
Addr: srcAddr,
ExtraLabels: srcExtraLabels,
DisableHTTPKeepAlive: c.Bool(vmNativeDisableHTTPKeepAlive),
AuthCfg: srcAuthConfig,
Addr: srcAddr,
ExtraLabels: srcExtraLabels,
HTTPClient: srcHTTPClient,
},
dst: &native.Client{
AuthCfg: dstAuthConfig,
Addr: dstAddr,
ExtraLabels: dstExtraLabels,
DisableHTTPKeepAlive: c.Bool(vmNativeDisableHTTPKeepAlive),
AuthCfg: dstAuthConfig,
Addr: dstAddr,
ExtraLabels: dstExtraLabels,
HTTPClient: dstHTTPClient,
},
backoff: backoff.New(),
cc: c.Int(vmConcurrency),
backoff: backoff.New(),
cc: c.Int(vmConcurrency),
disableRetries: c.Bool(vmNativeDisableRetries),
}
return p.run(ctx, c.Bool(globalSilent))
return p.run(ctx, isNonInteractive(c))
},
},
{
@@ -317,3 +324,8 @@ func initConfigVM(c *cli.Context) vm.Config {
DisableProgressBar: c.Bool(vmDisableProgressBar),
}
}
func isNonInteractive(c *cli.Context) bool {
isTerminal := terminal.IsTerminal(int(os.Stdout.Fd()))
return c.Bool(globalSilent) || !isTerminal
}

View File

@@ -11,34 +11,33 @@ import (
)
const (
nativeTenantsAddr = "admin/tenants"
nativeSeriesAddr = "api/v1/series"
nameLabel = "__name__"
nativeTenantsAddr = "admin/tenants"
nativeMetricNamesAddr = "api/v1/label/__name__/values"
)
// Client is an HTTP client for exporting and importing
// time series via native protocol.
type Client struct {
AuthCfg *auth.Config
Addr string
ExtraLabels []string
DisableHTTPKeepAlive bool
AuthCfg *auth.Config
Addr string
ExtraLabels []string
HTTPClient *http.Client
}
// LabelValues represents series from api/v1/series response
type LabelValues map[string]string
// Response represents response from api/v1/series
// Response represents response from api/v1/label/__name__/values
type Response struct {
Status string `json:"status"`
Series []LabelValues `json:"data"`
Status string `json:"status"`
MetricNames []string `json:"data"`
}
// Explore finds series by provided filter from api/v1/series
func (c *Client) Explore(ctx context.Context, f Filter, tenantID string) (map[string]struct{}, error) {
url := fmt.Sprintf("%s/%s", c.Addr, nativeSeriesAddr)
// Explore finds metric names by provided filter from api/v1/label/__name__/values
func (c *Client) Explore(ctx context.Context, f Filter, tenantID string) ([]string, error) {
url := fmt.Sprintf("%s/%s", c.Addr, nativeMetricNamesAddr)
if tenantID != "" {
url = fmt.Sprintf("%s/select/%s/prometheus/%s", c.Addr, tenantID, nativeSeriesAddr)
url = fmt.Sprintf("%s/select/%s/prometheus/%s", c.Addr, tenantID, nativeMetricNamesAddr)
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
@@ -68,21 +67,7 @@ func (c *Client) Explore(ctx context.Context, f Filter, tenantID string) (map[st
if err := resp.Body.Close(); err != nil {
return nil, fmt.Errorf("cannot close series response body: %s", err)
}
names := make(map[string]struct{})
for _, series := range response.Series {
// TODO: consider tweaking /api/v1/series API to return metric names only
// this could make explore response much lighter.
for key, value := range series {
if key != nameLabel {
continue
}
if _, ok := names[value]; ok {
continue
}
names[value] = struct{}{}
}
}
return names, nil
return response.MetricNames, nil
}
// ImportPipe uses pipe reader in request to process data
@@ -169,8 +154,8 @@ func (c *Client) do(req *http.Request, expSC int) (*http.Response, error) {
if c.AuthCfg != nil {
c.AuthCfg.SetHeaders(req, true)
}
var httpClient = &http.Client{Transport: &http.Transport{DisableKeepAlives: c.DisableHTTPKeepAlive}}
resp, err := httpClient.Do(req)
resp, err := c.HTTPClient.Do(req)
if err != nil {
return nil, fmt.Errorf("unexpected error when performing request: %w", err)
}

View File

@@ -121,6 +121,55 @@ func Test_splitDateRange(t *testing.T) {
},
wantErr: false,
},
{
name: "month chunking with one day time range",
args: args{
start: "2022-01-03T11:11:11Z",
end: "2022-01-04T12:12:12Z",
granularity: StepMonth,
},
want: []testTimeRange{
{
"2022-01-03T11:11:11Z",
"2022-01-04T12:12:12Z",
},
},
wantErr: false,
},
{
name: "month chunking with same day time range",
args: args{
start: "2022-01-03T11:11:11Z",
end: "2022-01-03T12:12:12Z",
granularity: StepMonth,
},
want: []testTimeRange{
{
"2022-01-03T11:11:11Z",
"2022-01-03T12:12:12Z",
},
},
wantErr: false,
},
{
name: "month chunking with one month and two days range",
args: args{
start: "2022-01-03T11:11:11Z",
end: "2022-02-03T00:00:00Z",
granularity: StepMonth,
},
want: []testTimeRange{
{
"2022-01-03T11:11:11Z",
"2022-01-31T23:59:59.999999999Z",
},
{
"2022-02-01T00:00:00Z",
"2022-02-03T00:00:00Z",
},
},
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {

View File

@@ -0,0 +1,6 @@
package terminal
// IsTerminal returns true if the file descriptor is terminal
func IsTerminal(fd int) bool {
return isTerminal(fd)
}

View File

@@ -0,0 +1,13 @@
//go:build aix || linux || solaris || zos
// +build aix linux solaris zos
package terminal
import "golang.org/x/sys/unix"
const ioctlReadTermios = unix.TCGETS
func isTerminal(fd int) bool {
_, err := unix.IoctlGetTermios(fd, ioctlReadTermios)
return err == nil
}

View File

@@ -0,0 +1,13 @@
//go:build darwin || freebsd || openbsd
// +build darwin freebsd openbsd
package terminal
import "golang.org/x/sys/unix"
const ioctlReadTermios = unix.TIOCGETA
func isTerminal(fd int) bool {
_, err := unix.IoctlGetTermios(fd, ioctlReadTermios)
return err == nil
}

View File

@@ -0,0 +1,8 @@
//go:build windows
// +build windows
package terminal
func isTerminal(fd int) bool {
return true
}

View File

@@ -2,39 +2,78 @@ package remote_read_integration
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"log"
"net/http"
"net/http/httptest"
"reflect"
"sort"
"strconv"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/native/stream"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/vmimport"
)
// LabelValues represents series from api/v1/series response
type LabelValues map[string]string
// Response represents response from api/v1/series
type Response struct {
Status string `json:"status"`
Series []LabelValues `json:"data"`
}
type MetricNamesResponse struct {
Status string `json:"status"`
Data []string `json:"data"`
}
// RemoteWriteServer represents fake remote write server with database
type RemoteWriteServer struct {
server *httptest.Server
series []vm.TimeSeries
server *httptest.Server
series []vm.TimeSeries
expectedSeries []vm.TimeSeries
}
// NewRemoteWriteServer prepares test remote write server
func NewRemoteWriteServer(t *testing.T) *RemoteWriteServer {
rws := &RemoteWriteServer{series: make([]vm.TimeSeries, 0)}
mux := http.NewServeMux()
mux.Handle("/api/v1/import", rws.getWriteHandler(t))
mux.Handle("/health", rws.handlePing())
mux.Handle("/api/v1/series", rws.seriesHandler())
mux.Handle("/api/v1/label/__name__/values", rws.valuesHandler())
mux.Handle("/api/v1/export/native", rws.exportNativeHandler())
mux.Handle("/api/v1/import/native", rws.importNativeHandler(t))
rws.server = httptest.NewServer(mux)
return rws
}
// Close closes the server.
// Close closes the server
func (rws *RemoteWriteServer) Close() {
rws.server.Close()
}
func (rws *RemoteWriteServer) ExpectedSeries(series []vm.TimeSeries) {
// Series saves generated series for fake database
func (rws *RemoteWriteServer) Series(series []vm.TimeSeries) {
rws.series = append(rws.series, series...)
}
// ExpectedSeries saves expected results to check in the handler
func (rws *RemoteWriteServer) ExpectedSeries(series []vm.TimeSeries) {
rws.expectedSeries = append(rws.expectedSeries, series...)
}
// URL returns server url
func (rws *RemoteWriteServer) URL() string {
return rws.server.URL
}
@@ -68,13 +107,14 @@ func (rws *RemoteWriteServer) getWriteHandler(t *testing.T) http.Handler {
rows.Reset()
}
if !reflect.DeepEqual(tss, rws.series) {
if !reflect.DeepEqual(tss, rws.expectedSeries) {
w.WriteHeader(http.StatusInternalServerError)
t.Fatalf("datasets not equal, expected: %#v; \n got: %#v", rws.series, tss)
t.Fatalf("datasets not equal, expected: %#v; \n got: %#v", rws.expectedSeries, tss)
return
}
w.WriteHeader(http.StatusNoContent)
return
})
}
@@ -84,3 +124,186 @@ func (rws *RemoteWriteServer) handlePing() http.Handler {
_, _ = w.Write([]byte("OK"))
})
}
func (rws *RemoteWriteServer) seriesHandler() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
var labelValues []LabelValues
for _, ser := range rws.series {
metricNames := make(LabelValues)
if ser.Name != "" {
metricNames["__name__"] = ser.Name
}
for _, p := range ser.LabelPairs {
metricNames[p.Name] = p.Value
}
labelValues = append(labelValues, metricNames)
}
resp := Response{
Status: "success",
Series: labelValues,
}
err := json.NewEncoder(w).Encode(resp)
if err != nil {
log.Printf("error send series: %s", err)
w.WriteHeader(http.StatusInternalServerError)
return
}
})
}
func (rws *RemoteWriteServer) valuesHandler() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
labelNames := make(map[string]struct{})
for _, ser := range rws.series {
if ser.Name != "" {
labelNames[ser.Name] = struct{}{}
}
}
metricNames := make([]string, 0, len(labelNames))
for k := range labelNames {
metricNames = append(metricNames, k)
}
resp := MetricNamesResponse{
Status: "success",
Data: metricNames,
}
buf := bytes.NewBuffer(nil)
err := json.NewEncoder(buf).Encode(resp)
if err != nil {
log.Printf("error send series: %s", err)
w.WriteHeader(http.StatusInternalServerError)
return
}
w.WriteHeader(http.StatusOK)
_, err = w.Write(buf.Bytes())
if err != nil {
log.Printf("error send series: %s", err)
w.WriteHeader(http.StatusInternalServerError)
return
}
return
})
}
func (rws *RemoteWriteServer) exportNativeHandler() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
now := time.Now()
err := prometheus.ExportNativeHandler(now, w, r)
if err != nil {
log.Printf("error export series via native protocol: %s", err)
w.WriteHeader(http.StatusInternalServerError)
return
}
return
})
}
func (rws *RemoteWriteServer) importNativeHandler(t *testing.T) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
common.StartUnmarshalWorkers()
defer common.StopUnmarshalWorkers()
var gotTimeSeries []vm.TimeSeries
var mx sync.RWMutex
err := stream.Parse(r.Body, false, func(block *stream.Block) error {
mn := &block.MetricName
var timeseries vm.TimeSeries
timeseries.Name = string(mn.MetricGroup)
timeseries.Timestamps = append(timeseries.Timestamps, block.Timestamps...)
timeseries.Values = append(timeseries.Values, block.Values...)
for i := range mn.Tags {
tag := &mn.Tags[i]
timeseries.LabelPairs = append(timeseries.LabelPairs, vm.LabelPair{
Name: string(tag.Key),
Value: string(tag.Value),
})
}
mx.Lock()
gotTimeSeries = append(gotTimeSeries, timeseries)
mx.Unlock()
return nil
})
if err != nil {
log.Printf("error parse stream blocks: %s", err)
w.WriteHeader(http.StatusInternalServerError)
return
}
// got timeseries should be sorted
// because they are processed independently
sort.SliceStable(gotTimeSeries, func(i, j int) bool {
iv, jv := gotTimeSeries[i], gotTimeSeries[j]
switch {
case iv.Values[0] != jv.Values[0]:
return iv.Values[0] < jv.Values[0]
case iv.Timestamps[0] != jv.Timestamps[0]:
return iv.Timestamps[0] < jv.Timestamps[0]
default:
return iv.Name < jv.Name
}
})
if !reflect.DeepEqual(gotTimeSeries, rws.expectedSeries) {
w.WriteHeader(http.StatusInternalServerError)
t.Errorf("datasets not equal, expected: %#v;\n got: %#v", rws.expectedSeries, gotTimeSeries)
return
}
w.WriteHeader(http.StatusNoContent)
return
})
}
// GenerateVNSeries generates test timeseries
func GenerateVNSeries(start, end, numOfSeries, numOfSamples int64) []vm.TimeSeries {
var ts []vm.TimeSeries
j := 0
for i := 0; i < int(numOfSeries); i++ {
if i%3 == 0 {
j++
}
timeSeries := vm.TimeSeries{
Name: fmt.Sprintf("vm_metric_%d", j),
LabelPairs: []vm.LabelPair{
{Name: "job", Value: strconv.Itoa(i)},
},
}
ts = append(ts, timeSeries)
}
for i := range ts {
t, v := generateTimeStampsAndValues(i, start, end, numOfSamples)
ts[i].Timestamps = t
ts[i].Values = v
}
return ts
}
func generateTimeStampsAndValues(idx int, startTime, endTime, numOfSamples int64) ([]int64, []float64) {
delta := (endTime - startTime) / numOfSamples
var timestamps []int64
var values []float64
t := startTime
for t != endTime {
v := 100 * int64(idx)
timestamps = append(timestamps, t*1000)
values = append(values, float64(v))
t = t + delta
}
return timestamps, values
}

31
app/vmctl/utils/time.go Normal file
View File

@@ -0,0 +1,31 @@
package utils
import (
"fmt"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
const (
// These values prevent from overflow when storing msec-precision time in int64.
minTimeMsecs = 0 // use 0 instead of `int64(-1<<63) / 1e6` because the storage engine doesn't actually support negative time
maxTimeMsecs = int64(1<<63-1) / 1e6
)
// GetTime returns time from the given string.
func GetTime(s string) (time.Time, error) {
secs, err := promutils.ParseTime(s)
if err != nil {
return time.Time{}, fmt.Errorf("cannot parse %s: %w", s, err)
}
msecs := int64(secs * 1e3)
if msecs < minTimeMsecs {
msecs = 0
}
if msecs > maxTimeMsecs {
msecs = maxTimeMsecs
}
return time.Unix(0, msecs*int64(time.Millisecond)).UTC(), nil
}

View File

@@ -0,0 +1,182 @@
package utils
import (
"testing"
"time"
)
func TestGetTime(t *testing.T) {
l, _ := time.LoadLocation("UTC")
tests := []struct {
name string
s string
want func() time.Time
wantErr bool
}{
{
name: "empty string",
s: "",
want: func() time.Time { return time.Time{} },
wantErr: true,
},
{
name: "only year",
s: "2019",
want: func() time.Time {
t := time.Date(2019, 1, 1, 0, 0, 0, 0, l)
return t
},
},
{
name: "year and month",
s: "2019-01",
want: func() time.Time {
t := time.Date(2019, 1, 1, 0, 0, 0, 0, l)
return t
},
},
{
name: "year and not first month",
s: "2019-02",
want: func() time.Time {
t := time.Date(2019, 2, 1, 0, 0, 0, 0, l)
return t
},
},
{
name: "year, month and day",
s: "2019-02-01",
want: func() time.Time {
t := time.Date(2019, 2, 1, 0, 0, 0, 0, l)
return t
},
},
{
name: "year, month and not first day",
s: "2019-02-10",
want: func() time.Time {
t := time.Date(2019, 2, 10, 0, 0, 0, 0, l)
return t
},
},
{
name: "year, month, day and time",
s: "2019-02-02T00",
want: func() time.Time {
t := time.Date(2019, 2, 2, 0, 0, 0, 0, l)
return t
},
},
{
name: "year, month, day and one hour time",
s: "2019-02-02T01",
want: func() time.Time {
t := time.Date(2019, 2, 2, 1, 0, 0, 0, l)
return t
},
},
{
name: "time with zero minutes",
s: "2019-02-02T01:00",
want: func() time.Time {
t := time.Date(2019, 2, 2, 1, 0, 0, 0, l)
return t
},
},
{
name: "time with one minute",
s: "2019-02-02T01:01",
want: func() time.Time {
t := time.Date(2019, 2, 2, 1, 1, 0, 0, l)
return t
},
},
{
name: "time with zero seconds",
s: "2019-02-02T01:01:00",
want: func() time.Time {
t := time.Date(2019, 2, 2, 1, 1, 0, 0, l)
return t
},
},
{
name: "timezone with one second",
s: "2019-02-02T01:01:01",
want: func() time.Time {
t := time.Date(2019, 2, 2, 1, 1, 1, 0, l)
return t
},
},
{
name: "time with two second and timezone",
s: "2019-07-07T20:01:02Z",
want: func() time.Time {
t := time.Date(2019, 7, 7, 20, 1, 02, 0, l)
return t
},
},
{
name: "time with seconds and timezone",
s: "2019-07-07T20:47:40+03:00",
want: func() time.Time {
l, _ = time.LoadLocation("Europe/Kiev")
t := time.Date(2019, 7, 7, 20, 47, 40, 0, l)
return t
},
},
{
name: "negative time",
s: "-292273086-05-16T16:47:06Z",
want: func() time.Time { return time.Time{} },
wantErr: true,
},
{
name: "float timestamp representation",
s: "1562529662.324",
want: func() time.Time {
t := time.Date(2019, 7, 7, 23, 01, 02, 324, l)
return t
},
},
{
name: "negative timestamp",
s: "-9223372036.855",
want: func() time.Time {
l, _ = time.LoadLocation("Europe/Kiev")
return time.Date(1970, 01, 01, 03, 00, 00, 00, l)
},
wantErr: false,
},
{
name: "big timestamp",
s: "9223372036.855",
want: func() time.Time {
l, _ = time.LoadLocation("Europe/Kiev")
t := time.Date(2262, 04, 12, 02, 47, 16, 855, l)
return t
},
wantErr: false,
},
{
name: "duration time",
s: "1h5m",
want: func() time.Time {
t := time.Now().Add(-1 * time.Hour).Add(-5 * time.Minute)
return t
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GetTime(tt.s)
if (err != nil) != tt.wantErr {
t.Errorf("ParseTime() error = %v, wantErr %v", err, tt.wantErr)
return
}
w := tt.want()
if got.Unix() != w.Unix() {
t.Errorf("ParseTime() got = %v, want %v", got, w)
}
})
}
}

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"io"
"log"
"strings"
"sync"
"time"
@@ -12,7 +13,9 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/limiter"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/native"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/stepper"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/searchutils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/cheggaaa/pb/v3"
)
@@ -24,16 +27,18 @@ type vmNativeProcessor struct {
src *native.Client
backoff *backoff.Backoff
s *stats
rateLimit int64
interCluster bool
cc int
s *stats
rateLimit int64
interCluster bool
cc int
disableRetries bool
}
const (
nativeExportAddr = "api/v1/export/native"
nativeImportAddr = "api/v1/import/native"
nativeBarTpl = `{{ blue "%s:" }} {{ counters . }} {{ bar . "[" "█" (cycle . "█") "▒" "]" }} {{ percent . }}`
nativeExportAddr = "api/v1/export/native"
nativeImportAddr = "api/v1/import/native"
nativeWithBackoffTpl = `{{ blue "%s:" }} {{ counters . }} {{ bar . "[" "█" (cycle . "█") "▒" "]" }} {{ percent . }}`
nativeSingleProcessTpl = `Total: {{counters . }} {{ cycle . "↖" "↗" "↘" "↙" }} Speed: {{speed . }} {{string . "suffix"}}`
)
func (p *vmNativeProcessor) run(ctx context.Context, silent bool) error {
@@ -44,18 +49,16 @@ func (p *vmNativeProcessor) run(ctx context.Context, silent bool) error {
startTime: time.Now(),
}
start, err := time.Parse(time.RFC3339, p.filter.TimeStart)
start, err := utils.GetTime(p.filter.TimeStart)
if err != nil {
return fmt.Errorf("failed to parse %s, provided: %s, expected format: %s, error: %w",
vmNativeFilterTimeStart, p.filter.TimeStart, time.RFC3339, err)
return fmt.Errorf("failed to parse %s, provided: %s, error: %w", vmNativeFilterTimeStart, p.filter.TimeStart, err)
}
end := time.Now().In(start.Location())
if p.filter.TimeEnd != "" {
end, err = time.Parse(time.RFC3339, p.filter.TimeEnd)
end, err = utils.GetTime(p.filter.TimeEnd)
if err != nil {
return fmt.Errorf("failed to parse %s, provided: %s, expected format: %s, error: %w",
vmNativeFilterTimeEnd, p.filter.TimeEnd, time.RFC3339, err)
return fmt.Errorf("failed to parse %s, provided: %s, error: %w", vmNativeFilterTimeEnd, p.filter.TimeEnd, err)
}
}
@@ -93,27 +96,32 @@ func (p *vmNativeProcessor) run(ctx context.Context, silent bool) error {
return nil
}
func (p *vmNativeProcessor) do(ctx context.Context, f native.Filter, srcURL, dstURL string) error {
func (p *vmNativeProcessor) do(ctx context.Context, f native.Filter, srcURL, dstURL string, bar *pb.ProgressBar) error {
retryableFunc := func() error { return p.runSingle(ctx, f, srcURL, dstURL) }
retryableFunc := func() error { return p.runSingle(ctx, f, srcURL, dstURL, bar) }
attempts, err := p.backoff.Retry(ctx, retryableFunc)
p.s.Lock()
p.s.retries += attempts
p.s.Unlock()
if err != nil {
return fmt.Errorf("failed to migrate from %s to %s (retry attempts: %d): %w\nwith fileter %s", srcURL, dstURL, attempts, err, f)
return fmt.Errorf("failed to migrate from %s to %s (retry attempts: %d): %w\nwith filter %s", srcURL, dstURL, attempts, err, f)
}
return nil
}
func (p *vmNativeProcessor) runSingle(ctx context.Context, f native.Filter, srcURL, dstURL string) error {
func (p *vmNativeProcessor) runSingle(ctx context.Context, f native.Filter, srcURL, dstURL string, bar *pb.ProgressBar) error {
exportReader, err := p.src.ExportPipe(ctx, srcURL, f)
reader, err := p.src.ExportPipe(ctx, srcURL, f)
if err != nil {
return fmt.Errorf("failed to init export pipe: %w", err)
}
if p.disableRetries && bar != nil {
fmt.Printf("Continue import process with filter %s:\n", f.String())
reader = bar.NewProxyReader(reader)
}
pr, pw := io.Pipe()
done := make(chan struct{})
go func() {
@@ -130,7 +138,7 @@ func (p *vmNativeProcessor) runSingle(ctx context.Context, f native.Filter, srcU
w = limiter.NewWriteLimiter(pw, rl)
}
written, err := io.Copy(w, exportReader)
written, err := io.Copy(w, reader)
if err != nil {
return fmt.Errorf("failed to write into %q: %s", p.dst.Addr, err)
}
@@ -175,17 +183,22 @@ func (p *vmNativeProcessor) runBackfilling(ctx context.Context, tenantID string,
fmt.Println("") // extra line for better output formatting
log.Printf(initMessage, initParams...)
log.Printf("Exploring metrics...")
metrics, err := p.src.Explore(ctx, p.filter, tenantID)
if err != nil {
return fmt.Errorf("cannot get metrics from source %s: %w", p.src.Addr, err)
var foundSeriesMsg string
metrics := []string{p.filter.Match}
if !p.disableRetries {
log.Printf("Exploring metrics...")
metrics, err = p.src.Explore(ctx, p.filter, tenantID)
if err != nil {
return fmt.Errorf("cannot get metrics from source %s: %w", p.src.Addr, err)
}
if len(metrics) == 0 {
return fmt.Errorf("no metrics found")
}
foundSeriesMsg = fmt.Sprintf("Found %d metrics to import", len(metrics))
}
if len(metrics) == 0 {
return fmt.Errorf("no metrics found")
}
foundSeriesMsg := fmt.Sprintf("Found %d metrics to import", len(metrics))
if !p.interCluster {
// do not prompt for intercluster because there could be many tenants,
// and we don't want to interrupt the process when moving to the next tenant.
@@ -205,7 +218,10 @@ func (p *vmNativeProcessor) runBackfilling(ctx context.Context, tenantID string,
var bar *pb.ProgressBar
if !silent {
bar = pb.ProgressBarTemplate(fmt.Sprintf(nativeBarTpl, barPrefix)).New(len(metrics) * len(ranges))
bar = pb.ProgressBarTemplate(fmt.Sprintf(nativeWithBackoffTpl, barPrefix)).New(len(metrics) * len(ranges))
if p.disableRetries {
bar = pb.ProgressBarTemplate(nativeSingleProcessTpl).New(0)
}
bar.Start()
defer bar.Finish()
}
@@ -219,19 +235,33 @@ func (p *vmNativeProcessor) runBackfilling(ctx context.Context, tenantID string,
go func() {
defer wg.Done()
for f := range filterCh {
if err := p.do(ctx, f, srcURL, dstURL); err != nil {
errCh <- err
return
}
if bar != nil {
bar.Increment()
if !p.disableRetries {
if err := p.do(ctx, f, srcURL, dstURL, nil); err != nil {
errCh <- err
return
}
if bar != nil {
bar.Increment()
}
} else {
if err := p.runSingle(ctx, f, srcURL, dstURL, bar); err != nil {
errCh <- err
return
}
}
}
}()
}
// any error breaks the import
for s := range metrics {
for _, s := range metrics {
match, err := buildMatchWithFilter(p.filter.Match, s)
if err != nil {
logger.Errorf("failed to build export filters: %s", err)
continue
}
for _, times := range ranges {
select {
case <-ctx.Done():
@@ -239,7 +269,7 @@ func (p *vmNativeProcessor) runBackfilling(ctx context.Context, tenantID string,
case infErr := <-errCh:
return fmt.Errorf("native error: %s", infErr)
case filterCh <- native.Filter{
Match: fmt.Sprintf("{%s=%q}", nameLabel, s),
Match: match,
TimeStart: times[0].Format(time.RFC3339),
TimeEnd: times[1].Format(time.RFC3339),
}:
@@ -303,3 +333,28 @@ func byteCountSI(b int64) string {
return fmt.Sprintf("%.1f %cB",
float64(b)/float64(div), "kMGTPE"[exp])
}
func buildMatchWithFilter(filter string, metricName string) (string, error) {
if filter == metricName {
return filter, nil
}
labels, err := searchutils.ParseMetricSelector(filter)
if err != nil {
return "", err
}
str := make([]string, 0, len(labels))
for _, label := range labels {
if len(label.Key) == 0 {
continue
}
str = append(str, label.String())
}
nameFilter := fmt.Sprintf("__name__=%q", metricName)
str = append(str, nameFilter)
match := fmt.Sprintf("{%s}", strings.Join(str, ","))
return match, nil
}

View File

@@ -2,118 +2,389 @@ package main
import (
"context"
"flag"
"fmt"
"log"
"net/http"
"os"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/backoff"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/native"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/stepper"
remote_read_integration "github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/testdata/servers_integration_test"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)
// If you want to run this test:
// 1. run two instances of victoriametrics and define -httpListenAddr for both or just for second instance
// 2. define srcAddr and dstAddr const with your victoriametrics addresses
// 3. define matchFilter const with your importing data
// 4. define timeStartFilter
// 5. run each test one by one
const (
matchFilter = `{job="avalanche"}`
timeStartFilter = "2020-01-01T20:07:00Z"
timeEndFilter = "2020-08-01T20:07:00Z"
srcAddr = "http://127.0.0.1:8428"
dstAddr = "http://127.0.0.1:8528"
storagePath = "TestStorage"
retentionPeriod = "100y"
)
// This test simulates close process if user abort it
func Test_vmNativeProcessor_run(t *testing.T) {
t.Skip()
processFlags()
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
defer func() {
vmstorage.Stop()
if err := os.RemoveAll(storagePath); err != nil {
log.Fatalf("cannot remove %q: %s", storagePath, err)
}
}()
type fields struct {
filter native.Filter
rateLimit int64
dst *native.Client
src *native.Client
filter native.Filter
dst *native.Client
src *native.Client
backoff *backoff.Backoff
s *stats
rateLimit int64
interCluster bool
cc int
matchName string
matchValue string
}
type args struct {
ctx context.Context
silent bool
}
tests := []struct {
name string
fields fields
closer func(cancelFunc context.CancelFunc)
wantErr bool
name string
fields fields
args args
vmSeries func(start, end, numOfSeries, numOfSamples int64) []vm.TimeSeries
expectedSeries []vm.TimeSeries
start string
end string
numOfSamples int64
numOfSeries int64
chunk string
wantErr bool
}{
{
name: "simulate syscall.SIGINT",
name: "step minute on minute time range",
start: "2022-11-25T11:23:05+02:00",
end: "2022-11-27T11:24:05+02:00",
numOfSamples: 2,
numOfSeries: 3,
chunk: stepper.StepMinute,
fields: fields{
filter: native.Filter{
Match: matchFilter,
TimeStart: timeStartFilter,
filter: native.Filter{},
backoff: backoff.New(),
rateLimit: 0,
interCluster: false,
cc: 1,
matchName: "__name__",
matchValue: ".*",
},
args: args{
ctx: context.Background(),
silent: true,
},
vmSeries: remote_read_integration.GenerateVNSeries,
expectedSeries: []vm.TimeSeries{
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}},
Timestamps: []int64{1669368185000, 1669454615000},
Values: []float64{0, 0},
},
rateLimit: 0,
dst: &native.Client{
Addr: dstAddr,
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}},
Timestamps: []int64{1669368185000, 1669454615000},
Values: []float64{100, 100},
},
src: &native.Client{
Addr: srcAddr,
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}},
Timestamps: []int64{1669368185000, 1669454615000},
Values: []float64{200, 200},
},
},
closer: func(cancelFunc context.CancelFunc) {
time.Sleep(time.Second * 5)
cancelFunc()
},
wantErr: true,
},
{
name: "simulate correct work",
fields: fields{
filter: native.Filter{
Match: matchFilter,
TimeStart: timeStartFilter,
},
rateLimit: 0,
dst: &native.Client{
Addr: dstAddr,
},
src: &native.Client{
Addr: srcAddr,
},
},
closer: func(cancelFunc context.CancelFunc) {},
wantErr: false,
},
{
name: "simulate correct work with chunking",
name: "step month on month time range",
start: "2022-09-26T11:23:05+02:00",
end: "2022-11-26T11:24:05+02:00",
numOfSamples: 2,
numOfSeries: 3,
chunk: stepper.StepMonth,
fields: fields{
filter: native.Filter{
Match: matchFilter,
TimeStart: timeStartFilter,
TimeEnd: timeEndFilter,
Chunk: stepper.StepMonth,
filter: native.Filter{},
backoff: backoff.New(),
rateLimit: 0,
interCluster: false,
cc: 1,
matchName: "__name__",
matchValue: ".*",
},
args: args{
ctx: context.Background(),
silent: true,
},
vmSeries: remote_read_integration.GenerateVNSeries,
expectedSeries: []vm.TimeSeries{
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}},
Timestamps: []int64{1664184185000},
Values: []float64{0},
},
rateLimit: 0,
dst: &native.Client{
Addr: dstAddr,
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}},
Timestamps: []int64{1666819415000},
Values: []float64{0},
},
src: &native.Client{
Addr: srcAddr,
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}},
Timestamps: []int64{1664184185000},
Values: []float64{100},
},
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}},
Timestamps: []int64{1666819415000},
Values: []float64{100},
},
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}},
Timestamps: []int64{1664184185000},
Values: []float64{200},
},
{
Name: "vm_metric_1",
LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}},
Timestamps: []int64{1666819415000},
Values: []float64{200},
},
},
closer: func(cancelFunc context.CancelFunc) {},
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ctx, cancelFn := context.WithCancel(context.Background())
p := &vmNativeProcessor{
filter: tt.fields.filter,
rateLimit: tt.fields.rateLimit,
dst: tt.fields.dst,
src: tt.fields.src,
src := remote_read_integration.NewRemoteWriteServer(t)
dst := remote_read_integration.NewRemoteWriteServer(t)
defer func() {
src.Close()
dst.Close()
}()
start, err := time.Parse(time.RFC3339, tt.start)
if err != nil {
t.Fatalf("Error parse start time: %s", err)
}
tt.closer(cancelFn)
end, err := time.Parse(time.RFC3339, tt.end)
if err != nil {
t.Fatalf("Error parse end time: %s", err)
}
if err := p.run(ctx, true); (err != nil) != tt.wantErr {
tt.fields.filter.Match = fmt.Sprintf("{%s=~%q}", tt.fields.matchName, tt.fields.matchValue)
tt.fields.filter.TimeStart = tt.start
tt.fields.filter.TimeEnd = tt.end
rws := tt.vmSeries(start.Unix(), end.Unix(), tt.numOfSeries, tt.numOfSamples)
src.Series(rws)
dst.ExpectedSeries(tt.expectedSeries)
if err := fillStorage(rws); err != nil {
t.Fatalf("error add series to storage: %s", err)
}
tt.fields.src = &native.Client{
AuthCfg: nil,
Addr: src.URL(),
ExtraLabels: []string{},
HTTPClient: &http.Client{Transport: &http.Transport{DisableKeepAlives: false}},
}
tt.fields.dst = &native.Client{
AuthCfg: nil,
Addr: dst.URL(),
ExtraLabels: []string{},
HTTPClient: &http.Client{Transport: &http.Transport{DisableKeepAlives: false}},
}
p := &vmNativeProcessor{
filter: tt.fields.filter,
dst: tt.fields.dst,
src: tt.fields.src,
backoff: tt.fields.backoff,
s: tt.fields.s,
rateLimit: tt.fields.rateLimit,
interCluster: tt.fields.interCluster,
cc: tt.fields.cc,
}
if err := p.run(tt.args.ctx, tt.args.silent); (err != nil) != tt.wantErr {
t.Errorf("run() error = %v, wantErr %v", err, tt.wantErr)
}
deleted, err := deleteSeries(tt.fields.matchName, tt.fields.matchValue)
if err != nil {
t.Fatalf("error delete series: %s", err)
}
if int64(deleted) != tt.numOfSeries {
t.Fatalf("expected deleted series %d; got deleted series %d", tt.numOfSeries, deleted)
}
})
}
}
func processFlags() {
flag.Parse()
for _, fv := range []struct {
flag string
value string
}{
{flag: "storageDataPath", value: storagePath},
{flag: "retentionPeriod", value: retentionPeriod},
} {
// panics if flag doesn't exist
if err := flag.Lookup(fv.flag).Value.Set(fv.value); err != nil {
log.Fatalf("unable to set %q with value %q, err: %v", fv.flag, fv.value, err)
}
}
}
func fillStorage(series []vm.TimeSeries) error {
var mrs []storage.MetricRow
for _, series := range series {
var labels []prompb.Label
for _, lp := range series.LabelPairs {
labels = append(labels, prompb.Label{Name: []byte(lp.Name), Value: []byte(lp.Value)})
}
if series.Name != "" {
labels = append(labels, prompb.Label{Name: []byte("__name__"), Value: []byte(series.Name)})
}
mr := storage.MetricRow{}
mr.MetricNameRaw = storage.MarshalMetricNameRaw(mr.MetricNameRaw[:0], labels)
timestamps := series.Timestamps
values := series.Values
for i, value := range values {
mr.Timestamp = timestamps[i]
mr.Value = value
mrs = append(mrs, mr)
}
}
if err := vmstorage.AddRows(mrs); err != nil {
return fmt.Errorf("unexpected error in AddRows: %s", err)
}
vmstorage.Storage.DebugFlush()
return nil
}
func deleteSeries(name, value string) (int, error) {
tfs := storage.NewTagFilters()
if err := tfs.Add([]byte(name), []byte(value), false, true); err != nil {
return 0, fmt.Errorf("unexpected error in TagFilters.Add: %w", err)
}
return vmstorage.DeleteSeries(nil, []*storage.TagFilters{tfs})
}
func Test_buildMatchWithFilter(t *testing.T) {
tests := []struct {
name string
filter string
metricName string
want string
wantErr bool
}{
{
name: "parsed metric with label",
filter: `{__name__="http_request_count_total",cluster="kube1"}`,
metricName: "http_request_count_total",
want: `{cluster="kube1",__name__="http_request_count_total"}`,
wantErr: false,
},
{
name: "metric name with label",
filter: `http_request_count_total{cluster="kube1"}`,
metricName: "http_request_count_total",
want: `{cluster="kube1",__name__="http_request_count_total"}`,
wantErr: false,
},
{
name: "parsed metric with regexp value",
filter: `{__name__="http_request_count_total",cluster=~"kube.*"}`,
metricName: "http_request_count_total",
want: `{cluster=~"kube.*",__name__="http_request_count_total"}`,
wantErr: false,
},
{
name: "only label with regexp",
filter: `{cluster=~".*"}`,
metricName: "http_request_count_total",
want: `{cluster=~".*",__name__="http_request_count_total"}`,
wantErr: false,
},
{
name: "many labels in filter with regexp",
filter: `{cluster=~".*",job!=""}`,
metricName: "http_request_count_total",
want: `{cluster=~".*",job!="",__name__="http_request_count_total"}`,
wantErr: false,
},
{
name: "match with error",
filter: `{cluster~=".*"}`,
metricName: "http_request_count_total",
want: ``,
wantErr: true,
},
{
name: "all names",
filter: `{__name__!=""}`,
metricName: "http_request_count_total",
want: `{__name__="http_request_count_total"}`,
wantErr: false,
},
{
name: "with many underscores labels",
filter: `{__name__!="", __meta__!=""}`,
metricName: "http_request_count_total",
want: `{__meta__!="",__name__="http_request_count_total"}`,
wantErr: false,
},
{
name: "metric name has regexp",
filter: `{__name__=~".*"}`,
metricName: "http_request_count_total",
want: `{__name__="http_request_count_total"}`,
wantErr: false,
},
{
name: "metric name has negative regexp",
filter: `{__name__!~".*"}`,
metricName: "http_request_count_total",
want: `{__name__="http_request_count_total"}`,
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := buildMatchWithFilter(tt.filter, tt.metricName)
if (err != nil) != tt.wantErr {
t.Errorf("buildMatchWithFilter() error = %v, wantErr %v", err, tt.wantErr)
return
}
if got != tt.want {
t.Errorf("buildMatchWithFilter() got = %v, want %v", got, tt.want)
}
})
}
}

View File

@@ -44,7 +44,7 @@ jwt token must be in following format:
Where:
* `exp` - required, expire time in unix_timestamp. If the token expires then `vmgateway` rejects the request.
* `vm_access` - required, dict with claim info, minimum form: `{"vm_access": {"tenand_id": {}}`
* `vm_access` - required, dict with claim info, minimum form: `{"vm_access": {"tenant_id": {}}`
* `tenant_id` - optional, for cluster mode, routes requests to the corresponding tenant.
* `extra_labels` - optional, key-value pairs for label filters added to the ingested or selected metrics. Multiple filters are added with `and` operation. If defined, `extra_label` from original request removed.
* `extra_filters` - optional, [series selectors](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) added to the select query requests. Multiple selectors are added with `or` operation. If defined, `extra_filter` from original request removed.
@@ -144,7 +144,7 @@ EOF
./bin/vmselect -eula -storageNode 127.0.0.1:8401
./bin/vminsert -eula -storageNode 127.0.0.1:8400
# create base rate limitng config:
# create base rate limiting config:
cat << EOF > limit.yaml
limits:
- type: queries
@@ -310,7 +310,7 @@ The shortlist of configuration flags include the following:
-datasource.queryStep duration
How far a value can fallback to when evaluating queries. For example, if -datasource.queryStep=15s then param "step" with value "15s" will be added to every query. If set to 0, rule's evaluation interval will be used instead. (default 5m0s)
-datasource.queryTimeAlignment
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite of number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
-datasource.roundDigits int
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
-datasource.showURL
@@ -332,7 +332,7 @@ The shortlist of configuration flags include the following:
-enable.rateLimit
enables rate limiter
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
-envflag.prefix string
@@ -342,11 +342,11 @@ The shortlist of configuration flags include the following:
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -396,7 +396,7 @@ The shortlist of configuration flags include the following:
-pushmetrics.interval duration
Interval for pushing metrics to -pushmetrics.url (default 10s)
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default metrics exposed at /metrics page aren't pushed to any remote storage
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-ratelimit.config string
path for configuration file. Accepts url address

View File

@@ -137,7 +137,8 @@ func (ctx *InsertCtx) ApplyRelabeling() {
// FlushBufs flushes buffered rows to the underlying storage.
func (ctx *InsertCtx) FlushBufs() error {
if sa != nil && !ctx.skipStreamAggr {
sas := sasGlobal.Load()
if sas != nil && !ctx.skipStreamAggr {
ctx.streamAggrCtx.push(ctx.mrs)
if !*streamAggrKeepInput {
ctx.Reset(0)

View File

@@ -2,15 +2,20 @@ package common
import (
"flag"
"fmt"
"sync"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/streamaggr"
"github.com/VictoriaMetrics/metrics"
)
var (
@@ -18,34 +23,103 @@ var (
"See https://docs.victoriametrics.com/stream-aggregation.html . "+
"See also -remoteWrite.streamAggr.keepInput and -streamAggr.dedupInterval")
streamAggrKeepInput = flag.Bool("streamAggr.keepInput", false, "Whether to keep input samples after the aggregation with -streamAggr.config. "+
"By default the input is dropped after the aggregation, so only the aggregate data is stored. "+
"By default, the input is dropped after the aggregation, so only the aggregate data is stored. "+
"See https://docs.victoriametrics.com/stream-aggregation.html")
streamAggrDedupInterval = flag.Duration("streamAggr.dedupInterval", 0, "Input samples are de-duplicated with this interval before being aggregated. "+
"Only the last sample per each time series per each interval is aggregated if the interval is greater than zero")
)
var (
saCfgReloaderStopCh = make(chan struct{})
saCfgReloaderWG sync.WaitGroup
saCfgReloads = metrics.NewCounter(`vminsert_streamagg_config_reloads_total`)
saCfgReloadErr = metrics.NewCounter(`vminsert_streamagg_config_reloads_errors_total`)
saCfgSuccess = metrics.NewCounter(`vminsert_streamagg_config_last_reload_successful`)
saCfgTimestamp = metrics.NewCounter(`vminsert_streamagg_config_last_reload_success_timestamp_seconds`)
sasGlobal atomic.Pointer[streamaggr.Aggregators]
)
// CheckStreamAggrConfig checks config pointed by -stramaggr.config
func CheckStreamAggrConfig() error {
if *streamAggrConfig == "" {
return nil
}
pushNoop := func(tss []prompbmarshal.TimeSeries) {}
sas, err := streamaggr.LoadFromFile(*streamAggrConfig, pushNoop, *streamAggrDedupInterval)
if err != nil {
return fmt.Errorf("error when loading -streamAggr.config=%q: %w", *streamAggrConfig, err)
}
sas.MustStop()
return nil
}
// InitStreamAggr must be called after flag.Parse and before using the common package.
//
// MustStopStreamAggr must be called when stream aggr is no longer needed.
func InitStreamAggr() {
if *streamAggrConfig == "" {
// Nothing to initialize
return
}
a, err := streamaggr.LoadFromFile(*streamAggrConfig, pushAggregateSeries, *streamAggrDedupInterval)
sighupCh := procutil.NewSighupChan()
sas, err := streamaggr.LoadFromFile(*streamAggrConfig, pushAggregateSeries, *streamAggrDedupInterval)
if err != nil {
logger.Fatalf("cannot load -streamAggr.config=%q: %s", *streamAggrConfig, err)
}
sa = a
sasGlobal.Store(sas)
saCfgSuccess.Set(1)
saCfgTimestamp.Set(fasttime.UnixTimestamp())
// Start config reloader.
saCfgReloaderWG.Add(1)
go func() {
defer saCfgReloaderWG.Done()
for {
select {
case <-sighupCh:
case <-saCfgReloaderStopCh:
return
}
reloadStreamAggrConfig()
}
}()
}
func reloadStreamAggrConfig() {
logger.Infof("reloading -streamAggr.config=%q", *streamAggrConfig)
saCfgReloads.Inc()
sasNew, err := streamaggr.LoadFromFile(*streamAggrConfig, pushAggregateSeries, *streamAggrDedupInterval)
if err != nil {
saCfgSuccess.Set(0)
saCfgReloadErr.Inc()
logger.Errorf("cannot reload -streamAggr.config=%q: use the previously loaded config; error: %s", *streamAggrConfig, err)
return
}
sas := sasGlobal.Load()
if !sasNew.Equal(sas) {
sasOld := sasGlobal.Swap(sasNew)
sasOld.MustStop()
logger.Infof("successfully reloaded stream aggregation config at -streamAggr.config=%q", *streamAggrConfig)
} else {
logger.Infof("nothing changed in -streamAggr.config=%q", *streamAggrConfig)
sasNew.MustStop()
}
saCfgSuccess.Set(1)
saCfgTimestamp.Set(fasttime.UnixTimestamp())
}
// MustStopStreamAggr stops stream aggregators.
func MustStopStreamAggr() {
sa.MustStop()
sa = nil
}
close(saCfgReloaderStopCh)
saCfgReloaderWG.Wait()
var sa *streamaggr.Aggregators
sas := sasGlobal.Swap(nil)
sas.MustStop()
}
type streamAggrCtx struct {
mn storage.MetricName
@@ -64,6 +138,7 @@ func (ctx *streamAggrCtx) push(mrs []storage.MetricRow) {
ts := &tss[0]
labels := ts.Labels
samples := ts.Samples
sas := sasGlobal.Load()
for _, mr := range mrs {
if err := mn.UnmarshalRaw(mr.MetricNameRaw); err != nil {
logger.Panicf("BUG: cannot unmarshal recently marshaled MetricName: %s", err)
@@ -88,7 +163,7 @@ func (ctx *streamAggrCtx) push(mrs []storage.MetricRow) {
ts.Labels = labels
ts.Samples = samples
sa.Push(tss)
sas.Push(tss)
}
}
@@ -99,6 +174,7 @@ func pushAggregateSeries(tss []prompbmarshal.TimeSeries) {
ctx.skipStreamAggr = true
for _, ts := range tss {
labels := ts.Labels
ctx.Labels = ctx.Labels[:0]
for _, label := range labels {
name := label.Name
if name == "__name__" {

View File

@@ -21,7 +21,7 @@ import (
var (
measurementFieldSeparator = flag.String("influxMeasurementFieldSeparator", "_", "Separator for '{measurement}{separator}{field_name}' metric name when inserted via InfluxDB line protocol")
skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if InfluxDB line contains only a single field")
skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if InfluxDB line contains only a single field")
skipMeasurement = flag.Bool("influxSkipMeasurement", false, "Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'")
dbLabel = flag.String("influxDBLabel", "db", "Default label for the DB name sent over '?db={db_name}' query parameter")
)

View File

@@ -71,6 +71,12 @@ var (
var pcsGlobal atomic.Value
// CheckRelabelConfig checks config pointed by -relabelConfig
func CheckRelabelConfig() error {
_, err := loadRelabelConfig()
return err
}
func loadRelabelConfig() (*promrelabel.ParsedConfigs, error) {
if len(*relabelConfig) == 0 {
return nil, nil

View File

@@ -39,6 +39,9 @@ vmrestore-freebsd-amd64-prod:
vmrestore-openbsd-amd64-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker-openbsd-amd64
vmrestore-windows-amd64-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker-windows-amd64
package-vmrestore:
APP_NAME=vmrestore $(MAKE) package-via-docker
@@ -93,5 +96,8 @@ vmrestore-freebsd-amd64:
vmrestore-openbsd-amd64:
APP_NAME=vmrestore CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmrestore-windows-amd64:
GOARCH=amd64 APP_NAME=vmrestore $(MAKE) app-local-windows-goarch
vmrestore-pure:
APP_NAME=vmrestore $(MAKE) app-local-pure

View File

@@ -94,7 +94,7 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
-customS3Endpoint string
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
-envflag.prefix string
@@ -104,11 +104,11 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
-flagsAuthKey string
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
@@ -157,7 +157,7 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
-pushmetrics.interval duration
Interval for pushing metrics to -pushmetrics.url (default 10s)
-pushmetrics.url array
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default metrics exposed at /metrics page aren't pushed to any remote storage
Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage
Supports an array of values separated by comma or specified via multiple flags.
-s3ForcePathStyle
Prefixing endpoint with bucket name when set false, true by default. (default true)

View File

@@ -0,0 +1,259 @@
package graphite
import (
"fmt"
"math"
"strings"
"sync"
"github.com/valyala/histogram"
)
var aggrFuncs = map[string]aggrFunc{
"average": aggrAvg,
"avg": aggrAvg,
"avg_zero": aggrAvgZero,
"median": aggrMedian,
"sum": aggrSum,
"total": aggrSum,
"min": aggrMin,
"max": aggrMax,
"diff": aggrDiff,
"pow": aggrPow,
"stddev": aggrStddev,
"count": aggrCount,
"range": aggrRange,
"rangeOf": aggrRange,
"multiply": aggrMultiply,
"first": aggrFirst,
"last": aggrLast,
"current": aggrLast,
}
func getAggrFunc(funcName string) (aggrFunc, error) {
s := strings.TrimSuffix(funcName, "Series")
aggrFunc := aggrFuncs[s]
if aggrFunc == nil {
return nil, fmt.Errorf("unsupported aggregate function %q", funcName)
}
return aggrFunc, nil
}
type aggrFunc func(values []float64) float64
func (af aggrFunc) apply(xFilesFactor float64, values []float64) float64 {
if aggrCount(values) >= float64(len(values))*xFilesFactor {
return af(values)
}
return nan
}
func aggrAvg(values []float64) float64 {
pos := getFirstNonNaNPos(values)
if pos < 0 {
return nan
}
sum := values[pos]
count := 1
for _, v := range values[pos+1:] {
if !math.IsNaN(v) {
sum += v
count++
}
}
return sum / float64(count)
}
func aggrAvgZero(values []float64) float64 {
if len(values) == 0 {
return nan
}
sum := float64(0)
for _, v := range values {
if !math.IsNaN(v) {
sum += v
}
}
return sum / float64(len(values))
}
var aggrMedian = newAggrFuncPercentile(50)
func aggrSum(values []float64) float64 {
pos := getFirstNonNaNPos(values)
if pos < 0 {
return nan
}
sum := values[pos]
for _, v := range values[pos+1:] {
if !math.IsNaN(v) {
sum += v
}
}
return sum
}
func aggrMin(values []float64) float64 {
pos := getFirstNonNaNPos(values)
if pos < 0 {
return nan
}
min := values[pos]
for _, v := range values[pos+1:] {
if !math.IsNaN(v) && v < min {
min = v
}
}
return min
}
func aggrMax(values []float64) float64 {
pos := getFirstNonNaNPos(values)
if pos < 0 {
return nan
}
max := values[pos]
for _, v := range values[pos+1:] {
if !math.IsNaN(v) && v > max {
max = v
}
}
return max
}
func aggrDiff(values []float64) float64 {
pos := getFirstNonNaNPos(values)
if pos < 0 {
return nan
}
sum := float64(0)
for _, v := range values[pos+1:] {
if !math.IsNaN(v) {
sum += v
}
}
return values[pos] - sum
}
func aggrPow(values []float64) float64 {
pos := getFirstNonNaNPos(values)
if pos < 0 {
return nan
}
pow := values[pos]
for _, v := range values[pos+1:] {
if !math.IsNaN(v) {
pow = math.Pow(pow, v)
}
}
return pow
}
func aggrStddev(values []float64) float64 {
avg := aggrAvg(values)
if math.IsNaN(avg) {
return nan
}
sum := float64(0)
count := 0
for _, v := range values {
if !math.IsNaN(v) {
d := avg - v
sum += d * d
count++
}
}
return math.Sqrt(sum / float64(count))
}
func aggrCount(values []float64) float64 {
count := 0
for _, v := range values {
if !math.IsNaN(v) {
count++
}
}
return float64(count)
}
func aggrRange(values []float64) float64 {
min := aggrMin(values)
if math.IsNaN(min) {
return nan
}
max := aggrMax(values)
return max - min
}
func aggrMultiply(values []float64) float64 {
pos := getFirstNonNaNPos(values)
if pos < 0 {
return nan
}
p := values[pos]
for _, v := range values[pos+1:] {
if !math.IsNaN(v) {
p *= v
}
}
return p
}
func aggrFirst(values []float64) float64 {
pos := getFirstNonNaNPos(values)
if pos < 0 {
return nan
}
return values[pos]
}
func aggrLast(values []float64) float64 {
for i := len(values) - 1; i >= 0; i-- {
v := values[i]
if !math.IsNaN(v) {
return v
}
}
return nan
}
func getFirstNonNaNPos(values []float64) int {
for i, v := range values {
if !math.IsNaN(v) {
return i
}
}
return -1
}
var nan = math.NaN()
func newAggrFuncPercentile(n float64) aggrFunc {
f := func(values []float64) float64 {
h := getHistogram()
for _, v := range values {
if !math.IsNaN(v) {
h.Update(v)
}
}
p := h.Quantile(n / 100)
putHistogram(h)
return p
}
return f
}
func getHistogram() *histogram.Fast {
return histogramPool.Get().(*histogram.Fast)
}
func putHistogram(h *histogram.Fast) {
h.Reset()
histogramPool.Put(h)
}
var histogramPool = &sync.Pool{
New: func() interface{} {
return histogram.NewFast()
},
}

Some files were not shown because too many files have changed in this diff Show More