Compare commits

...

94 Commits

Author SHA1 Message Date
Aliaksandr Valialkin
8da3f773ae lib/promscrape: add -promscrape.disableKeepAlive command-line flag for disabling http keep-alive connections when scraping targets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/580
2020-07-01 02:20:20 +03:00
BigFish
9d5f5b6878 fix: spelling mistakes (#594)
Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-07-01 01:35:26 +03:00
Aliaksandr Valialkin
9a2ba5b6d1 vendor: make vendor-update 2020-07-01 01:04:58 +03:00
Aliaksandr Valialkin
b277ba8121 lib/httpserver: add Unwrap method to ErrorWithStatusCode, so As and Is functions in standard errors package may properly unwrap the error inside ErrorWithStatusCode 2020-07-01 00:54:01 +03:00
Aliaksandr Valialkin
84a37098ed app/vmstorage: add -denyQueriesOutsideRetention command-line flag for denying queries outside the configured retention
VictoriaMetrics returns `503 Service Unavailable` http error for requests with time ranges outside the configured retention
if `-denyQueriesOutsideRetention` command-line flag is set.
2020-07-01 00:21:44 +03:00
Aliaksandr Valialkin
56ccfa5218 all: use errors.As instead of type assertion for detecting net.Error 2020-07-01 00:15:34 +03:00
Aliaksandr Valialkin
7c2c8b2981 all: use errors.As for inspecting errors that implement httpserver.ErrorWithStatusCode 2020-07-01 00:04:34 +03:00
Aliaksandr Valialkin
d5dddb0953 all: use %w instead of %s for wrapping errors in fmt.Errorf
This will simplify examining the returned errors such as httpserver.ErrorWithStatusCode .
See https://blog.golang.org/go1.13-errors for details.
2020-06-30 23:05:11 +03:00
Aliaksandr Valialkin
586c5be404 lib/promscrape: add missing label sorting for autogenerated metrics
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/592
2020-06-29 22:36:12 +03:00
Ween
1cd01b5359 Fix Auto metrics relabeled errors (#593)
* Fix Auto metrics relabeled errors

* Finalize auto-genenated  Labels

* Fix Test Errors

Co-authored-by: xinyulong <xinyulong@kuaishou.com>
2020-06-29 22:29:29 +03:00
Roman Khavronenko
88538df267 app/vmalert: support multiple notifier urls (#584) (#590)
* app/vmalert: support multiple notifier urls (#584)

User now can set multiple notifier URLs in the same fashion
as for other vmutils (e.g. vmagent). The same is correct for
TLS setting for every configured URL. Alerts sending is done
in sequential way for respecting the specified URLs order.

* app/vmalert: add basicAuth support for notifier client (#585)

The change adds possibility to set basicAuth creds for notifier
client in the same fasion as for remote write/read and datasource.
2020-06-29 22:21:03 +03:00
Aliaksandr Valialkin
63e5ee0d29 docs: sync with upstream 2020-06-29 22:09:03 +03:00
Roman Khavronenko
eba4e92994 deployment/docker: replace Prometheus with vmagent (#589)
vmagent replaces Prometheus to perform scrapes and writes
into VictoriaMetrics installation. Prometheus datasource was
dropped, but its config was reused to feed vmagent.

Change also contains simplification in dashboard propagation
to Grafana container by removing excessive json manipulation
steps.
2020-06-29 22:05:34 +03:00
Roman Khavronenko
82ecfa3b32 app/vmalert: move flags description and initialization into subpackages
The change adds no new functionality and aims to move flags definitions
to subpackages that are using them. This should improve readability
of the main function.
2020-06-28 12:26:22 +01:00
kreedom
dc4e3f0e0b app/vmalert: properly set transport for HTTP clients
Fixes issue #586
2020-06-27 08:31:54 +01:00
Aliaksandr Valialkin
8f2e88234f docs: update the info that docker images are built on top of alpine image now
A follow-up after the commit ff624c9125
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/522
2020-06-26 13:54:10 +03:00
Aliaksandr Valialkin
423825695f vendor: make vendor-update 2020-06-25 23:45:14 +03:00
Aliaksandr Valialkin
5dc0bf6d3d vendor: update github.com/valyala/fastjson from v1.5.1 to v1.5.2 2020-06-25 23:35:03 +03:00
Aliaksandr Valialkin
7eb171182b lib/promrelabel: properly apply ^ and $ anchors to regex value in Prometheus relabeling rules 2020-06-25 17:19:19 +03:00
Aliaksandr Valialkin
05d754d7bb app/vmselect/netstorage: reset big result values every 10 seconds instead of after processing every time series
This should reduce GC pressure when processing time series with big number of rows
2020-06-24 19:38:39 +03:00
Aliaksandr Valialkin
8dec17470d deployment/docker/docker-compose.yml: update Prometheus from v1.18.1 to v1.19.1 and Grafana from v7.0.2 to v7.0.3 2020-06-24 18:09:33 +03:00
Aliaksandr Valialkin
5e35b87c3d docs/Cluster-VictoriaMetrics.md: move VictoriaMetrics logo below "Cluster version" heading, since it is heeded for proper navigation at https://victoriametrics.github.io 2020-06-24 12:06:27 +03:00
Aliaksandr Valialkin
c85d926569 docs/SampleSizeCalculations.md: updates 2020-06-24 12:06:25 +03:00
Aliaksandr Valialkin
f0cef4761b docs/SampleSizeCalculations.md: add a doc with calculations for the "Lowest sample size" graph at https://victoriametrics.com/ 2020-06-24 12:00:22 +03:00
nicbaz
774f7ca1c1 vmselect: fix label_replace when mismatch (#579)
As per documentation on `label_replace` function: "If the regular
expression doesn't match then the timeseries is returned unchanged".

Currently this behavior is not enforced, if a regexp on an existing
tag doesn't match then the tag value is copied as-is in the destination
tag. This fix first checks that the regular expression matches the
source tag before applying anything.

Given the current implementation, this fix also changes the behavior
of the **MetricsQL** `label_transform` function which does not
document this behavior at the moment.
2020-06-23 23:50:33 +03:00
Aliaksandr Valialkin
a560b4788e lib/fs: go fmt 2020-06-23 23:02:39 +03:00
Aliaksandr Valialkin
8141541e61 lib/fs: fall back to cgo copy for copying the last 4KB of mmaped data
This probably should fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/581
2020-06-23 22:55:22 +03:00
Aliaksandr Valialkin
e65b4cb6b1 docs/vmalert.md: sync with app/vmalert/README.md 2020-06-23 22:49:38 +03:00
Aliaksandr Valialkin
7209d58fbd app/vmselect/netstorage: increase concurrency when processing small number of time series with big number of data points per each time series
Previously VictoriaMetrics was processing up to 32 time series in a single goroutine.
This could be slow if each time series contains big number of data points (10M+ or more), since only a single CPU core could be loaded with work,
while other CPU cores were idle. Fix this by launching GOMAXPROCS workers for time series processing.

This should help with https://github.com/VictoriaMetrics/VictoriaMetrics/issues/572
2020-06-23 22:46:15 +03:00
nicbaz
72c90bfd8b vmalert: add support for TLS configuration (#578)
app/vmalert: add support for TLS configuration

Add support for TLS optional configuration in a similar fashion to what
is currently supported in other vmutils such as vmagent. TLS
configuration options are distinct for datasource, remoteRead,
remoteWrite as well as notifier.
2020-06-23 20:45:45 +01:00
Aliaksandr Valialkin
2a39ba639d lib/promrelabel: add support for keep_if_equal and drop_if_equal actions to relabel configs
These actions may be useful for filtering out unneeded targets and/or metrics if they contain equal label values.
For example, the following rule would leave the target only if __meta_kubernetes_annotation_prometheus_io_port
equals __meta_kubernetes_pod_container_port_number:

  - action: keep_if_equal
    source_labels: [__meta_kubernetes_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
2020-06-23 17:29:03 +03:00
Aliaksandr Valialkin
8f0bcec6cc lib/promscrape: preserve the previously discovered targets on discovery errors per each job_name
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/582
2020-06-23 15:40:40 +03:00
Aliaksandr Valialkin
a13cd60c6f vendor: update github.com/klauspost/compress from v1.10.9 to v1.10.10 2020-06-23 13:48:51 +03:00
Aliaksandr Valialkin
c970cb912c lib/fs: an attempt to fix SIGBUS error by rounding mmap`ed region to multiple of 4KB pages
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/581
2020-06-23 13:39:49 +03:00
Aliaksandr Valialkin
b5206ce33f lib/logger: add -loggerErrorsPerSecondLimit for limiting the rate of ERROR messages 2020-06-23 12:41:36 +03:00
Aliaksandr Valialkin
4c7f216dfe lib/promscrape: retry performing the request to the server for up to 3 times before giving up when it closes keep-alive connections
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/580
2020-06-23 12:33:54 +03:00
Aliaksandr Valialkin
530f7a21e8 docs/Single-server-VictoriaMetrics.md: remove -httpListenAddr command-line flag from setting up VictoriaMetrics chapter
This flag is optional and it has good default value - `:8428`, so there is no need in mentioning it at this chapter
2020-06-22 12:45:20 +03:00
Aliaksandr Valialkin
7532dbcdf5 app/vmselect/promql: properly override label values from group_left and group_right lists like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/577
2020-06-21 16:33:01 +03:00
kreedom
7ec6711f06 Support of custom URL path for alert (#560)
app/vmalert: Support custom URL for alerts source

Add flag `external.alert.source` for configuring custom URL
for alert's source. This may be handy to re-point default source
URL to other systems like Grafana.
Updates #517
2020-06-21 11:32:46 +01:00
Aliaksandr Valialkin
e149019c00 lib/promscrape/discovery/consul: reduce load on Consul when discovering big number of targets by using background caching
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574
2020-06-20 18:20:01 +03:00
Aliaksandr Valialkin
7bf2cbad32 lib/promscrape: reduce default value for -promscrape.discovery.concurrency from 500 to 100
This should reduce load on Kubernetes API server and Consul when big number of targets are discovered

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574
2020-06-20 17:53:42 +03:00
Aliaksandr Valialkin
6ff821c70d lib/promscrape/discovery/ec2: expose __meta_ec2_ami like the next Prometheus release will do
See b5d61fb66c for details
2020-06-20 17:45:23 +03:00
Aliaksandr Valialkin
a43be95e83 docs/Cluster-VictoriaMetrics.md: add high availability chapter 2020-06-20 15:53:38 +03:00
Aliaksandr Valialkin
f689164711 docs/Single-server-VictoriaMetrics.md: mention that vmauth could be used for routing user requests to particular VictoriaMetrics instances 2020-06-19 16:16:47 +03:00
Aliaksandr Valialkin
7976ec8bb1 docs/Single-server-VictoriaMetrics.md: add a link to features available for enterprise customers 2020-06-19 13:17:01 +03:00
Aliaksandr Valialkin
9f3e3a4d7a docs/vmauth.md: mention that we can provide custom integration with SAML 2020-06-19 13:14:07 +03:00
Aliaksandr Valialkin
57acbf5491 vendor: make vendor-update 2020-06-19 02:39:53 +03:00
Aliaksandr Valialkin
5820c0ffb7 lib/promrelabel: allows regex capture groups in target_label like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/569
2020-06-19 02:21:24 +03:00
Tristan Su
ac3700ed1e lib/storage: set big/small merge concurrency (#568)
fixed #567

Co-authored-by: Tristan Su <suqing.sq@alibaba-inc.com>
2020-06-19 01:25:48 +03:00
Aliaksandr Valialkin
b542e50680 app/vminsert: export metrics for determining ingested rows with dropped or truncated labels
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/565
2020-06-19 01:10:37 +03:00
Aliaksandr Valialkin
818abca8f1 make docs-sync 2020-06-18 23:55:28 +03:00
Aliaksandr Valialkin
50c0d8c17d docs/Articles.md: add a link to article https://stas.starikevich.com/posts/raspberry-pi-4-prometheus/ 2020-06-18 23:14:04 +03:00
Aliaksandr Valialkin
88e1b7d144 app/vmselect/promql: fill gaps on right side with values from left side of or operator in the same way as Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/552
2020-06-18 23:05:16 +03:00
Aliaksandr Valialkin
08495360b0 lib/storage: add key!=".+" filter additionally to negative filter matching empty value such as key!~"|foo"
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/546
2020-06-18 20:03:48 +03:00
Aliaksandr Valialkin
a12364ad37 docs/vmbackup.md: mention that backups from single-node and cluster versions are incompatible 2020-06-18 18:52:43 +03:00
Roman Khavronenko
e91d758831 vmalert-537: allow name duplication for rules within one group. (#559)
Uniqueness of rule is now defined by combination of its name, expression and
labels. The hash of the combination is now used as rule ID and identifies rule within the group.

Set of rules from coreos/kube-prometheus was added for testing purposes to
verify compatibility. The check also showed that `vmalert` doesn't support
`query` template function that was mentioned as limitation in README.
2020-06-15 20:15:47 +01:00
Roman Khavronenko
3d63a79b91 dashboard: update single-version dashboard. (#557)
The change contains two updates:
* fix Datapoints panel query #551;
* add datasource selector.
2020-06-12 22:05:05 +03:00
Aliaksandr Valialkin
e53ee763f9 docs/vmalert.md: sync with app/vmalert/README.md 2020-06-10 19:38:24 +03:00
Aliaksandr Valialkin
ae1cc0fc4b lib/storage: properly match {tag!="|foo"} filters
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/546
2020-06-10 19:35:56 +03:00
Clémence Saussez
e426434770 app/vmalert: fix link to testdata (#547)
Fix broken link to vmalert test data
Signed-off-by: Clemence Saussez <clemence@zen.ly>
2020-06-10 17:26:27 +01:00
Roman Khavronenko
3e277020a5 vmalert-491: allow to configure concurrent rules execution per group. (#542)
The feature allows to speed up group rules execution by
executing them concurrently.

Change also contains README changes to reflect configuration
details.
2020-06-09 15:21:20 +03:00
Roman Khavronenko
ffa75c423d vmalert-521: allow to disable rules expression validation. (#536)
This feature may be useful for using `vmalert` with PromQL
compatible datasources like Loki.
2020-06-06 21:27:09 +01:00
Aliaksandr Valialkin
0bba630f55 vendor: make vendor-update 2020-06-06 00:01:19 +03:00
Aliaksandr Valialkin
2382053d32 vendor: update github.com/klauspost/compress from v1.10.7 to v1.10.8 2020-06-05 23:51:35 +03:00
Aliaksandr Valialkin
69a647b0d2 lib/httpserver: do not flush and do not close gzip writer if response compression is disabled
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
2020-06-05 21:37:28 +03:00
Aliaksandr Valialkin
f5dd2a71a6 app/vmauth: disable automatic response compression/uncompression, since it may work improperly in some cases
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
2020-06-05 20:13:56 +03:00
Aliaksandr Valialkin
4b98e436ef app/vmauth: emit fatal errors instead of panics when incorrect command-line flags are set 2020-06-05 20:13:55 +03:00
Aliaksandr Valialkin
4e8d6b80e0 lib/backup: properly create missing parent directories in fs.CreateFile 2020-06-05 19:28:19 +03:00
Aliaksandr Valialkin
d120197676 lib/fs: optimize queries that read recent samples for big number of time series
Use standard copy() func instead of mmap-aware copy func for reading recently touched mmap-ed data.
This improves read performance by up to 4x.
2020-06-05 19:10:04 +03:00
Aliaksandr Valialkin
4cb3af1a36 lib/fs: add a benchmark for ReaderAt.MustReadAt 2020-06-05 19:10:03 +03:00
Aliaksandr Valialkin
0d92abfbf6 app/vmalert: print brief usage info for vmalert -help 2020-06-05 10:43:18 +03:00
Aliaksandr Valialkin
ff1a725a56 app/vmauth: print brief usage info for vmauth -help 2020-06-05 10:40:00 +03:00
Aliaksandr Valialkin
05ae1472e3 app/vmagent: print brief usage info for vmagent -help 2020-06-05 10:39:59 +03:00
Aliaksandr Valialkin
4fd3f6f991 lib/backup/fsremote: create all the parent directories before creating file in CreateFile 2020-06-05 10:25:10 +03:00
Aliaksandr Valialkin
6281549f31 docs/Cluster-VictoriaMetrics.md: remove obsolete line 2020-06-04 20:21:37 +03:00
Aliaksandr Valialkin
9f4e86ac2f docs/Cluster-VictoriaMetrics.md: update stale info about replication 2020-06-04 20:21:36 +03:00
Aliaksandr Valialkin
af49c5bdf6 deployment/docker: update Go builder from v1.14.3 to v1.14.4
This fixes the following issue in Go runtime, which could result in program hang - https://github.com/golang/go/issues/38931
2020-06-04 18:07:04 +03:00
Aliaksandr Valialkin
a47a05dfd2 docs/Cluster-VictoriaMetrics.md: clarify simultaneous usage of replication and deduplication 2020-06-04 18:01:32 +03:00
Aliaksandr Valialkin
3d4008263f lib/fs: optimize MustGetFreeSpace performance by caching the results for up to 2 seconds 2020-06-04 13:15:47 +03:00
Vyacheslav Mitrofanov
72ff05255f allow to use values lower than 10 with the flag -memory.allowedPercent (#531)
Co-authored-by: Vyacheslav Mitrofanov <vmitrofanov@mfms.ru>
2020-06-03 23:39:29 +03:00
Denis
a99d606220 Update docker-compose.yml (#530)
Update to latest version of Prometheus & Grafana.
2020-06-03 23:37:23 +03:00
Aliaksandr Valialkin
f8692a1d43 app/vmauth: log when -auth.config is reloaded in SIGHUP 2020-06-03 23:22:14 +03:00
Aliaksandr Valialkin
78b28a03b6 docs/Single-server-VictoriaMetrics.md: clarify Replication section 2020-06-03 21:32:42 +03:00
Aliaksandr Valialkin
854f40acf2 docs/FAQ.md: add a question about the difference between vmagent and Prometheus 2020-06-03 20:56:22 +03:00
Aliaksandr Valialkin
6d059b28bf docs/Cluster-VictoriaMetrics.md: update Replication and data safety chapter 2020-06-03 20:24:58 +03:00
Aliaksandr Valialkin
f26b94cfb6 docs/Single-server-VictoriaMetrics.md: mention vmagent in high availability section 2020-06-03 20:16:02 +03:00
Aliaksandr Valialkin
937338abdf lib/bytesutil: prevent from garbage collecting s before returning from ToUnsafeBytes 2020-06-03 00:23:40 +03:00
Aliaksandr Valialkin
f2b04f2efe vendor: update github.com/klauspost/compress from v1.10.6 to v1.10.7 2020-06-03 00:10:44 +03:00
Aliaksandr Valialkin
ff624c9125 deployment/docker: use alpine base image for docker images in order to improve debuggability
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/522
2020-06-02 22:43:19 +03:00
Aliaksandr Valialkin
13b1358c07 docs: update FAQ.md 2020-06-02 19:58:07 +03:00
Aliaksandr Valialkin
f7ff809c1e docs/vmalert.md: sync with app/vmalert/README.md via make docs-sync 2020-06-02 19:15:15 +03:00
Aliaksandr Valialkin
ab6e994bab Makefile: add make docs-sync command for syncing docs contents 2020-06-02 19:15:14 +03:00
Aliaksandr Valialkin
d2f30e8d79 app/vmalert: fix comment for UpdateWith exported methods 2020-06-01 14:35:32 +03:00
Roman Khavronenko
270552fde4 vmalert: Add recording rules support. (#519)
* vmalert: Add recording rules support.

Recording rules support required additional service refactoring since
it wasn't planned to support them from the very beginning. The list
of changes is following:
* new entity RecordingRule was added for writing results of MetricsQL
expressions into remote storage;
* interface Rule now unites both recording and alerting rules;
* configuration parser was moved to separate package and now performs
more strict validation;
* new endpoint for listing all groups and rules in json format was added;
* evaluation interval may be set to every particular group;

* vmalert: uncomment tests

* vmalert: rm outdated TODO

* vmalert: fix typos in README
2020-06-01 13:46:37 +03:00
592 changed files with 24980 additions and 50686 deletions

View File

@@ -145,3 +145,11 @@ golangci-lint: install-golangci-lint
install-golangci-lint:
which golangci-lint || GO111MODULE=off go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
docs-sync:
cp app/vmagent/README.md docs/vmagent.md
cp app/vmalert/README.md docs/vmalert.md
cp app/vmauth/README.md docs/vmauth.md
cp app/vmbackup/README.md docs/vmbackup.md
cp app/vmrestore/README.md docs/vmrestore.md
cp README.md docs/Single-server-VictoriaMetrics.md

View File

@@ -21,6 +21,7 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
See our [Wiki](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki) for additional documentation.
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
See [features available for enterprise customers](https://github.com/VictoriaMetrics/VictoriaMetrics/issues?q=is%3Aissue+label%3Aenterprise).
## Case studies and talks
@@ -149,9 +150,9 @@ The following command-line flags are used the most:
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in current working directory.
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. Default period is 1 month.
* `-httpListenAddr` - TCP address to listen to for http requests. By default, it listens port `8428` on all the network interfaces.
Other flags have good enough default values, so set them only if you really need this.
VictoriaMetrics accepts [Prometheus querying API requests](#prometheus-querying-api-usage) on port `8428` by default.
Pass `-help` to see all the available flags with description and default values.
@@ -582,17 +583,18 @@ Run `make package-victoria-metrics`. It builds `victoriametrics/victoria-metrics
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-victoria-metrics`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image for improved debuggability.
It is possible to build the package on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-victoria-metrics
ROOT_IMAGE=scratch make package-victoria-metrics
```
### Start with docker-compose
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
helps to spin up VictoriaMetrics, Prometheus and Grafana with one command.
helps to spin up VictoriaMetrics, [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Grafana with one command.
More details may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#folder-contains-basic-images-and-tools-for-building-and-running-victoria-metrics-in-docker).
### Setting up service
@@ -774,7 +776,13 @@ The required resources for query path:
### High availability
1) Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
2) Add addresses of these instances to `remote_write` section in Prometheus config:
2) Pass addresses of these instances to [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) via `-remoteWrite.url` command-line flag:
```bash
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
```
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
```yml
remote_write:
@@ -834,6 +842,11 @@ Just start multiple VictoriaMetrics instances with distinct values for the follo
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
Then set up [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md) in front of VictoriaMetrics instances,
so it could route requests from particular user to VictoriaMetrics with the desired retention.
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
### Downsampling
There is no downsampling support at the moment, but:
@@ -981,6 +994,10 @@ The most interesting metrics are:
VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` args on this page. By default `date` equals to the current date,
while `topN` equals to 10.
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
### Backfilling
@@ -1000,15 +1017,12 @@ for data with timestamps close to the current time.
### Replication
Single-node VictoriaMetrics relies on replicated durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs)
or [Amazon EBS](https://aws.amazon.com/ebs/). It is also recommended making periodic backups,
since [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
See [backup docs](#backups) for details.
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
Cluster version of VictoriaMetrics supports replication. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety)
for details.
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
See also [high availability docs](#high-availability) and [docs about cluster version of VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
See also [high availability docs](#high-availability) and [backup docs](#backups).
### Backups

View File

@@ -170,6 +170,8 @@ Additionally it provides the following extra actions:
* `replace_all`: replaces all the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
* `labelmap_all`: replaces all the occurences of `regex` in all the label names with the `replacement`.
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
The relabeling can be defined in the following places:
@@ -210,6 +212,14 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports.
Just add the following relabeling rule to `relabel_configs` section in order to filter out targets with unneeded ports:
```yml
- action: keep_if_equal
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
```
### How to build from sources
@@ -234,11 +244,11 @@ Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmagent
ROOT_IMAGE=scratch make package-vmagent
```

View File

@@ -54,6 +54,7 @@ var (
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
buildinfo.Init()
logger.Init()
@@ -205,3 +206,15 @@ var (
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
)
func usage() {
const s = `
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
}

View File

@@ -162,7 +162,7 @@ func getTLSConfig(argIdx int) (*tls.Config, error) {
}
cfg, err := promauth.NewConfig(".", nil, "", "", tlsConfig)
if err != nil {
return nil, fmt.Errorf("cannot populate TLS config: %s", err)
return nil, fmt.Errorf("cannot populate TLS config: %w", err)
}
tlsCfg := cfg.NewTLSConfig()
return tlsCfg, nil

View File

@@ -33,7 +33,7 @@ func loadRelabelConfigs() (*relabelConfigs, error) {
if *relabelConfigPathGlobal != "" {
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
if err != nil {
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", *relabelConfigPathGlobal, err)
}
rcs.global = global
}
@@ -45,7 +45,7 @@ func loadRelabelConfigs() (*relabelConfigs, error) {
for i, path := range *relabelConfigPaths {
prc, err := promrelabel.LoadRelabelConfigs(path)
if err != nil {
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", path, err)
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
}
rcs.perURL[i] = prc
}

View File

@@ -55,11 +55,13 @@ test-vmalert:
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
go test -v -race -cover ./app/vmalert/datasource
go test -v -race -cover ./app/vmalert/notifier
go test -v -race -cover ./app/vmalert/config
run-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/testdata/rules0-good.rules \
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
-datasource.url=http://localhost:8428 \
-notifier.url=http://localhost:9093 \
-notifier.url=http://127.0.0.1:9093 \
-remoteWrite.url=http://localhost:8428 \
-remoteRead.url=http://localhost:8428 \
-evaluationInterval=3s

View File

@@ -1,19 +1,27 @@
## VM Alert
## vmalert
`vmalert` executes a list of given MetricsQL expressions (rules) and
sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
rules against configured address.
### Features:
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
expressions validation;
support and expressions validation;
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
support;
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
* Lightweight without extra dependencies.
### TODO:
* Support recording rules.
### Limitations:
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
may fail;
* by default, rules execution is sequential within one group, but persisting of execution results to remote
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
recording rule is reused in next one;
* there is no `query` function support in templates yet;
* `vmalert` has no UI, just an API for getting groups and rules statuses.
### QuickStart
@@ -26,10 +34,12 @@ make vmalert
The build binary will be placed to `VictoriaMetrics/bin` folder.
To start using `vmalert` you will need the following things:
* list of alert rules - PromQL/MetricsQL expressions to execute;
* list of rules - PromQL/MetricsQL expressions to execute;
* datasource address - reachable VictoriaMetrics instance for rules execution;
* notifier address - reachable Alertmanager instance for processing,
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
aggregating alerts and sending notifications.
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
compatible storage address for storing recording rules results and alerts state in for of timeseries. This is optional.
Then configure `vmalert` accordingly:
```
@@ -38,22 +48,106 @@ Then configure `vmalert` accordingly:
-notifier.url=http://localhost:9093
```
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata/rules0-good.rules)
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
similar to Prometheus rules and configured using YAML. Configuration examples may be found
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
```yaml
groups:
[ - <rule_group> ]
```
`vmalert` runs evaluation for every group in a separate goroutine.
Rules in group evaluated one-by-one sequentially.
#### Groups
`vmalert` also runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
Each group has following attributes:
```yaml
# The name of the group. Must be unique within a file.
name: <string>
# How often rules in the group are evaluated.
[ interval: <duration> | default = global.evaluation_interval ]
# How many rules execute at once. Increasing concurrency may speed
# up round execution speed.
[ concurrency: <integer> | default = 1 ]
rules:
[ - <rule> ... ]
```
#### Rules
There are two types of Rules:
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
Alerting rules allows to define alert conditions via [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
Recording rules allow you to precompute frequently needed or computationally expensive expressions
and save their result as a new set of time series.
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
within one group.
##### Alerting rules
The syntax for alerting rule is following:
```yaml
# The name of the alert. Must be a valid metric name.
alert: <string>
# The MetricsQL expression to evaluate.
expr: <string>
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
[ for: <duration> | default = 0s ]
# Labels to add or overwrite for each alert.
labels:
[ <labelname>: <tmpl_string> ]
# Annotations to add to each alert.
annotations:
[ <labelname>: <tmpl_string> ]
```
`vmalert` has no local storage and alerts state is stored in process memory. Hence, after reloading of `vmalert` process
alerts state will be lost. To avoid this situation, `vmalert` may be configured via following flags:
* `-remoteWrite.url` - URL to Victoria Metrics or VMInsert. `vmalert` will persist alerts state into the configured
address in form of timeseries with name `ALERTS` via remote-write protocol.
* `-remoteRead.url` - URL to Victoria Metrics or VMSelect. `vmalert` will try to restore alerts state from configured
address by querying `ALERTS` timeseries.
##### Recording rules
The syntax for recording rules is following:
```yaml
# The name of the time series to output to. Must be a valid metric name.
record: <string>
# The MetricsQL expression to evaluate.
expr: <string>
# Labels to add or overwrite before storing the result.
labels:
[ <labelname>: <labelvalue> ]
```
For recording rules to work `-remoteWrite.url` must specified.
#### WEB
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
`vmalert` may be configured with `-remoteWrite` flag to write alerts state in form of timeseries
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
may be used to recover alerts state on `vmalert` restarts if `-remoteRead` is configured.
### Configuration
@@ -64,22 +158,36 @@ Usage of vmalert:
Optional basic auth password for -datasource.url
-datasource.basicAuth.username string
Optional basic auth username for -datasource.url
-datasource.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used.
-datasource.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -datasource.url.
-datasource.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -datasource.url
-datasource.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -datasource.url.
-datasource.tlsServerName value
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used.
-datasource.url string
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
-evaluationInterval duration
How often to evaluate the rules. Default 1m (default 1m0s)
How often to evaluate the rules (default 1m0s)
-external.url string
External URL is used as alert's source for sent alerts to the notifier
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
Address to listen for http connections (default ":8880")
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-notifier.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used.
-notifier.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -notifier.url.
-notifier.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -notifier.url
-notifier.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -notifier.url.
-notifier.tlsServerName value
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used.
-notifier.url string
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
-remoteRead.basicAuth.password string
@@ -88,14 +196,40 @@ Usage of vmalert:
Optional basic auth username for -remoteRead.url
-remoteRead.lookback duration
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteRead.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used.
-remoteRead.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url.
-remoteRead.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteRead.url
-remoteRead.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url.
-remoteRead.tlsServerName value
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used.
-remoteRead.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remoteWrite.basicAuth.password string
Optional basic auth password for -remoteWrite.url
-remoteWrite.basicAuth.username string
Optional basic auth username for -remoteWrite.url
-remoteWrite.maxQueueSize
Defines the max number of pending datapoints to remote write endpoint
-remoteWrite.concurrency int
Defines number of readers that concurrently write into remote storage (default 1)
-remoteWrite.flushInterval duration
Defines interval of flushes to remote write endpoint (default 5s)
-remoteWrite.maxBatchSize int
Defines defines max number of timeseries to be flushed at once (default 1000)
-remoteWrite.maxQueueSize int
Defines the max number of pending datapoints to remote write endpoint (default 100000)
-remoteWrite.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used.
-remoteWrite.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url.
-remoteWrite.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteWrite.url
-remoteWrite.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url.
-remoteWrite.tlsServerName value
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used.
-remoteWrite.url string
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
-rule value
@@ -105,8 +239,10 @@ Usage of vmalert:
-rule /path/to/file. Path to a single file with alerting rules
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
-rule.validateExpressions
Whether to validate rules expressions via MetricsQL engine (default true)
-rule.validateTemplates
Indicates to validate annotation and label templates (default true)
Whether to validate annotation and label templates (default true)
```
Pass `-help` to `vmalert` in order to see the full list of supported

374
app/vmalert/alerting.go Normal file
View File

@@ -0,0 +1,374 @@
package main
import (
"context"
"fmt"
"hash/fnv"
"sort"
"strconv"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// AlertingRule is basic alert entity
type AlertingRule struct {
RuleID uint64
Name string
Expr string
For time.Duration
Labels map[string]string
Annotations map[string]string
GroupID uint64
// guard status fields
mu sync.RWMutex
// stores list of active alerts
alerts map[uint64]*notifier.Alert
// stores last moment of time Exec was called
lastExecTime time.Time
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
lastExecError error
}
func newAlertingRule(gID uint64, cfg config.Rule) *AlertingRule {
return &AlertingRule{
RuleID: cfg.ID,
Name: cfg.Alert,
Expr: cfg.Expr,
For: cfg.For,
Labels: cfg.Labels,
Annotations: cfg.Annotations,
GroupID: gID,
alerts: make(map[uint64]*notifier.Alert),
}
}
// String implements Stringer interface
func (ar *AlertingRule) String() string {
return ar.Name
}
// ID returns unique Rule ID
// within the parent Group.
func (ar *AlertingRule) ID() uint64 {
return ar.RuleID
}
// Exec executes AlertingRule expression via the given Querier.
// Based on the Querier results AlertingRule maintains notifier.Alerts
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
qMetrics, err := q.Query(ctx, ar.Expr)
ar.mu.Lock()
defer ar.mu.Unlock()
ar.lastExecError = err
ar.lastExecTime = time.Now()
if err != nil {
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
}
for h, a := range ar.alerts {
// cleanup inactive alerts from previous Exec
if a.State == notifier.StateInactive {
delete(ar.alerts, h)
}
}
updated := make(map[uint64]struct{})
// update list of active alerts
for _, m := range qMetrics {
h := hash(m)
updated[h] = struct{}{}
if a, ok := ar.alerts[h]; ok {
if a.Value != m.Value {
// update Value field with latest value
a.Value = m.Value
// and re-exec template since Value can be used
// in templates
err = ar.template(a)
if err != nil {
return nil, err
}
}
continue
}
a, err := ar.newAlert(m, ar.lastExecTime)
if err != nil {
ar.lastExecError = err
return nil, fmt.Errorf("failed to create alert: %w", err)
}
a.ID = h
a.State = notifier.StatePending
ar.alerts[h] = a
}
for h, a := range ar.alerts {
// if alert wasn't updated in this iteration
// means it is resolved already
if _, ok := updated[h]; !ok {
if a.State == notifier.StatePending {
// alert was in Pending state - it is not
// active anymore
delete(ar.alerts, h)
continue
}
a.State = notifier.StateInactive
continue
}
if a.State == notifier.StatePending && time.Since(a.Start) >= ar.For {
a.State = notifier.StateFiring
alertsFired.Inc()
}
}
if series {
return ar.toTimeSeries(ar.lastExecTime), nil
}
return nil, nil
}
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries
for _, a := range ar.alerts {
if a.State == notifier.StateInactive {
continue
}
ts := ar.alertToTimeSeries(a, timestamp)
tss = append(tss, ts...)
}
return tss
}
// UpdateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (ar *AlertingRule) UpdateWith(r Rule) error {
nr, ok := r.(*AlertingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
}
ar.Expr = nr.Expr
ar.For = nr.For
ar.Labels = nr.Labels
ar.Annotations = nr.Annotations
return nil
}
// TODO: consider hashing algorithm in VM
func hash(m datasource.Metric) uint64 {
hash := fnv.New64a()
labels := m.Labels
sort.Slice(labels, func(i, j int) bool {
return labels[i].Name < labels[j].Name
})
for _, l := range labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
continue
}
hash.Write([]byte(l.Name))
hash.Write([]byte(l.Value))
hash.Write([]byte("\xff"))
}
return hash.Sum64()
}
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifier.Alert, error) {
a := &notifier.Alert{
GroupID: ar.GroupID,
Name: ar.Name,
Labels: map[string]string{},
Value: m.Value,
Start: start,
Expr: ar.Expr,
}
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
continue
}
a.Labels[l.Name] = l.Value
}
return a, ar.template(a)
}
func (ar *AlertingRule) template(a *notifier.Alert) error {
// 1. template rule labels with data labels
rLabels, err := a.ExecTemplate(ar.Labels)
if err != nil {
return err
}
// 2. merge data labels and rule labels
// metric labels may be overridden by
// rule labels
for k, v := range rLabels {
a.Labels[k] = v
}
// 3. template merged labels
a.Labels, err = a.ExecTemplate(a.Labels)
if err != nil {
return err
}
a.Annotations, err = a.ExecTemplate(ar.Annotations)
return err
}
// AlertAPI generates APIAlert object from alert by its id(hash)
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
ar.mu.RLock()
defer ar.mu.RUnlock()
a, ok := ar.alerts[id]
if !ok {
return nil
}
return ar.newAlertAPI(*a)
}
// RuleAPI returns Rule representation in form
// of APIAlertingRule
func (ar *AlertingRule) RuleAPI() APIAlertingRule {
var lastErr string
if ar.lastExecError != nil {
lastErr = ar.lastExecError.Error()
}
return APIAlertingRule{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", ar.ID()),
GroupID: fmt.Sprintf("%d", ar.GroupID),
Name: ar.Name,
Expression: ar.Expr,
For: ar.For.String(),
LastError: lastErr,
LastExec: ar.lastExecTime,
Labels: ar.Labels,
Annotations: ar.Annotations,
}
}
// AlertsAPI generates list of APIAlert objects from existing alerts
func (ar *AlertingRule) AlertsAPI() []*APIAlert {
var alerts []*APIAlert
ar.mu.RLock()
for _, a := range ar.alerts {
alerts = append(alerts, ar.newAlertAPI(*a))
}
ar.mu.RUnlock()
return alerts
}
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
return &APIAlert{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
GroupID: fmt.Sprintf("%d", a.GroupID),
Name: a.Name,
Expression: ar.Expr,
Labels: a.Labels,
Annotations: a.Annotations,
State: a.State.String(),
ActiveAt: a.Start,
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
}
}
const (
// AlertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS"
// AlertForStateMetricName is the metric name for 'for' state of alert.
alertForStateMetricName = "ALERTS_FOR_STATE"
// AlertNameLabel is the label name indicating the name of an alert.
alertNameLabel = "alertname"
// AlertStateLabel is the label name indicating the state of an alert.
alertStateLabel = "alertstate"
)
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries
tss = append(tss, alertToTimeSeries(ar.Name, a, timestamp))
if ar.For > 0 {
tss = append(tss, alertForToTimeSeries(ar.Name, a, timestamp))
}
return tss
}
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
labels := make(map[string]string)
for k, v := range a.Labels {
labels[k] = v
}
labels["__name__"] = alertMetricName
labels[alertNameLabel] = name
labels[alertStateLabel] = a.State.String()
return newTimeSeries(1, labels, timestamp)
}
// alertForToTimeSeries returns a timeseries that represents
// state of active alerts, where value is time when alert become active
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
labels := make(map[string]string)
for k, v := range a.Labels {
labels[k] = v
}
labels["__name__"] = alertForStateMetricName
labels[alertNameLabel] = name
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
}
// Restore restores the state of active alerts basing on previously written timeseries.
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Exec, as well as Value field.
// Only rules with For > 0 will be restored.
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
if q == nil {
return fmt.Errorf("querier is nil")
}
// Get the last datapoint in range via MetricsQL `last_over_time`.
// We don't use plain PromQL since Prometheus doesn't support
// remote write protocol which is used for state persistence in vmalert.
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
alertForStateMetricName, ar.Name, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr)
if err != nil {
return err
}
for _, m := range qMetrics {
labels := m.Labels
m.Labels = make([]datasource.Label, 0)
// drop all extra labels, so hash key will
// be identical to timeseries received in Exec
for _, l := range labels {
if l.Name == alertNameLabel {
continue
}
// drop all overridden labels
if _, ok := ar.Labels[l.Name]; ok {
continue
}
m.Labels = append(m.Labels, l)
}
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0))
if err != nil {
return fmt.Errorf("failed to create alert: %w", err)
}
a.ID = hash(m)
a.State = notifier.StatePending
ar.alerts[a.ID] = a
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
}
return nil
}

View File

@@ -2,7 +2,6 @@ package main
import (
"context"
"sync"
"testing"
"time"
@@ -11,30 +10,15 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func TestRule_Validate(t *testing.T) {
if err := (&Rule{}).Validate(); err == nil {
t.Errorf("exptected empty name error")
}
if err := (&Rule{Name: "alert"}).Validate(); err == nil {
t.Errorf("exptected empty expr error")
}
if err := (&Rule{Name: "alert", Expr: "test{"}).Validate(); err == nil {
t.Errorf("exptected invalid expr error")
}
if err := (&Rule{Name: "alert", Expr: "test>0"}).Validate(); err != nil {
t.Errorf("exptected valid rule got %s", err)
}
}
func TestRule_AlertToTimeSeries(t *testing.T) {
func TestAlertingRule_ToTimeSeries(t *testing.T) {
timestamp := time.Now()
testCases := []struct {
rule *Rule
rule *AlertingRule
alert *notifier.Alert
expTS []prompbmarshal.TimeSeries
}{
{
newTestRule("instant", 0),
newTestAlertingRule("instant", 0),
&notifier.Alert{State: notifier.StateFiring},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
@@ -45,7 +29,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
},
},
{
newTestRule("instant extra labels", 0),
newTestAlertingRule("instant extra labels", 0),
&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
"job": "foo",
"instance": "bar",
@@ -61,7 +45,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
},
},
{
newTestRule("instant labels override", 0),
newTestAlertingRule("instant labels override", 0),
&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
alertStateLabel: "foo",
"__name__": "bar",
@@ -75,7 +59,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
},
},
{
newTestRule("for", time.Second),
newTestAlertingRule("for", time.Second),
&notifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
@@ -90,7 +74,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
},
},
{
newTestRule("for pending", 10*time.Second),
newTestAlertingRule("for pending", 10*time.Second),
&notifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
@@ -107,58 +91,29 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
tss := tc.rule.AlertToTimeSeries(tc.alert, timestamp)
if len(tc.expTS) != len(tss) {
t.Fatalf("expected number of timeseries %d; got %d", len(tc.expTS), len(tss))
}
for i := range tc.expTS {
expTS, gotTS := tc.expTS[i], tss[i]
if len(expTS.Samples) != len(gotTS.Samples) {
t.Fatalf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
}
for i, exp := range expTS.Samples {
got := gotTS.Samples[i]
if got.Value != exp.Value {
t.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
}
if got.Timestamp != exp.Timestamp {
t.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
}
}
if len(expTS.Labels) != len(gotTS.Labels) {
t.Fatalf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
}
for i, exp := range expTS.Labels {
got := gotTS.Labels[i]
if got.Name != exp.Name {
t.Errorf("expected label name %q; got %q", exp.Name, got.Name)
}
if got.Value != exp.Value {
t.Errorf("expected label value %q; got %q", exp.Value, got.Value)
}
}
tc.rule.alerts[tc.alert.ID] = tc.alert
tss := tc.rule.toTimeSeries(timestamp)
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
t.Fatalf("timeseries missmatch: %s", err)
}
})
}
}
func newTestRule(name string, waitFor time.Duration) *Rule {
return &Rule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
}
func TestRule_Exec(t *testing.T) {
func TestAlertingRule_Exec(t *testing.T) {
const defaultStep = 5 * time.Millisecond
testCases := []struct {
rule *Rule
rule *AlertingRule
steps [][]datasource.Metric
expAlerts map[uint64]*notifier.Alert
}{
{
newTestRule("empty", 0),
newTestAlertingRule("empty", 0),
[][]datasource.Metric{},
map[uint64]*notifier.Alert{},
},
{
newTestRule("empty labels", 0),
newTestAlertingRule("empty labels", 0),
[][]datasource.Metric{
{datasource.Metric{}},
},
@@ -167,7 +122,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("single-firing", 0),
newTestAlertingRule("single-firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
},
@@ -176,7 +131,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("single-firing=>inactive", 0),
newTestAlertingRule("single-firing=>inactive", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
@@ -186,7 +141,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("single-firing=>inactive=>firing", 0),
newTestAlertingRule("single-firing=>inactive=>firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
@@ -197,7 +152,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("single-firing=>inactive=>firing=>inactive", 0),
newTestAlertingRule("single-firing=>inactive=>firing=>inactive", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
@@ -209,7 +164,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
@@ -220,7 +175,7 @@ func TestRule_Exec(t *testing.T) {
map[uint64]*notifier.Alert{},
},
{
newTestRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
@@ -234,7 +189,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("multiple-firing", 0),
newTestAlertingRule("multiple-firing", 0),
[][]datasource.Metric{
{
metricWithLabels(t, "name", "foo"),
@@ -249,7 +204,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("multiple-steps-firing", 0),
newTestAlertingRule("multiple-steps-firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo1")},
@@ -264,7 +219,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("duplicate", 0),
newTestAlertingRule("duplicate", 0),
[][]datasource.Metric{
{
// metrics with the same labelset should result in one alert
@@ -277,7 +232,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("for-pending", time.Minute),
newTestAlertingRule("for-pending", time.Minute),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
},
@@ -286,7 +241,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("for-fired", time.Millisecond),
newTestAlertingRule("for-fired", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
@@ -296,7 +251,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("for-pending=>empty", time.Second),
newTestAlertingRule("for-pending=>empty", time.Second),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
@@ -306,7 +261,7 @@ func TestRule_Exec(t *testing.T) {
map[uint64]*notifier.Alert{},
},
{
newTestRule("for-pending=>firing=>inactive", time.Millisecond),
newTestAlertingRule("for-pending=>firing=>inactive", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
@@ -318,7 +273,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("for-pending=>firing=>inactive=>pending", time.Millisecond),
newTestAlertingRule("for-pending=>firing=>inactive=>pending", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
@@ -331,7 +286,7 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("for-pending=>firing=>inactive=>pending=>firing", time.Millisecond),
newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
@@ -349,15 +304,15 @@ func TestRule_Exec(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.group = fakeGroup
tc.rule.GroupID = fakeGroup.ID()
for _, step := range tc.steps {
fq.reset()
fq.add(step...)
if err := tc.rule.Exec(context.TODO(), fq); err != nil {
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
t.Fatalf("unexpected err: %s", err)
}
// artificial delay between applying steps
time.Sleep(time.Millisecond)
time.Sleep(defaultStep)
}
if len(tc.rule.alerts) != len(tc.expAlerts) {
t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
@@ -375,49 +330,9 @@ func TestRule_Exec(t *testing.T) {
}
}
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
m := datasource.Metric{}
for i := 0; i < len(labels); i += 2 {
m.Labels = append(m.Labels, datasource.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return m
}
type fakeQuerier struct {
sync.Mutex
metrics []datasource.Metric
}
func (fq *fakeQuerier) reset() {
fq.Lock()
fq.metrics = fq.metrics[:0]
fq.Unlock()
}
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
fq.Lock()
fq.metrics = append(fq.metrics, metrics...)
fq.Unlock()
}
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
fq.Lock()
cpy := make([]datasource.Metric, len(fq.metrics))
copy(cpy, fq.metrics)
fq.Unlock()
return cpy, nil
}
func TestRule_Restore(t *testing.T) {
func TestAlertingRule_Restore(t *testing.T) {
testCases := []struct {
rule *Rule
rule *AlertingRule
metrics []datasource.Metric
expAlerts map[uint64]*notifier.Alert
}{
@@ -502,7 +417,7 @@ func TestRule_Restore(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.group = fakeGroup
tc.rule.GroupID = fakeGroup.ID()
fq.add(tc.metrics...)
if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
t.Fatalf("unexpected err: %s", err)
@@ -526,8 +441,8 @@ func TestRule_Restore(t *testing.T) {
}
}
func newTestRuleWithLabels(name string, labels ...string) *Rule {
r := newTestRule(name, 0)
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
r := newTestAlertingRule(name, 0)
r.Labels = make(map[string]string)
for i := 0; i < len(labels); i += 2 {
r.Labels[labels[i]] = labels[i+1]
@@ -535,9 +450,6 @@ func newTestRuleWithLabels(name string, labels ...string) *Rule {
return r
}
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
t.Helper()
m := metricWithLabels(t, labels...)
m.Value = value
return m
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
}

View File

@@ -1,74 +0,0 @@
package main
import (
"fmt"
"gopkg.in/yaml.v2"
"io/ioutil"
"path/filepath"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// Parse parses rule configs from given file patterns
func Parse(pathPatterns []string, validateAnnotations bool) ([]Group, error) {
var fp []string
for _, pattern := range pathPatterns {
matches, err := filepath.Glob(pattern)
if err != nil {
return nil, fmt.Errorf("error reading file patther %s:%v", pattern, err)
}
fp = append(fp, matches...)
}
var groups []Group
for _, file := range fp {
groupsNames := map[string]struct{}{}
gr, err := parseFile(file)
if err != nil {
return nil, fmt.Errorf("file %s: %w", file, err)
}
for _, g := range gr {
if _, ok := groupsNames[g.Name]; ok {
return nil, fmt.Errorf("one file can not contain groups with the same name %s, filepath:%s", g.Name, file)
}
g.File = file
g.doneCh = make(chan struct{})
g.finishedCh = make(chan struct{})
g.updateCh = make(chan Group)
groupsNames[g.Name] = struct{}{}
for _, rule := range g.Rules {
if err = rule.Validate(); err != nil {
return nil, fmt.Errorf("invalid rule filepath: %s, group %s: %w", file, g.Name, err)
}
if validateAnnotations {
if err = notifier.ValidateTemplates(rule.Annotations); err != nil {
return nil, fmt.Errorf("invalid annotations filepath: %s, group %s: %w", file, g.Name, err)
}
if err = notifier.ValidateTemplates(rule.Labels); err != nil {
return nil, fmt.Errorf("invalid labels filepath: %s, group %s: %w", file, g.Name, err)
}
}
rule.group = g
rule.alerts = make(map[uint64]*notifier.Alert)
}
groups = append(groups, g)
}
}
if len(groups) < 1 {
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
}
return groups, nil
}
func parseFile(path string) ([]Group, error) {
data, err := ioutil.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("error reading alert rule file: %w", err)
}
g := struct {
Groups []Group `yaml:"groups"`
}{}
err = yaml.Unmarshal(data, &g)
return g.Groups, err
}

View File

@@ -0,0 +1,195 @@
package config
import (
"fmt"
"hash/fnv"
"io/ioutil"
"path/filepath"
"sort"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/metricsql"
"gopkg.in/yaml.v2"
)
// Group contains list of Rules grouped into
// entity with one name and evaluation interval
type Group struct {
File string
Name string `yaml:"name"`
Interval time.Duration `yaml:"interval,omitempty"`
Rules []Rule `yaml:"rules"`
Concurrency int `yaml:"concurrency"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
// Validate check for internal Group or Rule configuration errors
func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
if g.Name == "" {
return fmt.Errorf("group name must be set")
}
if len(g.Rules) == 0 {
return fmt.Errorf("group %q can't contain no rules", g.Name)
}
uniqueRules := map[uint64]struct{}{}
for _, r := range g.Rules {
ruleName := r.Record
if r.Alert != "" {
ruleName = r.Alert
}
if _, ok := uniqueRules[r.ID]; ok {
return fmt.Errorf("rule %q duplicate", ruleName)
}
uniqueRules[r.ID] = struct{}{}
if err := r.Validate(); err != nil {
return fmt.Errorf("invalid rule %q.%q: %w", g.Name, ruleName, err)
}
if validateExpressions {
if _, err := metricsql.Parse(r.Expr); err != nil {
return fmt.Errorf("invalid expression for rule %q.%q: %w", g.Name, ruleName, err)
}
}
if validateAnnotations {
if err := notifier.ValidateTemplates(r.Annotations); err != nil {
return fmt.Errorf("invalid annotations for rule %q.%q: %w", g.Name, ruleName, err)
}
if err := notifier.ValidateTemplates(r.Labels); err != nil {
return fmt.Errorf("invalid labels for rule %q.%q: %w", g.Name, ruleName, err)
}
}
}
return checkOverflow(g.XXX, fmt.Sprintf("group %q", g.Name))
}
// Rule describes entity that represent either
// recording rule or alerting rule.
type Rule struct {
ID uint64
Record string `yaml:"record,omitempty"`
Alert string `yaml:"alert,omitempty"`
Expr string `yaml:"expr"`
For time.Duration `yaml:"for,omitempty"`
Labels map[string]string `yaml:"labels,omitempty"`
Annotations map[string]string `yaml:"annotations,omitempty"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
type rule Rule
if err := unmarshal((*rule)(r)); err != nil {
return err
}
r.ID = HashRule(*r)
return nil
}
// HashRule hashes significant Rule fields into
// unique hash value
func HashRule(r Rule) uint64 {
h := fnv.New64a()
h.Write([]byte(r.Expr))
if r.Record != "" {
h.Write([]byte("recording"))
h.Write([]byte(r.Record))
} else {
h.Write([]byte("alerting"))
h.Write([]byte(r.Alert))
}
type item struct {
key, value string
}
var kv []item
for k, v := range r.Labels {
kv = append(kv, item{key: k, value: v})
}
sort.Slice(kv, func(i, j int) bool {
return kv[i].key < kv[j].key
})
for _, i := range kv {
h.Write([]byte(i.key))
h.Write([]byte(i.value))
h.Write([]byte("\xff"))
}
return h.Sum64()
}
// Validate check for Rule configuration errors
func (r *Rule) Validate() error {
if (r.Record == "" && r.Alert == "") || (r.Record != "" && r.Alert != "") {
return fmt.Errorf("either `record` or `alert` must be set")
}
if r.Expr == "" {
return fmt.Errorf("expression can't be empty")
}
return checkOverflow(r.XXX, "rule")
}
// Parse parses rule configs from given file patterns
func Parse(pathPatterns []string, validateAnnotations, validateExpressions bool) ([]Group, error) {
var fp []string
for _, pattern := range pathPatterns {
matches, err := filepath.Glob(pattern)
if err != nil {
return nil, fmt.Errorf("error reading file pattern %s: %w", pattern, err)
}
fp = append(fp, matches...)
}
var groups []Group
for _, file := range fp {
uniqueGroups := map[string]struct{}{}
gr, err := parseFile(file)
if err != nil {
return nil, fmt.Errorf("failed to parse file %q: %w", file, err)
}
for _, g := range gr {
if err := g.Validate(validateAnnotations, validateExpressions); err != nil {
return nil, fmt.Errorf("invalid group %q in file %q: %w", g.Name, file, err)
}
if _, ok := uniqueGroups[g.Name]; ok {
return nil, fmt.Errorf("group name %q duplicate in file %q", g.Name, file)
}
uniqueGroups[g.Name] = struct{}{}
g.File = file
groups = append(groups, g)
}
}
if len(groups) < 1 {
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
}
return groups, nil
}
func parseFile(path string) ([]Group, error) {
data, err := ioutil.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("error reading alert rule file: %w", err)
}
g := struct {
Groups []Group `yaml:"groups"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}{}
err = yaml.Unmarshal(data, &g)
if err != nil {
return nil, err
}
return g.Groups, checkOverflow(g.XXX, "config")
}
func checkOverflow(m map[string]interface{}, ctx string) error {
if len(m) > 0 {
var keys []string
for k := range m {
keys = append(keys, k)
}
return fmt.Errorf("unknown fields in %s: %s", ctx, strings.Join(keys, ", "))
}
return nil
}

View File

@@ -0,0 +1,326 @@
package config
import (
"net/url"
"os"
"strings"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
func TestMain(m *testing.M) {
u, _ := url.Parse("https://victoriametrics.com/path")
notifier.InitTemplateFunc(u)
os.Exit(m.Run())
}
func TestParseGood(t *testing.T) {
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true, true); err != nil {
t.Errorf("error parsing files %s", err)
}
}
func TestParseBad(t *testing.T) {
testCases := []struct {
path []string
expErr string
}{
{
[]string{"testdata/rules0-bad.rules"},
"unexpected token",
},
{
[]string{"testdata/dir/rules0-bad.rules"},
"error parsing annotation",
},
{
[]string{"testdata/dir/rules1-bad.rules"},
"duplicate in file",
},
{
[]string{"testdata/dir/rules2-bad.rules"},
"function \"value\" not defined",
},
{
[]string{"testdata/dir/rules3-bad.rules"},
"either `record` or `alert` must be set",
},
{
[]string{"testdata/dir/rules4-bad.rules"},
"either `record` or `alert` must be set",
},
{
[]string{"testdata/*.yaml"},
"no groups found",
},
}
for _, tc := range testCases {
_, err := Parse(tc.path, true, true)
if err == nil {
t.Errorf("expected to get error")
return
}
if !strings.Contains(err.Error(), tc.expErr) {
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
}
}
}
func TestRule_Validate(t *testing.T) {
if err := (&Rule{}).Validate(); err == nil {
t.Errorf("exptected empty name error")
}
if err := (&Rule{Alert: "alert"}).Validate(); err == nil {
t.Errorf("exptected empty expr error")
}
if err := (&Rule{Alert: "alert", Expr: "test>0"}).Validate(); err != nil {
t.Errorf("exptected valid rule; got %s", err)
}
}
func TestGroup_Validate(t *testing.T) {
testCases := []struct {
group *Group
rules []Rule
validateAnnotations bool
validateExpressions bool
expErr string
}{
{
group: &Group{},
expErr: "group name must be set",
},
{
group: &Group{Name: "test"},
expErr: "contain no rules",
},
{
group: &Group{Name: "test",
Rules: []Rule{
{
Record: "record",
Expr: "up | 0",
},
},
},
expErr: "",
},
{
group: &Group{Name: "test",
Rules: []Rule{
{
Record: "record",
Expr: "up | 0",
},
},
},
expErr: "invalid expression",
validateExpressions: true,
},
{
group: &Group{Name: "test",
Rules: []Rule{
{
Alert: "alert",
Expr: "up == 1",
Labels: map[string]string{
"summary": "{{ value|query }}",
},
},
},
},
expErr: "",
},
{
group: &Group{Name: "test",
Rules: []Rule{
{
Alert: "alert",
Expr: "up == 1",
Labels: map[string]string{
"summary": "{{ value|query }}",
},
},
},
},
expErr: "error parsing annotation",
validateAnnotations: true,
},
{
group: &Group{Name: "test",
Rules: []Rule{
{
Alert: "alert",
Expr: "up == 1",
},
{
Alert: "alert",
Expr: "up == 1",
},
},
},
expErr: "duplicate",
},
{
group: &Group{Name: "test",
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
}},
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
}},
},
},
expErr: "duplicate",
},
{
group: &Group{Name: "test",
Rules: []Rule{
{Record: "record", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
}},
{Record: "record", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
}},
},
},
expErr: "duplicate",
},
{
group: &Group{Name: "test",
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
}},
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"description": "{{ value|query }}",
}},
},
},
expErr: "",
},
{
group: &Group{Name: "test",
Rules: []Rule{
{Record: "alert", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
}},
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"summary": "{{ value|query }}",
}},
},
},
expErr: "",
},
}
for _, tc := range testCases {
err := tc.group.Validate(tc.validateAnnotations, tc.validateExpressions)
if err == nil {
if tc.expErr != "" {
t.Errorf("expected to get err %q; got nil insted", tc.expErr)
}
continue
}
if !strings.Contains(err.Error(), tc.expErr) {
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
}
}
}
func TestHashRule(t *testing.T) {
testCases := []struct {
a, b Rule
equal bool
}{
{
Rule{Record: "record", Expr: "up == 1"},
Rule{Record: "record", Expr: "up == 1"},
true,
},
{
Rule{Alert: "alert", Expr: "up == 1"},
Rule{Alert: "alert", Expr: "up == 1"},
true,
},
{
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"foo": "bar",
"baz": "foo",
}},
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"foo": "bar",
"baz": "foo",
}},
true,
},
{
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"foo": "bar",
"baz": "foo",
}},
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"baz": "foo",
"foo": "bar",
}},
true,
},
{
Rule{Alert: "record", Expr: "up == 1"},
Rule{Alert: "record", Expr: "up == 1"},
true,
},
{
Rule{Alert: "alert", Expr: "up == 1", For: time.Minute},
Rule{Alert: "alert", Expr: "up == 1"},
true,
},
{
Rule{Alert: "record", Expr: "up == 1"},
Rule{Record: "record", Expr: "up == 1"},
false,
},
{
Rule{Record: "record", Expr: "up == 1"},
Rule{Record: "record", Expr: "up == 2"},
false,
},
{
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"foo": "bar",
"baz": "foo",
}},
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"baz": "foo",
"foo": "baz",
}},
false,
},
{
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"foo": "bar",
"baz": "foo",
}},
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"baz": "foo",
}},
false,
},
{
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"foo": "bar",
"baz": "foo",
}},
Rule{Alert: "alert", Expr: "up == 1"},
false,
},
}
for i, tc := range testCases {
aID, bID := HashRule(tc.a), HashRule(tc.b)
if tc.equal != (aID == bID) {
t.Fatalf("missmatch for rule %d", i)
}
}
}

View File

@@ -9,5 +9,3 @@ groups:
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,5 @@
groups:
- name: group
rules:
- for: 5m
expr: vm_rows > 0

View File

@@ -0,0 +1,7 @@
groups:
- name: group
rules:
- alert: rows
record: record
for: 5m
expr: vm_rows > 0

View File

@@ -0,0 +1,7 @@
groups:
- name: group
rules:
- alert: rows
expr: vm_rows > 0
- record: rows
expr: sum(vm_rows)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,35 @@
groups:
- name: TestGroup
interval: 2s
concurrency: 2
rules:
- alert: Conns
expr: sum(vm_tcplistener_conns) by(instance) > 1
for: 3m
annotations:
summary: "Too high connection number for {{$labels.instance}}"
description: "It is {{ $value }} connections for {{$labels.instance}}"
- alert: ExampleAlertAlwaysFiring
expr: sum by(job)
(up == 1)
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
labels:
recording: true
- record: code:requests:rate5m
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
labels:
env: dev
recording: true
- record: code:requests:rate5m
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
labels:
env: staging
recording: true
- record: successful_requests:ratio_rate5m
labels:
recording: true
expr: |2
sum(code:requests:rate5m{code="200"})
/
sum(code:requests:rate5m)

View File

@@ -1,39 +0,0 @@
package main
import (
"net/url"
"os"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
func TestMain(m *testing.M) {
u, _ := url.Parse("https://victoriametrics.com/path")
notifier.InitTemplateFunc(u)
os.Exit(m.Run())
}
func TestParseGood(t *testing.T) {
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true); err != nil {
t.Errorf("error parsing files %s", err)
}
}
func TestParseBad(t *testing.T) {
if _, err := Parse([]string{"testdata/rules0-bad.rules"}, true); err == nil {
t.Errorf("expected syntaxt error")
}
if _, err := Parse([]string{"testdata/dir/rules0-bad.rules"}, true); err == nil {
t.Errorf("expected template annotation error")
}
if _, err := Parse([]string{"testdata/dir/rules1-bad.rules"}, true); err == nil {
t.Errorf("expected same group error")
}
if _, err := Parse([]string{"testdata/dir/rules2-bad.rules"}, true); err == nil {
t.Errorf("expected template label error")
}
if _, err := Parse([]string{"testdata/*.yaml"}, true); err == nil {
t.Errorf("expected empty group")
}
}

View File

@@ -0,0 +1,38 @@
package datasource
import (
"flag"
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
var (
addr = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
" E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
tlsCertFile = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
tlsKeyFile = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
tlsCAFile = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
"By default system CA is used")
tlsServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
"By default the server name from -datasource.url is used")
)
// Init creates a Querier from provided flag values.
func Init() (Querier, error) {
if *addr == "" {
flag.PrintDefaults()
return nil, fmt.Errorf("datasource.url is empty")
}
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
if err != nil {
return nil, fmt.Errorf("failed to create transport: %w", err)
}
c := &http.Client{Transport: tr}
return NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, c), nil
}

View File

@@ -32,7 +32,7 @@ func (r response) metrics() ([]Metric, error) {
for i, res := range r.Data.Result {
f, err = strconv.ParseFloat(res.TV[1].(string), 64)
if err != nil {
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %s", res, res.TV[1], err)
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
}
m.Labels = nil
for k, v := range r.Data.Result[i].Labels {
@@ -49,9 +49,10 @@ const queryPath = "/api/v1/query?query="
// VMStorage represents vmstorage entity with ability to read and write metrics
type VMStorage struct {
c *http.Client
queryURL string
basicAuthUser, basicAuthPass string
c *http.Client
queryURL string
basicAuthUser string
basicAuthPass string
}
// NewVMStorage is a constructor for VMStorage
@@ -79,25 +80,25 @@ func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
}
resp, err := s.c.Do(req.WithContext(ctx))
if err != nil {
return nil, fmt.Errorf("error getting response from %s:%s", req.URL, err)
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK {
body, _ := ioutil.ReadAll(resp.Body)
return nil, fmt.Errorf("datasource returns unxeprected response code %d for %s with err %s. Reponse body %s", resp.StatusCode, req.URL, err, body)
return nil, fmt.Errorf("datasource returns unexpected response code %d for %s with err %w. Reponse body %s", resp.StatusCode, req.URL, err, body)
}
r := &response{}
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
return nil, fmt.Errorf("error parsing metrics for %s:%s", req.URL, err)
return nil, fmt.Errorf("error parsing metrics for %s: %w", req.URL, err)
}
if r.Status == statusError {
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
}
if r.Status != statusSuccess {
return nil, fmt.Errorf("unkown status:%s, Expected success or error ", r.Status)
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
}
if r.Data.ResultType != rtVector {
return nil, fmt.Errorf("unkown restul type:%s. Expected vector", r.Data.ResultType)
return nil, fmt.Errorf("unknown restul type:%s. Expected vector", r.Data.ResultType)
}
return r.metrics()
}

View File

@@ -4,26 +4,63 @@ import (
"context"
"fmt"
"hash/fnv"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metrics"
)
// Group is an entity for grouping rules
type Group struct {
Name string
File string
Rules []*Rule
mu sync.RWMutex
Name string
File string
Rules []Rule
Interval time.Duration
Concurrency int
doneCh chan struct{}
finishedCh chan struct{}
// channel accepts new Group obj
// which supposed to update current group
updateCh chan Group
updateCh chan *Group
}
func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
g := &Group{
Name: cfg.Name,
File: cfg.File,
Interval: cfg.Interval,
Concurrency: cfg.Concurrency,
doneCh: make(chan struct{}),
finishedCh: make(chan struct{}),
updateCh: make(chan *Group),
}
if g.Interval == 0 {
g.Interval = defaultInterval
}
if g.Concurrency < 1 {
g.Concurrency = 1
}
rules := make([]Rule, len(cfg.Rules))
for i, r := range cfg.Rules {
rules[i] = g.newRule(r)
}
g.Rules = rules
return g
}
func (g *Group) newRule(rule config.Rule) Rule {
if rule.Alert != "" {
return newAlertingRule(g.ID(), rule)
}
return newRecordingRule(g.ID(), rule)
}
// ID return unique group ID that consists of
@@ -36,48 +73,49 @@ func (g *Group) ID() uint64 {
return hash.Sum64()
}
// Restore restores alerts state for all group rules with For > 0
// Restore restores alerts state for group rules
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
for _, rule := range g.Rules {
if rule.For == 0 {
return nil
rr, ok := rule.(*AlertingRule)
if !ok {
continue
}
if err := rule.Restore(ctx, q, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %s", rule.Name, err)
if rr.For < 1 {
continue
}
if err := rr.Restore(ctx, q, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
}
}
return nil
}
// updateWith updates existing group with
// passed group object.
// passed group object. This function ignores group
// evaluation interval change. It supposed to be updated
// in group.start function.
// Not thread-safe.
func (g *Group) updateWith(newGroup Group) {
rulesRegistry := make(map[string]*Rule)
func (g *Group) updateWith(newGroup *Group) error {
rulesRegistry := make(map[uint64]Rule)
for _, nr := range newGroup.Rules {
rulesRegistry[nr.id()] = nr
rulesRegistry[nr.ID()] = nr
}
for i, or := range g.Rules {
nr, ok := rulesRegistry[or.id()]
nr, ok := rulesRegistry[or.ID()]
if !ok {
// old rule is not present in the new list
// so we mark it for removing
g.Rules[i] = nil
continue
}
// copy all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
or.For = nr.For
or.Expr = nr.Expr
or.Labels = nr.Labels
or.Annotations = nr.Annotations
delete(rulesRegistry, nr.id())
if err := or.UpdateWith(nr); err != nil {
return err
}
delete(rulesRegistry, nr.ID())
}
var newRules []*Rule
var newRules []Rule
for _, r := range g.Rules {
if r == nil {
// skip nil rules
@@ -89,7 +127,9 @@ func (g *Group) updateWith(newGroup Group) {
for _, nr := range rulesRegistry {
newRules = append(newRules, nr)
}
g.Concurrency = newGroup.Concurrency
g.Rules = newRules
return nil
}
var (
@@ -116,82 +156,145 @@ func (g *Group) close() {
<-g.finishedCh
}
func (g *Group) start(ctx context.Context, interval time.Duration,
querier datasource.Querier, nr notifier.Notifier, rw *remotewrite.Client) {
logger.Infof("group %q started", g.Name)
t := time.NewTicker(interval)
func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []notifier.Notifier, rw *remotewrite.Client) {
defer func() { close(g.finishedCh) }()
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
e := &executor{querier, nts, rw}
t := time.NewTicker(g.Interval)
defer t.Stop()
for {
select {
case <-ctx.Done():
logger.Infof("group %q: context cancelled", g.Name)
close(g.finishedCh)
return
case <-g.doneCh:
logger.Infof("group %q: received stop signal", g.Name)
close(g.finishedCh)
return
case ng := <-g.updateCh:
g.updateWith(ng)
g.mu.Lock()
err := g.updateWith(ng)
if err != nil {
logger.Errorf("group %q: failed to update: %s", g.Name, err)
g.mu.Unlock()
continue
}
if g.Interval != ng.Interval {
g.Interval = ng.Interval
t.Stop()
t = time.NewTicker(g.Interval)
}
g.mu.Unlock()
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
case <-t.C:
iterationTotal.Inc()
iterationStart := time.Now()
for _, rule := range g.Rules {
execTotal.Inc()
execStart := time.Now()
err := rule.Exec(ctx, querier)
execDuration.UpdateDuration(execStart)
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval)
for err := range errs {
if err != nil {
execErrors.Inc()
logger.Errorf("failed to execute rule %q.%q: %s", g.Name, rule.Name, err)
continue
}
var alertsToSend []notifier.Alert
for _, a := range rule.alerts {
switch a.State {
case notifier.StateFiring:
// set End to execStart + 3 intervals
// so notifier can resolve it automatically if `vmalert`
// won't be able to send resolve for some reason
a.End = execStart.Add(3 * interval)
alertsToSend = append(alertsToSend, *a)
pushToRW(rw, rule, a, execStart)
case notifier.StatePending:
pushToRW(rw, rule, a, execStart)
case notifier.StateInactive:
// set End to execStart to notify
// that it was just resolved
a.End = execStart
alertsToSend = append(alertsToSend, *a)
}
}
if len(alertsToSend) == 0 {
continue
}
alertsSent.Add(len(alertsToSend))
if err := nr.Send(ctx, alertsToSend); err != nil {
alertsSendErrors.Inc()
logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
logger.Errorf("group %q: %s", g.Name, err)
}
}
iterationDuration.UpdateDuration(iterationStart)
}
}
}
func pushToRW(rw *remotewrite.Client, rule *Rule, a *notifier.Alert, timestamp time.Time) {
if rw == nil {
return
type executor struct {
querier datasource.Querier
notifiers []notifier.Notifier
rw *remotewrite.Client
}
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
res := make(chan error, len(rules))
var returnSeries bool
if e.rw != nil {
returnSeries = true
}
tss := rule.AlertToTimeSeries(a, timestamp)
remoteWriteSent.Add(len(tss))
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
if concurrency == 1 {
// fast path
for _, rule := range rules {
res <- e.exec(ctx, rule, returnSeries, interval)
}
close(res)
return res
}
sem := make(chan struct{}, concurrency)
go func() {
wg := sync.WaitGroup{}
for _, rule := range rules {
sem <- struct{}{}
wg.Add(1)
go func(r Rule) {
res <- e.exec(ctx, r, returnSeries, interval)
<-sem
wg.Done()
}(rule)
}
wg.Wait()
close(res)
}()
return res
}
func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, interval time.Duration) error {
execTotal.Inc()
execStart := time.Now()
defer func() {
execDuration.UpdateDuration(execStart)
}()
tss, err := rule.Exec(ctx, e.querier, returnSeries)
if err != nil {
execErrors.Inc()
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
}
if len(tss) > 0 && e.rw != nil {
remoteWriteSent.Add(len(tss))
for _, ts := range tss {
if err := e.rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
return fmt.Errorf("rule %q: remote write failure: %w", rule, err)
}
}
}
ar, ok := rule.(*AlertingRule)
if !ok {
return nil
}
var alerts []notifier.Alert
for _, a := range ar.alerts {
switch a.State {
case notifier.StateFiring:
// set End to execStart + 3 intervals
// so notifier can resolve it automatically if `vmalert`
// won't be able to send resolve for some reason
a.End = time.Now().Add(3 * interval)
alerts = append(alerts, *a)
case notifier.StateInactive:
// set End to execStart to notify
// that it was just resolved
a.End = time.Now()
alerts = append(alerts, *a)
}
}
if len(alerts) < 1 {
return nil
}
alertsSent.Add(len(alerts))
errGr := new(utils.ErrGroup)
for _, nt := range e.notifiers {
if err := nt.Send(ctx, alerts); err != nil {
alertsSendErrors.Inc()
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
}
}
return errGr.Err()
}

View File

@@ -2,33 +2,31 @@ package main
import (
"context"
"reflect"
"sort"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
func TestUpdateWith(t *testing.T) {
testCases := []struct {
name string
currentRules []*Rule
// rules must be sorted by name
newRules []*Rule
currentRules []config.Rule
newRules []config.Rule
}{
{
"new rule",
[]*Rule{},
[]*Rule{{Name: "bar"}},
nil,
[]config.Rule{{Alert: "bar"}},
},
{
"update rule",
[]*Rule{{
Name: "foo",
Expr: "up > 0",
For: time.Second,
"update alerting rule",
[]config.Rule{{
Alert: "foo",
Expr: "up > 0",
For: time.Second,
Labels: map[string]string{
"bar": "baz",
},
@@ -37,10 +35,10 @@ func TestUpdateWith(t *testing.T) {
"description": "{{$labels}}",
},
}},
[]*Rule{{
Name: "bar",
Expr: "up > 10",
For: time.Second,
[]config.Rule{{
Alert: "foo",
Expr: "up > 10",
For: time.Second,
Labels: map[string]string{
"baz": "bar",
},
@@ -49,56 +47,96 @@ func TestUpdateWith(t *testing.T) {
},
}},
},
{
"update recording rule",
[]config.Rule{{
Record: "foo",
Expr: "max(up)",
Labels: map[string]string{
"bar": "baz",
},
}},
[]config.Rule{{
Record: "foo",
Expr: "min(up)",
Labels: map[string]string{
"baz": "bar",
},
}},
},
{
"empty rule",
[]*Rule{{Name: "foo"}},
[]*Rule{},
[]config.Rule{{Alert: "foo"}, {Record: "bar"}},
nil,
},
{
"multiple rules",
[]*Rule{{Name: "bar"}, {Name: "baz"}, {Name: "foo"}},
[]*Rule{{Name: "baz"}, {Name: "foo"}},
[]config.Rule{
{Alert: "bar"},
{Alert: "baz"},
{Alert: "foo"},
},
[]config.Rule{
{Alert: "baz"},
{Record: "foo"},
},
},
{
"replace rule",
[]*Rule{{Name: "foo1"}},
[]*Rule{{Name: "foo2"}},
[]config.Rule{{Alert: "foo1"}},
[]config.Rule{{Alert: "foo2"}},
},
{
"replace multiple rules",
[]*Rule{{Name: "foo1"}, {Name: "foo2"}},
[]*Rule{{Name: "foo3"}, {Name: "foo4"}},
[]config.Rule{
{Alert: "foo1"},
{Record: "foo2"},
{Alert: "foo3"},
},
[]config.Rule{
{Alert: "foo3"},
{Alert: "foo4"},
{Record: "foo5"},
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := &Group{Rules: tc.currentRules}
g.updateWith(Group{Rules: tc.newRules})
g := &Group{Name: "test"}
for _, r := range tc.currentRules {
r.ID = config.HashRule(r)
g.Rules = append(g.Rules, g.newRule(r))
}
ng := &Group{Name: "test"}
for _, r := range tc.newRules {
r.ID = config.HashRule(r)
ng.Rules = append(ng.Rules, ng.newRule(r))
}
err := g.updateWith(ng)
if err != nil {
t.Fatal(err)
}
if len(g.Rules) != len(tc.newRules) {
t.Fatalf("expected to have %d rules; got: %d",
len(g.Rules), len(tc.newRules))
}
sort.Slice(g.Rules, func(i, j int) bool {
return g.Rules[i].Name < g.Rules[j].Name
return g.Rules[i].ID() < g.Rules[j].ID()
})
sort.Slice(ng.Rules, func(i, j int) bool {
return ng.Rules[i].ID() < ng.Rules[j].ID()
})
for i, r := range g.Rules {
got, want := r, tc.newRules[i]
if got.Name != want.Name {
t.Fatalf("expected to have rule %q; got %q", want.Name, got.Name)
got, want := r, ng.Rules[i]
if got.ID() != want.ID() {
t.Fatalf("expected to have rule %q; got %q", want, got)
}
if got.Expr != want.Expr {
t.Fatalf("expected to have expression %q; got %q", want.Expr, got.Expr)
}
if got.For != want.For {
t.Fatalf("expected to have for %q; got %q", want.For, got.For)
}
if !reflect.DeepEqual(got.Annotations, want.Annotations) {
t.Fatalf("expected to have annotations %#v; got %#v", want.Annotations, got.Annotations)
}
if !reflect.DeepEqual(got.Labels, want.Labels) {
t.Fatalf("expected to have labels %#v; got %#v", want.Labels, got.Labels)
if err := compareRules(t, got, want); err != nil {
t.Fatalf("comparsion error: %s", err)
}
}
})
@@ -107,11 +145,13 @@ func TestUpdateWith(t *testing.T) {
func TestGroupStart(t *testing.T) {
// TODO: make parsing from string instead of file
groups, err := Parse([]string{"testdata/rules1-good.rules"}, true)
groups, err := config.Parse([]string{"config/testdata/rules1-good.rules"}, true, true)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
g := groups[0]
const evalInterval = time.Millisecond
g := newGroup(groups[0], evalInterval)
g.Concurrency = 2
fn := &fakeNotifier{}
fs := &fakeQuerier{}
@@ -120,27 +160,26 @@ func TestGroupStart(t *testing.T) {
m1 := metricWithLabels(t, "instance", inst1, "job", job)
m2 := metricWithLabels(t, "instance", inst2, "job", job)
r := g.Rules[0]
alert1, err := r.newAlert(m1)
r := g.Rules[0].(*AlertingRule)
alert1, err := r.newAlert(m1, time.Now())
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
alert1.State = notifier.StateFiring
alert1.ID = hash(m1)
alert2, err := r.newAlert(m2)
alert2, err := r.newAlert(m2, time.Now())
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
alert2.State = notifier.StateFiring
alert2.ID = hash(m2)
const evalInterval = time.Millisecond
finished := make(chan struct{})
fs.add(m1)
fs.add(m2)
go func() {
g.start(context.Background(), evalInterval, fs, fn, nil)
g.start(context.Background(), fs, []notifier.Notifier{fn}, nil)
close(finished)
}()
@@ -166,52 +205,3 @@ func TestGroupStart(t *testing.T) {
g.close()
<-finished
}
func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
t.Helper()
if len(as) != len(bs) {
t.Fatalf("expected to have length %d; got %d", len(as), len(bs))
}
sort.Slice(as, func(i, j int) bool {
return as[i].ID < as[j].ID
})
sort.Slice(bs, func(i, j int) bool {
return bs[i].ID < bs[j].ID
})
for i := range as {
a, b := as[i], bs[i]
if a.Name != b.Name {
t.Fatalf("expected t have Name %q; got %q", a.Name, b.Name)
}
if a.State != b.State {
t.Fatalf("expected t have State %q; got %q", a.State, b.State)
}
if a.Value != b.Value {
t.Fatalf("expected t have Value %f; got %f", a.Value, b.Value)
}
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
t.Fatalf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
}
if !reflect.DeepEqual(a.Labels, b.Labels) {
t.Fatalf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
}
}
}
type fakeNotifier struct {
sync.Mutex
alerts []notifier.Alert
}
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
fn.Lock()
defer fn.Unlock()
fn.alerts = alerts
return nil
}
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
fn.Lock()
defer fn.Unlock()
return fn.alerts
}

232
app/vmalert/helpers_test.go Normal file
View File

@@ -0,0 +1,232 @@
package main
import (
"context"
"fmt"
"reflect"
"sort"
"sync"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
type fakeQuerier struct {
sync.Mutex
metrics []datasource.Metric
err error
}
func (fq *fakeQuerier) setErr(err error) {
fq.Lock()
fq.err = err
fq.Unlock()
}
func (fq *fakeQuerier) reset() {
fq.Lock()
fq.err = nil
fq.metrics = fq.metrics[:0]
fq.Unlock()
}
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
fq.Lock()
fq.metrics = append(fq.metrics, metrics...)
fq.Unlock()
}
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {
return nil, fq.err
}
cp := make([]datasource.Metric, len(fq.metrics))
copy(cp, fq.metrics)
return cp, nil
}
type fakeNotifier struct {
sync.Mutex
alerts []notifier.Alert
}
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
fn.Lock()
defer fn.Unlock()
fn.alerts = alerts
return nil
}
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
fn.Lock()
defer fn.Unlock()
return fn.alerts
}
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
t.Helper()
m := metricWithLabels(t, labels...)
m.Value = value
return m
}
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
m := datasource.Metric{}
for i := 0; i < len(labels); i += 2 {
m.Labels = append(m.Labels, datasource.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return m
}
func compareGroups(t *testing.T, a, b *Group) {
t.Helper()
if a.Name != b.Name {
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
}
if a.File != b.File {
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
}
if a.Interval != b.Interval {
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
}
if len(a.Rules) != len(b.Rules) {
t.Fatalf("expected group %s to have %d rules; got: %d",
a.Name, len(a.Rules), len(b.Rules))
}
for i, r := range a.Rules {
got, want := r, b.Rules[i]
if a.ID() != b.ID() {
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
}
if err := compareRules(t, want, got); err != nil {
t.Fatalf("comparsion error: %s", err)
}
}
}
func compareRules(t *testing.T, a, b Rule) error {
t.Helper()
switch v := a.(type) {
case *AlertingRule:
br, ok := b.(*AlertingRule)
if !ok {
return fmt.Errorf("rule %q supposed to be of type AlertingRule", b.ID())
}
return compareAlertingRules(t, v, br)
case *RecordingRule:
br, ok := b.(*RecordingRule)
if !ok {
return fmt.Errorf("rule %q supposed to be of type RecordingRule", b.ID())
}
return compareRecordingRules(t, v, br)
default:
return fmt.Errorf("unexpected rule type received %T", a)
}
}
func compareRecordingRules(t *testing.T, a, b *RecordingRule) error {
t.Helper()
if a.Expr != b.Expr {
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
}
if !reflect.DeepEqual(a.Labels, b.Labels) {
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
}
return nil
}
func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
t.Helper()
if a.Expr != b.Expr {
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
}
if a.For != b.For {
return fmt.Errorf("expected to have for %q; got %q", a.For, b.For)
}
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
return fmt.Errorf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
}
if !reflect.DeepEqual(a.Labels, b.Labels) {
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
}
return nil
}
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
t.Helper()
if len(a) != len(b) {
return fmt.Errorf("expected number of timeseries %d; got %d", len(a), len(b))
}
for i := range a {
expTS, gotTS := a[i], b[i]
if len(expTS.Samples) != len(gotTS.Samples) {
return fmt.Errorf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
}
for i, exp := range expTS.Samples {
got := gotTS.Samples[i]
if got.Value != exp.Value {
return fmt.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
}
// timestamp validation isn't always correct for now.
// this must be improved with time mock.
/*if got.Timestamp != exp.Timestamp {
return fmt.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
}*/
}
if len(expTS.Labels) != len(gotTS.Labels) {
return fmt.Errorf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
}
for i, exp := range expTS.Labels {
got := gotTS.Labels[i]
if got.Name != exp.Name {
return fmt.Errorf("expected label name %q; got %q", exp.Name, got.Name)
}
if got.Value != exp.Value {
return fmt.Errorf("expected label value %q; got %q", exp.Value, got.Value)
}
}
}
return nil
}
func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
t.Helper()
if len(as) != len(bs) {
t.Fatalf("expected to have length %d; got %d", len(as), len(bs))
}
sort.Slice(as, func(i, j int) bool {
return as[i].ID < as[j].ID
})
sort.Slice(bs, func(i, j int) bool {
return bs[i].ID < bs[j].ID
})
for i := range as {
a, b := as[i], bs[i]
if a.Name != b.Name {
t.Fatalf("expected t have Name %q; got %q", a.Name, b.Name)
}
if a.State != b.State {
t.Fatalf("expected t have State %q; got %q", a.State, b.State)
}
if a.Value != b.Value {
t.Fatalf("expected t have Value %f; got %f", a.Value, b.Value)
}
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
t.Fatalf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
}
if !reflect.DeepEqual(a.Labels, b.Labels) {
t.Fatalf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
}
}
}

View File

@@ -4,14 +4,15 @@ import (
"context"
"flag"
"fmt"
"net/http"
"net/url"
"os"
"strconv"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
@@ -30,72 +31,34 @@ Examples:
-rule /path/to/file. Path to a single file with alerting rules
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.`)
validateTemplates = flag.Bool("rule.validateTemplates", true, "Indicates to validate annotation and label templates")
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
datasourceURL = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
" E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
remoteWriteURL = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
" in form of timeseries. E.g. http://127.0.0.1:8428")
remoteWriteUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
remoteWritePassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
remoteWriteMaxQueueSize = flag.Int("remoteWrite.maxQueueSize", 10e3, "Defines the max number of pending datapoints to remote write endpoint")
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
remoteReadURL = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
" E.g. http://127.0.0.1:8428")
remoteReadUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
remoteReadPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules. Default 1m")
notifierURL = flag.String("notifier.url", "", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
)
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
buildinfo.Init()
logger.Init()
checkFlags()
ctx, cancel := context.WithCancel(context.Background())
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
manager, err := newManager(ctx)
if err != nil {
logger.Fatalf("can not get external url: %s ", err)
logger.Fatalf("failed to init: %s", err)
}
notifier.InitTemplateFunc(eu)
manager := &manager{
groups: make(map[uint64]*Group),
storage: datasource.NewVMStorage(*datasourceURL, *basicAuthUsername, *basicAuthPassword, &http.Client{}),
notifier: notifier.NewAlertManager(*notifierURL, func(group, alert string) string {
return fmt.Sprintf("%s/api/v1/%s/%s/status", eu, group, alert)
}, &http.Client{}),
}
if *remoteWriteURL != "" {
c, err := remotewrite.NewClient(ctx, remotewrite.Config{
Addr: *remoteWriteURL,
MaxQueueSize: *remoteWriteMaxQueueSize,
FlushInterval: *evaluationInterval,
BasicAuthUser: *remoteWriteUsername,
BasicAuthPass: *remoteWritePassword,
})
if err != nil {
logger.Fatalf("failed to init remotewrite client: %s", err)
}
manager.rw = c
}
if *remoteReadURL != "" {
manager.rr = datasource.NewVMStorage(*remoteReadURL, *remoteReadUsername, *remoteReadPassword, &http.Client{})
}
if err := manager.start(ctx, *rulePath, *validateTemplates); err != nil {
if err := manager.start(ctx, *rulePath, *validateTemplates, *validateExpressions); err != nil {
logger.Fatalf("failed to start: %s", err)
}
@@ -108,7 +71,7 @@ func main() {
<-sigHup
configReloads.Inc()
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
if err := manager.update(ctx, *rulePath, *validateTemplates, false); err != nil {
if err := manager.update(ctx, *rulePath, *validateTemplates, *validateExpressions, false); err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("error while reloading rules: %s", err)
@@ -139,6 +102,44 @@ var (
configTimestamp = metrics.NewCounter(`vmalert_config_last_reload_success_timestamp_seconds`)
)
func newManager(ctx context.Context) (*manager, error) {
q, err := datasource.Init()
if err != nil {
return nil, fmt.Errorf("failed to init datasource: %w", err)
}
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
if err != nil {
return nil, fmt.Errorf("failed to init `external.url`: %w", err)
}
notifier.InitTemplateFunc(eu)
aug, err := getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates)
if err != nil {
return nil, fmt.Errorf("failed to init `external.alert.source`: %w", err)
}
nts, err := notifier.Init(aug)
if err != nil {
return nil, fmt.Errorf("failed to init notifier: %w", err)
}
manager := &manager{
groups: make(map[uint64]*Group),
querier: q,
notifiers: nts,
}
rw, err := remotewrite.Init(ctx)
if err != nil {
return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
}
manager.rw = rw
rr, err := remoteread.Init()
if err != nil {
return nil, fmt.Errorf("failed to init remoteRead: %w", err)
}
manager.rr = rr
return manager, nil
}
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
if externalURL != "" {
return url.Parse(externalURL)
@@ -158,13 +159,39 @@ func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL
return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
}
func checkFlags() {
if *notifierURL == "" {
flag.PrintDefaults()
logger.Fatalf("notifier.url is empty")
func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, validateTemplate bool) (notifier.AlertURLGenerator, error) {
if externalAlertSource == "" {
return func(alert notifier.Alert) string {
return fmt.Sprintf("%s/api/v1/%s/%s/status", externalURL, strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10))
}, nil
}
if *datasourceURL == "" {
flag.PrintDefaults()
logger.Fatalf("datasource.url is empty")
if validateTemplate {
if err := notifier.ValidateTemplates(map[string]string{
"tpl": externalAlertSource,
}); err != nil {
return nil, fmt.Errorf("error validating source template %s: %w", externalAlertSource, err)
}
}
m := map[string]string{
"tpl": externalAlertSource,
}
return func(alert notifier.Alert) string {
templated, err := alert.ExecTemplate(m)
if err != nil {
logger.Errorf("can not exec source template %s", err)
}
return fmt.Sprintf("%s/%s", externalURL, templated["tpl"])
}, nil
}
func usage() {
const s = `
vmalert processes alerts and recording rules.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
}

53
app/vmalert/main_test.go Normal file
View File

@@ -0,0 +1,53 @@
package main
import (
"fmt"
"net/url"
"os"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
func TestGetExternalURL(t *testing.T) {
expURL := "https://vicotriametrics.com/path"
u, err := getExternalURL(expURL, "", false)
if err != nil {
t.Errorf("unexpected error %s", err)
}
if u.String() != expURL {
t.Errorf("unexpected url want %s, got %s", expURL, u.String())
}
h, _ := os.Hostname()
expURL = fmt.Sprintf("https://%s:4242", h)
u, err = getExternalURL("", "0.0.0.0:4242", true)
if err != nil {
t.Errorf("unexpected error %s", err)
}
if u.String() != expURL {
t.Errorf("unexpected url want %s, got %s", expURL, u.String())
}
}
func TestGetAlertURLGenerator(t *testing.T) {
testAlert := notifier.Alert{GroupID: 42, ID: 2, Value: 4}
u, _ := url.Parse("https://victoriametrics.com/path")
fn, err := getAlertURLGenerator(u, "", false)
if err != nil {
t.Errorf("unexpected error %s", err)
}
if exp := "https://victoriametrics.com/path/api/v1/42/2/status"; exp != fn(testAlert) {
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
}
_, err = getAlertURLGenerator(nil, "foo?{{invalid}}", true)
if err == nil {
t.Errorf("exptected tempalte validation error got nil")
}
fn, err = getAlertURLGenerator(u, "foo?query={{$value}}", true)
if err != nil {
t.Errorf("unexpected error %s", err)
}
if exp := "https://victoriametrics.com/path/foo?query=4"; exp != fn(testAlert) {
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
}
}

View File

@@ -6,15 +6,17 @@ import (
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// manager controls group states
type manager struct {
storage datasource.Querier
notifier notifier.Notifier
querier datasource.Querier
notifiers []notifier.Notifier
rw *remotewrite.Client
rr datasource.Querier
@@ -25,7 +27,7 @@ type manager struct {
groups map[uint64]*Group
}
// AlertAPI generates APIAlert object from alert by its id(hash)
// AlertAPI generates APIAlert object from alert by its ID(hash)
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
@@ -35,15 +37,19 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
return nil, fmt.Errorf("can't find group with id %q", gID)
}
for _, rule := range g.Rules {
if apiAlert := rule.AlertAPI(aID); apiAlert != nil {
ar, ok := rule.(*AlertingRule)
if !ok {
continue
}
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
return apiAlert, nil
}
}
return nil, fmt.Errorf("can't func alert with id %q in group %q", aID, g.Name)
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
}
func (m *manager) start(ctx context.Context, path []string, validate bool) error {
return m.update(ctx, path, validate, true)
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
return m.update(ctx, path, validateTpl, validateExpr, true)
}
func (m *manager) close() {
@@ -56,7 +62,7 @@ func (m *manager) close() {
m.wg.Wait()
}
func (m *manager) startGroup(ctx context.Context, group Group, restore bool) {
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
if restore && m.rr != nil {
err := group.Restore(ctx, m.rr, *remoteReadLookBack)
if err != nil {
@@ -67,21 +73,22 @@ func (m *manager) startGroup(ctx context.Context, group Group, restore bool) {
m.wg.Add(1)
id := group.ID()
go func() {
group.start(ctx, *evaluationInterval, m.storage, m.notifier, m.rw)
group.start(ctx, m.querier, m.notifiers, m.rw)
m.wg.Done()
}()
m.groups[id] = &group
m.groups[id] = group
}
func (m *manager) update(ctx context.Context, path []string, validate, restore bool) error {
logger.Infof("reading alert rules configuration file from %q", strings.Join(path, ";"))
newGroups, err := Parse(path, validate)
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
groupsCfg, err := config.Parse(path, validateTpl, validateExpr)
if err != nil {
return fmt.Errorf("cannot parse configuration file: %s", err)
return fmt.Errorf("cannot parse configuration file: %w", err)
}
groupsRegistry := make(map[uint64]Group)
for _, ng := range newGroups {
groupsRegistry := make(map[uint64]*Group)
for _, cfg := range groupsCfg {
ng := newGroup(cfg, *evaluationInterval)
groupsRegistry[ng.ID()] = ng
}
@@ -106,3 +113,23 @@ func (m *manager) update(ctx context.Context, path []string, validate, restore b
m.groupsMu.Unlock()
return nil
}
func (g *Group) toAPI() APIGroup {
ag := APIGroup{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
File: g.File,
Interval: g.Interval.String(),
Concurrency: g.Concurrency,
}
for _, r := range g.Rules {
switch v := r.(type) {
case *AlertingRule:
ag.AlertingRules = append(ag.AlertingRules, v.RuleAPI())
case *RecordingRule:
ag.RecordingRules = append(ag.RecordingRules, v.RuleAPI())
}
}
return ag
}

View File

@@ -3,16 +3,26 @@ package main
import (
"context"
"math/rand"
"net/url"
"os"
"strings"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
func TestMain(m *testing.M) {
u, _ := url.Parse("https://victoriametrics.com/path")
notifier.InitTemplateFunc(u)
os.Exit(m.Run())
}
func TestManagerUpdateError(t *testing.T) {
m := &manager{groups: make(map[uint64]*Group)}
path := []string{"foo/bar"}
err := m.update(context.Background(), path, true, false)
err := m.update(context.Background(), path, true, true, false)
if err == nil {
t.Fatalf("expected to have err; got nil instead")
}
@@ -27,17 +37,21 @@ func TestManagerUpdateError(t *testing.T) {
// Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
storage: &fakeQuerier{},
notifier: &fakeNotifier{},
groups: make(map[uint64]*Group),
querier: &fakeQuerier{},
notifiers: []notifier.Notifier{&fakeNotifier{}},
}
paths := []string{
"testdata/dir/rules0-good.rules",
"testdata/dir/rules1-good.rules",
"testdata/rules0-good.rules",
"config/testdata/dir/rules0-good.rules",
"config/testdata/dir/rules0-bad.rules",
"config/testdata/dir/rules1-good.rules",
"config/testdata/dir/rules1-bad.rules",
"config/testdata/rules0-good.rules",
"config/testdata/rules1-good.rules",
"config/testdata/rules2-good.rules",
}
*evaluationInterval = time.Millisecond
if err := m.start(context.Background(), []string{paths[0]}, true); err != nil {
if err := m.start(context.Background(), []string{paths[0]}, true, true); err != nil {
t.Fatalf("failed to start: %s", err)
}
@@ -51,10 +65,7 @@ func TestManagerUpdateConcurrent(t *testing.T) {
for i := 0; i < iterations; i++ {
rnd := rand.Intn(len(paths))
path := []string{paths[rnd]}
err := m.update(context.Background(), path, true, false)
if err != nil {
t.Errorf("update error: %s", err)
}
_ = m.update(context.Background(), path, true, true, false)
}
}()
}
@@ -64,6 +75,41 @@ func TestManagerUpdateConcurrent(t *testing.T) {
// TestManagerUpdate tests sequential configuration
// updates.
func TestManagerUpdate(t *testing.T) {
const defaultEvalInterval = time.Second * 30
currentEvalInterval := *evaluationInterval
*evaluationInterval = defaultEvalInterval
defer func() {
*evaluationInterval = currentEvalInterval
}()
var (
VMRows = &AlertingRule{
Name: "VMRows",
Expr: "vm_rows > 0",
For: 10 * time.Second,
Labels: map[string]string{
"label": "bar",
"host": "{{ $labels.instance }}",
},
Annotations: map[string]string{
"summary": "{{ $value|humanize }}",
"description": "{{$labels}}",
},
}
Conns = &AlertingRule{
Name: "Conns",
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
Annotations: map[string]string{
"summary": "Too high connection number for {{$labels.instance}}",
"description": "It is {{ $value }} connections for {{$labels.instance}}",
},
}
ExampleAlertAlwaysFiring = &AlertingRule{
Name: "ExampleAlertAlwaysFiring",
Expr: "sum by(job) (up == 1)",
}
)
testCases := []struct {
name string
initPath string
@@ -72,49 +118,65 @@ func TestManagerUpdate(t *testing.T) {
}{
{
name: "update good rules",
initPath: "testdata/rules0-good.rules",
updatePath: "testdata/dir/rules1-good.rules",
initPath: "config/testdata/rules0-good.rules",
updatePath: "config/testdata/dir/rules1-good.rules",
want: []*Group{
{
File: "testdata/dir/rules1-good.rules",
Name: "duplicatedGroupDiffFiles",
Rules: []*Rule{newTestRule("VMRows", time.Second*10)},
File: "config/testdata/dir/rules1-good.rules",
Name: "duplicatedGroupDiffFiles",
Interval: defaultEvalInterval,
Rules: []Rule{
&AlertingRule{
Name: "VMRows",
Expr: "vm_rows > 0",
For: 5 * time.Minute,
Labels: map[string]string{"label": "bar"},
Annotations: map[string]string{
"summary": "{{ $value }}",
"description": "{{$labels}}",
},
},
},
},
},
},
{
name: "update good rules from 1 to 2 groups",
initPath: "testdata/dir/rules1-good.rules",
updatePath: "testdata/rules0-good.rules",
initPath: "config/testdata/dir/rules1-good.rules",
updatePath: "config/testdata/rules0-good.rules",
want: []*Group{
{
File: "testdata/rules0-good.rules",
Name: "groupGorSingleAlert", Rules: []*Rule{
newTestRule("VMRows", time.Second*10),
}},
File: "config/testdata/rules0-good.rules",
Name: "groupGorSingleAlert",
Rules: []Rule{VMRows},
Interval: defaultEvalInterval,
},
{
File: "testdata/rules0-good.rules",
Name: "TestGroup", Rules: []*Rule{
newTestRule("Conns", time.Duration(0)),
newTestRule("ExampleAlertAlwaysFiring", time.Duration(0)),
File: "config/testdata/rules0-good.rules",
Interval: defaultEvalInterval,
Name: "TestGroup", Rules: []Rule{
Conns,
ExampleAlertAlwaysFiring,
}},
},
},
{
name: "update with one bad rule file",
initPath: "testdata/rules0-good.rules",
updatePath: "testdata/dir/rules2-bad.rules",
initPath: "config/testdata/rules0-good.rules",
updatePath: "config/testdata/dir/rules2-bad.rules",
want: []*Group{
{
File: "testdata/rules0-good.rules",
Name: "groupGorSingleAlert", Rules: []*Rule{
newTestRule("VMRows", time.Second*10),
}},
File: "config/testdata/rules0-good.rules",
Name: "groupGorSingleAlert",
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
},
{
File: "testdata/rules0-good.rules",
Name: "TestGroup", Rules: []*Rule{
newTestRule("Conns", time.Duration(0)),
newTestRule("ExampleAlertAlwaysFiring", time.Duration(0)),
File: "config/testdata/rules0-good.rules",
Interval: defaultEvalInterval,
Name: "TestGroup", Rules: []Rule{
Conns,
ExampleAlertAlwaysFiring,
}},
},
},
@@ -122,14 +184,14 @@ func TestManagerUpdate(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
m := &manager{groups: make(map[uint64]*Group), storage: &fakeQuerier{}}
m := &manager{groups: make(map[uint64]*Group), querier: &fakeQuerier{}}
path := []string{tc.initPath}
if err := m.update(ctx, path, true, false); err != nil {
if err := m.update(ctx, path, true, true, false); err != nil {
t.Fatalf("failed to complete initial rules update: %s", err)
}
path = []string{tc.updatePath}
_ = m.update(ctx, path, true, false)
_ = m.update(ctx, path, true, true, false)
if len(tc.want) != len(m.groups) {
t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups))
}
@@ -139,7 +201,7 @@ func TestManagerUpdate(t *testing.T) {
if !ok {
t.Fatalf("expected to have group %q", wantG.Name)
}
compareGroups(t, gotG, wantG)
compareGroups(t, wantG, gotG)
}
cancel()
@@ -147,17 +209,3 @@ func TestManagerUpdate(t *testing.T) {
})
}
}
func compareGroups(t *testing.T, a, b *Group) {
t.Helper()
if len(a.Rules) != len(b.Rules) {
t.Fatalf("expected group %s to have %d rules; got: %d",
a.Name, len(a.Rules), len(b.Rules))
}
for i, r := range a.Rules {
got, want := r, b.Rules[i]
if got.Name != want.Name {
t.Fatalf("expected to have rule %q; got %q", want.Name, got.Name)
}
}
}

View File

@@ -7,6 +7,8 @@ import (
"strings"
"text/template"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Alert the triggered alert
@@ -77,7 +79,7 @@ func ValidateTemplates(annotations map[string]string) error {
func templateAnnotations(annotations map[string]string, header string, data alertTplData) (map[string]string, error) {
var builder strings.Builder
var buf bytes.Buffer
eg := errGroup{}
eg := new(utils.ErrGroup)
r := make(map[string]string, len(annotations))
for key, text := range annotations {
r[key] = text
@@ -87,21 +89,21 @@ func templateAnnotations(annotations map[string]string, header string, data aler
builder.WriteString(header)
builder.WriteString(text)
if err := templateAnnotation(&buf, builder.String(), data); err != nil {
eg.errs = append(eg.errs, fmt.Sprintf("key %s, template %s:%s", key, text, err))
eg.Add(fmt.Errorf("key %q, template %q: %w", key, text, err))
continue
}
r[key] = buf.String()
}
return r, eg.err()
return r, eg.Err()
}
func templateAnnotation(dst io.Writer, text string, data alertTplData) error {
tpl, err := template.New("").Funcs(tmplFunc).Option("missingkey=zero").Parse(text)
if err != nil {
return fmt.Errorf("error parsing annotation:%w", err)
return fmt.Errorf("error parsing annotation: %w", err)
}
if err = tpl.Execute(dst, data); err != nil {
return fmt.Errorf("error evaluating annotation template:%w", err)
return fmt.Errorf("error evaluating annotation template: %w", err)
}
return nil
}

View File

@@ -1,13 +1,10 @@
package notifier
import (
"net/url"
"testing"
)
func TestAlert_ExecTemplate(t *testing.T) {
u, _ := url.Parse("https://victoriametrics.com/path")
InitTemplateFunc(u)
testCases := []struct {
name string
alert *Alert

View File

@@ -12,9 +12,11 @@ import (
// AlertManager represents integration provider with Prometheus alert manager
// https://github.com/prometheus/alertmanager
type AlertManager struct {
alertURL string
argFunc AlertURLGenerator
client *http.Client
alertURL string
basicAuthUser string
basicAuthPass string
argFunc AlertURLGenerator
client *http.Client
}
// Send an alert or resolve message
@@ -28,6 +30,9 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
}
req.Header.Set("Content-Type", "application/json")
req = req.WithContext(ctx)
if am.basicAuthPass != "" {
req.SetBasicAuth(am.basicAuthUser, am.basicAuthPass)
}
resp, err := am.client.Do(req)
if err != nil {
return err
@@ -38,7 +43,7 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
if resp.StatusCode != http.StatusOK {
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read response from %q: %s", am.alertURL, err)
return fmt.Errorf("failed to read response from %q: %w", am.alertURL, err)
}
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.alertURL, string(body))
}
@@ -46,15 +51,18 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
}
// AlertURLGenerator returns URL to single alert by given name
type AlertURLGenerator func(group, alert string) string
type AlertURLGenerator func(Alert) string
const alertManagerPath = "/api/v2/alerts"
// NewAlertManager is a constructor for AlertManager
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, c *http.Client) *AlertManager {
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
return &AlertManager{
alertURL: strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath,
argFunc: fn,
client: c,
alertURL: addr,
argFunc: fn,
client: c,
basicAuthUser: user,
basicAuthPass: pass,
}
}

View File

@@ -1,15 +1,14 @@
{% import (
"strconv"
"time"
) %}
{% stripspace %}
{% func amRequest(alerts []Alert, generatorURL func(string, string) string) %}
{% func amRequest(alerts []Alert, generatorURL func(Alert) string) %}
[
{% for i, alert := range alerts %}
{
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
"generatorURL": {%q= generatorURL(strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10)) %},
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
"generatorURL": {%q= generatorURL(alert) %},
{% if !alert.End.IsZero() %}
"endsAt":{%q= alert.End.Format(time.RFC3339Nano) %},
{% endif %}

View File

@@ -6,126 +6,125 @@ package notifier
//line app/vmalert/notifier/alertmanager_request.qtpl:1
import (
"strconv"
"time"
)
//line app/vmalert/notifier/alertmanager_request.qtpl:7
//line app/vmalert/notifier/alertmanager_request.qtpl:6
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vmalert/notifier/alertmanager_request.qtpl:7
//line app/vmalert/notifier/alertmanager_request.qtpl:6
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
//line app/vmalert/notifier/alertmanager_request.qtpl:7
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
//line app/vmalert/notifier/alertmanager_request.qtpl:7
//line app/vmalert/notifier/alertmanager_request.qtpl:6
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
//line app/vmalert/notifier/alertmanager_request.qtpl:6
qw422016.N().S(`[`)
//line app/vmalert/notifier/alertmanager_request.qtpl:9
//line app/vmalert/notifier/alertmanager_request.qtpl:8
for i, alert := range alerts {
//line app/vmalert/notifier/alertmanager_request.qtpl:9
//line app/vmalert/notifier/alertmanager_request.qtpl:8
qw422016.N().S(`{"startsAt":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:11
//line app/vmalert/notifier/alertmanager_request.qtpl:10
qw422016.N().Q(alert.Start.Format(time.RFC3339Nano))
//line app/vmalert/notifier/alertmanager_request.qtpl:11
//line app/vmalert/notifier/alertmanager_request.qtpl:10
qw422016.N().S(`,"generatorURL":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:12
qw422016.N().Q(generatorURL(strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10)))
//line app/vmalert/notifier/alertmanager_request.qtpl:12
//line app/vmalert/notifier/alertmanager_request.qtpl:11
qw422016.N().Q(generatorURL(alert))
//line app/vmalert/notifier/alertmanager_request.qtpl:11
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:13
//line app/vmalert/notifier/alertmanager_request.qtpl:12
if !alert.End.IsZero() {
//line app/vmalert/notifier/alertmanager_request.qtpl:13
//line app/vmalert/notifier/alertmanager_request.qtpl:12
qw422016.N().S(`"endsAt":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:14
//line app/vmalert/notifier/alertmanager_request.qtpl:13
qw422016.N().Q(alert.End.Format(time.RFC3339Nano))
//line app/vmalert/notifier/alertmanager_request.qtpl:13
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:14
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:15
}
//line app/vmalert/notifier/alertmanager_request.qtpl:15
//line app/vmalert/notifier/alertmanager_request.qtpl:14
qw422016.N().S(`"labels": {"alertname":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:17
//line app/vmalert/notifier/alertmanager_request.qtpl:16
qw422016.N().Q(alert.Name)
//line app/vmalert/notifier/alertmanager_request.qtpl:18
//line app/vmalert/notifier/alertmanager_request.qtpl:17
for k, v := range alert.Labels {
//line app/vmalert/notifier/alertmanager_request.qtpl:18
//line app/vmalert/notifier/alertmanager_request.qtpl:17
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:19
//line app/vmalert/notifier/alertmanager_request.qtpl:18
qw422016.N().Q(k)
//line app/vmalert/notifier/alertmanager_request.qtpl:19
//line app/vmalert/notifier/alertmanager_request.qtpl:18
qw422016.N().S(`:`)
//line app/vmalert/notifier/alertmanager_request.qtpl:19
//line app/vmalert/notifier/alertmanager_request.qtpl:18
qw422016.N().Q(v)
//line app/vmalert/notifier/alertmanager_request.qtpl:20
//line app/vmalert/notifier/alertmanager_request.qtpl:19
}
//line app/vmalert/notifier/alertmanager_request.qtpl:20
//line app/vmalert/notifier/alertmanager_request.qtpl:19
qw422016.N().S(`},"annotations": {`)
//line app/vmalert/notifier/alertmanager_request.qtpl:23
//line app/vmalert/notifier/alertmanager_request.qtpl:22
c := len(alert.Annotations)
//line app/vmalert/notifier/alertmanager_request.qtpl:24
//line app/vmalert/notifier/alertmanager_request.qtpl:23
for k, v := range alert.Annotations {
//line app/vmalert/notifier/alertmanager_request.qtpl:25
//line app/vmalert/notifier/alertmanager_request.qtpl:24
c = c - 1
//line app/vmalert/notifier/alertmanager_request.qtpl:26
//line app/vmalert/notifier/alertmanager_request.qtpl:25
qw422016.N().Q(k)
//line app/vmalert/notifier/alertmanager_request.qtpl:26
//line app/vmalert/notifier/alertmanager_request.qtpl:25
qw422016.N().S(`:`)
//line app/vmalert/notifier/alertmanager_request.qtpl:26
//line app/vmalert/notifier/alertmanager_request.qtpl:25
qw422016.N().Q(v)
//line app/vmalert/notifier/alertmanager_request.qtpl:26
//line app/vmalert/notifier/alertmanager_request.qtpl:25
if c > 0 {
//line app/vmalert/notifier/alertmanager_request.qtpl:26
//line app/vmalert/notifier/alertmanager_request.qtpl:25
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:26
//line app/vmalert/notifier/alertmanager_request.qtpl:25
}
//line app/vmalert/notifier/alertmanager_request.qtpl:27
//line app/vmalert/notifier/alertmanager_request.qtpl:26
}
//line app/vmalert/notifier/alertmanager_request.qtpl:27
//line app/vmalert/notifier/alertmanager_request.qtpl:26
qw422016.N().S(`}}`)
//line app/vmalert/notifier/alertmanager_request.qtpl:30
//line app/vmalert/notifier/alertmanager_request.qtpl:29
if i != len(alerts)-1 {
//line app/vmalert/notifier/alertmanager_request.qtpl:30
//line app/vmalert/notifier/alertmanager_request.qtpl:29
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:30
//line app/vmalert/notifier/alertmanager_request.qtpl:29
}
//line app/vmalert/notifier/alertmanager_request.qtpl:31
//line app/vmalert/notifier/alertmanager_request.qtpl:30
}
//line app/vmalert/notifier/alertmanager_request.qtpl:31
//line app/vmalert/notifier/alertmanager_request.qtpl:30
qw422016.N().S(`]`)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
}
//line app/vmalert/notifier/alertmanager_request.qtpl:33
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
streamamRequest(qw422016, alerts, generatorURL)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qt422016.ReleaseWriter(qw422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
}
//line app/vmalert/notifier/alertmanager_request.qtpl:33
func amRequest(alerts []Alert, generatorURL func(string, string) string) string {
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
func amRequest(alerts []Alert, generatorURL func(Alert) string) string {
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
writeamRequest(qb422016, alerts, generatorURL)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qs422016 := string(qb422016.B)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
return qs422016
//line app/vmalert/notifier/alertmanager_request.qtpl:33
//line app/vmalert/notifier/alertmanager_request.qtpl:32
}

View File

@@ -5,17 +5,27 @@ import (
"encoding/json"
"net/http"
"net/http/httptest"
"strconv"
"testing"
"time"
)
func TestAlertManager_Send(t *testing.T) {
const baUser, baPass = "foo", "bar"
mux := http.NewServeMux()
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
t.Errorf("should not be called")
})
c := -1
mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) {
user, pass, ok := r.BasicAuth()
if !ok {
t.Errorf("unauthorized request")
}
if user != baUser || pass != baPass {
t.Errorf("wrong creds %q:%q; expected %q:%q",
user, pass, baUser, baPass)
}
c++
if r.Method != http.MethodPost {
t.Errorf("expected POST method got %s", r.Method)
@@ -57,8 +67,8 @@ func TestAlertManager_Send(t *testing.T) {
})
srv := httptest.NewServer(mux)
defer srv.Close()
am := NewAlertManager(srv.URL, func(group, name string) string {
return group + "/" + name
am := NewAlertManager(srv.URL, baUser, baPass, func(alert Alert) string {
return strconv.FormatUint(alert.GroupID, 10) + "/" + strconv.FormatUint(alert.ID, 10)
}, srv.Client())
if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {
t.Error("expected connection error got nil")

View File

@@ -0,0 +1,47 @@
package notifier
import (
"flag"
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
)
var (
addrs = flagutil.NewArray("notifier.url", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
basicAuthUsername = flagutil.NewArray("notifier.basicAuth.username", "Optional basic auth username for -datasource.url")
basicAuthPassword = flagutil.NewArray("notifier.basicAuth.password", "Optional basic auth password for -datasource.url")
tlsInsecureSkipVerify = flag.Bool("notifier.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -notifier.url")
tlsCertFile = flagutil.NewArray("notifier.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
tlsKeyFile = flagutil.NewArray("notifier.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -notifier.url")
tlsCAFile = flagutil.NewArray("notifier.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -notifier.url. "+
"By default system CA is used")
tlsServerName = flagutil.NewArray("notifier.tlsServerName", "Optional TLS server name to use for connections to -notifier.url. "+
"By default the server name from -notifier.url is used")
)
// Init creates a Notifier object based on provided flags.
func Init(gen AlertURLGenerator) ([]Notifier, error) {
if len(*addrs) == 0 {
flag.PrintDefaults()
return nil, fmt.Errorf("at least one `-notifier.url` must be set")
}
var notifiers []Notifier
for i, addr := range *addrs {
cert, key := tlsCertFile.GetOptionalArg(i), tlsKeyFile.GetOptionalArg(i)
ca, serverName := tlsCAFile.GetOptionalArg(i), tlsServerName.GetOptionalArg(i)
tr, err := utils.Transport(addr, cert, key, ca, serverName, *tlsInsecureSkipVerify)
if err != nil {
return nil, fmt.Errorf("failed to create transport: %w", err)
}
user, pass := basicAuthUsername.GetOptionalArg(i), basicAuthPassword.GetOptionalArg(i)
am := NewAlertManager(addr, user, pass, gen, &http.Client{Transport: tr})
notifiers = append(notifiers, am)
}
return notifiers, nil
}

View File

@@ -0,0 +1,13 @@
package notifier
import (
"net/url"
"os"
"testing"
)
func TestMain(m *testing.M) {
u, _ := url.Parse("https://victoriametrics.com/path")
InitTemplateFunc(u)
os.Exit(m.Run())
}

View File

@@ -1,21 +0,0 @@
package notifier
import (
"fmt"
"strings"
)
type errGroup struct {
errs []string
}
func (eg *errGroup) err() error {
if eg == nil || len(eg.errs) == 0 {
return nil
}
return eg
}
func (eg *errGroup) Error() string {
return fmt.Sprintf("errors:%s", strings.Join(eg.errs, "\n"))
}

149
app/vmalert/recording.go Normal file
View File

@@ -0,0 +1,149 @@
package main
import (
"context"
"errors"
"fmt"
"hash/fnv"
"sort"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// RecordingRule is a Rule that supposed
// to evaluate configured Expression and
// return TimeSeries as result.
type RecordingRule struct {
RuleID uint64
Name string
Expr string
Labels map[string]string
GroupID uint64
// guard status fields
mu sync.RWMutex
// stores last moment of time Exec was called
lastExecTime time.Time
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
lastExecError error
}
// String implements Stringer interface
func (rr *RecordingRule) String() string {
return rr.Name
}
// ID returns unique Rule ID
// within the parent Group.
func (rr *RecordingRule) ID() uint64 {
return rr.RuleID
}
func newRecordingRule(gID uint64, cfg config.Rule) *RecordingRule {
return &RecordingRule{
RuleID: cfg.ID,
Name: cfg.Record,
Expr: cfg.Expr,
Labels: cfg.Labels,
GroupID: gID,
}
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
if !series {
return nil, nil
}
qMetrics, err := q.Query(ctx, rr.Expr)
rr.mu.Lock()
defer rr.mu.Unlock()
rr.lastExecTime = time.Now()
rr.lastExecError = err
if err != nil {
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
}
duplicates := make(map[uint64]prompbmarshal.TimeSeries, len(qMetrics))
var tss []prompbmarshal.TimeSeries
for _, r := range qMetrics {
ts := rr.toTimeSeries(r, rr.lastExecTime)
h := hashTimeSeries(ts)
if _, ok := duplicates[h]; ok {
rr.lastExecError = errDuplicate
return nil, errDuplicate
}
duplicates[h] = ts
tss = append(tss, ts)
}
return tss, nil
}
func hashTimeSeries(ts prompbmarshal.TimeSeries) uint64 {
hash := fnv.New64a()
labels := ts.Labels
sort.Slice(labels, func(i, j int) bool {
return labels[i].Name < labels[j].Name
})
for _, l := range labels {
hash.Write([]byte(l.Name))
hash.Write([]byte(l.Value))
hash.Write([]byte("\xff"))
}
return hash.Sum64()
}
func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time) prompbmarshal.TimeSeries {
labels := make(map[string]string)
for _, l := range m.Labels {
labels[l.Name] = l.Value
}
labels["__name__"] = rr.Name
// override existing labels with configured ones
for k, v := range rr.Labels {
labels[k] = v
}
return newTimeSeries(m.Value, labels, timestamp)
}
// UpdateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (rr *RecordingRule) UpdateWith(r Rule) error {
nr, ok := r.(*RecordingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
}
rr.Expr = nr.Expr
rr.Labels = nr.Labels
return nil
}
// RuleAPI returns Rule representation in form
// of APIRecordingRule
func (rr *RecordingRule) RuleAPI() APIRecordingRule {
var lastErr string
if rr.lastExecError != nil {
lastErr = rr.lastExecError.Error()
}
return APIRecordingRule{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
Name: rr.Name,
Expression: rr.Expr,
LastError: lastErr,
LastExec: rr.lastExecTime,
Labels: rr.Labels,
}
}

View File

@@ -0,0 +1,121 @@
package main
import (
"context"
"errors"
"strings"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func TestRecoridngRule_ToTimeSeries(t *testing.T) {
timestamp := time.Now()
testCases := []struct {
rule *RecordingRule
metrics []datasource.Metric
expTS []prompbmarshal.TimeSeries
}{
{
&RecordingRule{Name: "foo"},
[]datasource.Metric{metricWithValueAndLabels(t, 10,
"__name__", "bar",
)},
[]prompbmarshal.TimeSeries{
newTimeSeries(10, map[string]string{
"__name__": "foo",
}, timestamp),
},
},
{
&RecordingRule{Name: "foobarbaz"},
[]datasource.Metric{
metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": "foobarbaz",
"job": "foo",
}, timestamp),
newTimeSeries(2, map[string]string{
"__name__": "foobarbaz",
"job": "bar",
}, timestamp),
newTimeSeries(3, map[string]string{
"__name__": "foobarbaz",
"job": "baz",
}, timestamp),
},
},
{
&RecordingRule{Name: "job:foo", Labels: map[string]string{
"source": "test",
}},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
[]prompbmarshal.TimeSeries{
newTimeSeries(2, map[string]string{
"__name__": "job:foo",
"job": "foo",
"source": "test",
}, timestamp),
newTimeSeries(1, map[string]string{
"__name__": "job:foo",
"job": "bar",
"source": "test",
}, timestamp),
},
},
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
tss, err := tc.rule.Exec(context.TODO(), fq, true)
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
t.Fatalf("timeseries missmatch: %s", err)
}
})
}
}
func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
"job": "test",
}}
fq := &fakeQuerier{}
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
_, err := rr.Exec(context.TODO(), fq, true)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
if !strings.Contains(err.Error(), expErr) {
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
}
fq.reset()
// add metrics which differs only by `job` label
// which will be overridden by rule
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
_, err = rr.Exec(context.TODO(), fq, true)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
if !strings.Contains(err.Error(), errDuplicate.Error()) {
t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
}
}

View File

@@ -0,0 +1,39 @@
package remoteread
import (
"flag"
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
var (
addr = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
" E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
tlsCertFile = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
tlsKeyFile = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
tlsCAFile = flag.String("remoteRead.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. "+
"By default system CA is used")
tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
"By default the server name from -remoteRead.url is used")
)
// Init creates a Querier from provided flag values.
// Returns nil if addr flag wasn't set.
func Init() (datasource.Querier, error) {
if *addr == "" {
return nil, nil
}
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
if err != nil {
return nil, fmt.Errorf("failed to create transport: %w", err)
}
c := &http.Client{Transport: tr}
return datasource.NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, c), nil
}

View File

@@ -0,0 +1,54 @@
package remotewrite
import (
"context"
"flag"
"fmt"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
var (
addr = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
maxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines defines max number of timeseries to be flushed at once")
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
"By default system CA is used")
tlsServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
"By default the server name from -remoteWrite.url is used")
)
// Init creates Client object from given flags.
// Returns nil if addr flag wasn't set.
func Init(ctx context.Context) (*Client, error) {
if *addr == "" {
return nil, nil
}
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
if err != nil {
return nil, fmt.Errorf("failed to create transport: %w", err)
}
return NewClient(ctx, Config{
Addr: *addr,
Concurrency: *concurrency,
MaxQueueSize: *maxQueueSize,
MaxBatchSize: *maxBatchSize,
FlushInterval: *flushInterval,
BasicAuthUser: *basicAuthUsername,
BasicAuthPass: *basicAuthPassword,
Transport: t,
})
}

View File

@@ -38,23 +38,30 @@ type Config struct {
BasicAuthUser string
BasicAuthPass string
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// WriteTimeout defines timeout for HTTP write request
// to remote storage
WriteTimeout time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 100
defaultFlushInterval = 5 * time.Second
defaultMaxQueueSize = 1e5
defaultFlushInterval = time.Second
defaultWriteTimeout = 30 * time.Second
)
@@ -80,7 +87,8 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
}
c := &Client{
c: &http.Client{
Timeout: cfg.WriteTimeout,
Timeout: cfg.WriteTimeout,
Transport: cfg.Transport,
},
addr: strings.TrimSuffix(cfg.Addr, "/") + writePath,
baUser: cfg.BasicAuthUser,
@@ -90,7 +98,13 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
doneCh: make(chan struct{}),
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
}
c.run(ctx)
cc := defaultConcurrency
if cfg.Concurrency > 0 {
cc = cfg.Concurrency
}
for i := 0; i < cc; i++ {
c.run(ctx)
}
return c, nil
}
@@ -128,7 +142,10 @@ func (c *Client) run(ctx context.Context) {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
lastCtx, cancel := context.WithTimeout(context.Background(), time.Second*10)
if len(wr.Timeseries) < 1 {
return
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
c.flush(lastCtx, wr)
cancel()
}

View File

@@ -2,339 +2,23 @@ package main
import (
"context"
"errors"
"fmt"
"hash/fnv"
"sort"
"strconv"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metricsql"
)
// Rule is basic alert entity
type Rule struct {
Name string `yaml:"alert"`
Expr string `yaml:"expr"`
For time.Duration `yaml:"for"`
Labels map[string]string `yaml:"labels"`
Annotations map[string]string `yaml:"annotations"`
group Group
// guard status fields
mu sync.RWMutex
// stores list of active alerts
alerts map[uint64]*notifier.Alert
// stores last moment of time Exec was called
lastExecTime time.Time
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
lastExecError error
}
func (r *Rule) id() string {
return r.Name
}
// Validate validates rule
func (r *Rule) Validate() error {
if r.Name == "" {
return errors.New("rule name can not be empty")
}
if r.Expr == "" {
return fmt.Errorf("expression for rule %q can't be empty", r.Name)
}
if _, err := metricsql.Parse(r.Expr); err != nil {
return fmt.Errorf("invalid expression for rule %q: %w", r.Name, err)
}
return nil
}
// Exec executes Rule expression via the given Querier.
// Based on the Querier results Rule maintains notifier.Alerts
func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
qMetrics, err := q.Query(ctx, r.Expr)
r.mu.Lock()
defer r.mu.Unlock()
r.lastExecError = err
r.lastExecTime = time.Now()
if err != nil {
return fmt.Errorf("failed to execute query %q: %s", r.Expr, err)
}
for h, a := range r.alerts {
// cleanup inactive alerts from previous Exec
if a.State == notifier.StateInactive {
delete(r.alerts, h)
}
}
updated := make(map[uint64]struct{})
// update list of active alerts
for _, m := range qMetrics {
h := hash(m)
updated[h] = struct{}{}
if a, ok := r.alerts[h]; ok {
if a.Value != m.Value {
// update Value field with latest value
a.Value = m.Value
// and re-exec template since Value can be used
// in templates
err = r.template(a)
if err != nil {
return err
}
}
continue
}
a, err := r.newAlert(m)
if err != nil {
r.lastExecError = err
return fmt.Errorf("failed to create alert: %s", err)
}
a.ID = h
a.State = notifier.StatePending
r.alerts[h] = a
}
for h, a := range r.alerts {
// if alert wasn't updated in this iteration
// means it is resolved already
if _, ok := updated[h]; !ok {
if a.State == notifier.StatePending {
// alert was in Pending state - it is not
// active anymore
delete(r.alerts, h)
continue
}
a.State = notifier.StateInactive
continue
}
if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
a.State = notifier.StateFiring
alertsFired.Inc()
}
}
return nil
}
// TODO: consider hashing algorithm in VM
func hash(m datasource.Metric) uint64 {
hash := fnv.New64a()
labels := m.Labels
sort.Slice(labels, func(i, j int) bool {
return labels[i].Name < labels[j].Name
})
for _, l := range labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
continue
}
hash.Write([]byte(l.Name))
hash.Write([]byte(l.Value))
hash.Write([]byte("\xff"))
}
return hash.Sum64()
}
func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
a := &notifier.Alert{
GroupID: r.group.ID(),
Name: r.Name,
Labels: map[string]string{},
Value: m.Value,
Start: time.Now(),
Expr: r.Expr,
// TODO: support End time
}
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
continue
}
a.Labels[l.Name] = l.Value
}
return a, r.template(a)
}
func (r *Rule) template(a *notifier.Alert) error {
// 1. template rule labels with data labels
rLabels, err := a.ExecTemplate(r.Labels)
if err != nil {
return err
}
// 2. merge data labels and rule labels
// metric labels may be overridden by
// rule labels
for k, v := range rLabels {
a.Labels[k] = v
}
// 3. template merged labels
a.Labels, err = a.ExecTemplate(a.Labels)
if err != nil {
return err
}
a.Annotations, err = a.ExecTemplate(r.Annotations)
return err
}
// AlertAPI generates APIAlert object from alert by its id(hash)
func (r *Rule) AlertAPI(id uint64) *APIAlert {
r.mu.RLock()
defer r.mu.RUnlock()
a, ok := r.alerts[id]
if !ok {
return nil
}
return r.newAlertAPI(*a)
}
// AlertsAPI generates list of APIAlert objects from existing alerts
func (r *Rule) AlertsAPI() []*APIAlert {
var alerts []*APIAlert
r.mu.RLock()
for _, a := range r.alerts {
alerts = append(alerts, r.newAlertAPI(*a))
}
r.mu.RUnlock()
return alerts
}
func (r *Rule) newAlertAPI(a notifier.Alert) *APIAlert {
return &APIAlert{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
GroupID: fmt.Sprintf("%d", a.GroupID),
Name: a.Name,
Expression: r.Expr,
Labels: a.Labels,
Annotations: a.Annotations,
State: a.State.String(),
ActiveAt: a.Start,
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
}
}
const (
// AlertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS"
// AlertForStateMetricName is the metric name for 'for' state of alert.
alertForStateMetricName = "ALERTS_FOR_STATE"
// AlertNameLabel is the label name indicating the name of an alert.
alertNameLabel = "alertname"
// AlertStateLabel is the label name indicating the state of an alert.
alertStateLabel = "alertstate"
)
// AlertToTimeSeries converts the given alert with the given timestamp to timeseries
func (r *Rule) AlertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries
tss = append(tss, alertToTimeSeries(r.Name, a, timestamp))
if r.For > 0 {
tss = append(tss, alertForToTimeSeries(r.Name, a, timestamp))
}
return tss
}
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
labels := make(map[string]string)
for k, v := range a.Labels {
labels[k] = v
}
labels["__name__"] = alertMetricName
labels[alertNameLabel] = name
labels[alertStateLabel] = a.State.String()
return newTimeSeries(1, labels, timestamp)
}
// alertForToTimeSeries returns a timeseries that represents
// state of active alerts, where value is time when alert become active
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
labels := make(map[string]string)
for k, v := range a.Labels {
labels[k] = v
}
labels["__name__"] = alertForStateMetricName
labels[alertNameLabel] = name
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
}
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
ts := prompbmarshal.TimeSeries{}
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
Value: value,
Timestamp: timestamp.UnixNano() / 1e6,
})
keys := make([]string, 0, len(labels))
for k := range labels {
keys = append(keys, k)
}
sort.Strings(keys)
for _, key := range keys {
ts.Labels = append(ts.Labels, prompbmarshal.Label{
Name: key,
Value: labels[key],
})
}
return ts
}
// Restore restores the state of active alerts basing on previously written timeseries.
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Exec, as well as Value field.
func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
if q == nil {
return fmt.Errorf("querier is nil")
}
// Get the last datapoint in range via MetricsQL `last_over_time`.
// We don't use plain PromQL since Prometheus doesn't support
// remote write protocol which is used for state persistence in vmalert.
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
alertForStateMetricName, r.Name, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr)
if err != nil {
return err
}
for _, m := range qMetrics {
labels := m.Labels
m.Labels = make([]datasource.Label, 0)
// drop all extra labels, so hash key will
// be identical to timeseries received in Exec
for _, l := range labels {
if l.Name == alertNameLabel {
continue
}
// drop all overridden labels
if _, ok := r.Labels[l.Name]; ok {
continue
}
m.Labels = append(m.Labels, l)
}
a, err := r.newAlert(m)
if err != nil {
return fmt.Errorf("failed to create alert: %s", err)
}
a.ID = hash(m)
a.State = notifier.StatePending
a.Start = time.Unix(int64(m.Value), 0)
r.alerts[a.ID] = a
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
}
return nil
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// Returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// Exec executes the rule with given context
// and Querier. If returnSeries is true, Exec
// may return TimeSeries as result of execution
Exec(ctx context.Context, q datasource.Querier, returnSeries bool) ([]prompbmarshal.TimeSeries, error)
// UpdateWith performs modification of current Rule
// with fields of the given Rule.
UpdateWith(Rule) error
}

28
app/vmalert/utils.go Normal file
View File

@@ -0,0 +1,28 @@
package main
import (
"sort"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
ts := prompbmarshal.TimeSeries{}
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
Value: value,
Timestamp: timestamp.UnixNano() / 1e6,
})
keys := make([]string, 0, len(labels))
for k := range labels {
keys = append(keys, k)
}
sort.Strings(keys)
for _, key := range keys {
ts.Labels = append(ts.Labels, prompbmarshal.Label{
Name: key,
Value: labels[key],
})
}
return ts
}

View File

@@ -0,0 +1,43 @@
package utils
import (
"fmt"
"strings"
)
// ErrGroup accumulates multiple errors
// and produces single error message.
type ErrGroup struct {
errs []error
}
// Add adds a new error to group.
// Isn't thread-safe.
func (eg *ErrGroup) Add(err error) {
eg.errs = append(eg.errs, err)
}
// Err checks if group contains at least
// one error.
func (eg *ErrGroup) Err() error {
if eg == nil || len(eg.errs) == 0 {
return nil
}
return eg
}
// Error satisfies Error interface
func (eg *ErrGroup) Error() string {
if len(eg.errs) == 0 {
return ""
}
var b strings.Builder
fmt.Fprintf(&b, "errors(%d): ", len(eg.errs))
for i, err := range eg.errs {
b.WriteString(err.Error())
if i != len(eg.errs)-1 {
b.WriteString("\n")
}
}
return b.String()
}

View File

@@ -0,0 +1,38 @@
package utils
import (
"errors"
"testing"
)
func TestErrGroup(t *testing.T) {
testCases := []struct {
errs []error
exp string
}{
{nil, ""},
{[]error{errors.New("timeout")}, "errors(1): timeout"},
{
[]error{errors.New("timeout"), errors.New("deadline")},
"errors(2): timeout\ndeadline",
},
}
for _, tc := range testCases {
eg := new(ErrGroup)
for _, err := range tc.errs {
eg.Add(err)
}
if len(tc.errs) == 0 {
if eg.Err() != nil {
t.Fatalf("expected to get nil error")
}
continue
}
if eg.Err() == nil {
t.Fatalf("expected to get non-nil error")
}
if eg.Error() != tc.exp {
t.Fatalf("expected to have: \n%q\ngot:\n%q", tc.exp, eg.Error())
}
}
}

58
app/vmalert/utils/tls.go Normal file
View File

@@ -0,0 +1,58 @@
package utils
import (
"crypto/tls"
"crypto/x509"
"fmt"
"io/ioutil"
"net/http"
"strings"
)
// Transport creates http.Transport object based on provided URL.
// Returns Transport with TLS configuration if URL contains `https` prefix
func Transport(URL, certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*http.Transport, error) {
t := http.DefaultTransport.(*http.Transport).Clone()
if !strings.HasPrefix(URL, "https") {
return t, nil
}
tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
if err != nil {
return nil, err
}
t.TLSClientConfig = tlsCfg
return t, nil
}
// TLSConfig creates tls.Config object from provided arguments
func TLSConfig(certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*tls.Config, error) {
var certs []tls.Certificate
if certFile != "" {
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
if err != nil {
return nil, fmt.Errorf("cannot load TLS certificate from `cert_file`=%q, `key_file`=%q: %w", certFile, keyFile, err)
}
certs = []tls.Certificate{cert}
}
var rootCAs *x509.CertPool
if CAFile != "" {
pem, err := ioutil.ReadFile(CAFile)
if err != nil {
return nil, fmt.Errorf("cannot read `ca_file` %q: %w", CAFile, err)
}
rootCAs = x509.NewCertPool()
if !rootCAs.AppendCertsFromPEM(pem) {
return nil, fmt.Errorf("cannot parse data from `ca_file` %q", CAFile)
}
}
return &tls.Config{
Certificates: certs,
InsecureSkipVerify: insecureSkipVerify,
RootCAs: rootCAs,
ServerName: serverName,
}, nil
}

View File

@@ -0,0 +1,52 @@
package utils
import "testing"
func TestTLSConfig(t *testing.T) {
var certFile, keyFile, CAFile, serverName string
var insecureSkipVerify bool
serverName = "test"
insecureSkipVerify = true
tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
if err != nil {
t.Errorf("unexpected error %s", err)
}
if tlsCfg == nil {
t.Errorf("expected tlsConfig to be set, got nil")
}
if tlsCfg.ServerName != serverName {
t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)
}
if tlsCfg.InsecureSkipVerify != insecureSkipVerify {
t.Errorf("unexpected InsecureSkipVerify, want %v, got %v", insecureSkipVerify, tlsCfg.InsecureSkipVerify)
}
certFile = "/path/to/nonexisting/cert/file"
_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
if err == nil {
t.Errorf("expected keypair error, got nil")
}
certFile = ""
CAFile = "/path/to/nonexisting/cert/file"
_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
if err == nil {
t.Errorf("expected read error, got nil")
}
}
func TestTransport(t *testing.T) {
var certFile, keyFile, CAFile, serverName string
var insecureSkipVerify bool
URL := "http://victoriametrics.com"
_, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
if err != nil {
t.Errorf("unexpected error %s", err)
}
URL = "https://victoriametrics.com"
tr, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
if err != nil {
t.Errorf("unexpected error %s", err)
}
if tr.TLSClientConfig == nil {
t.Errorf("expected TLSClientConfig to be set, got nil")
}
}

View File

@@ -7,32 +7,18 @@ import (
"sort"
"strconv"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
// APIAlert represents an notifier.Alert state
// for WEB view
type APIAlert struct {
ID string `json:"id"`
Name string `json:"name"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
State string `json:"state"`
Value string `json:"value"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
ActiveAt time.Time `json:"activeAt"`
}
type requestHandler struct {
m *manager
}
var pathList = [][]string{
{"/api/v1/groups", "list all loaded groups and rules"},
{"/api/v1/alerts", "list all active alerts"},
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
// /metrics is served by httpserver by default
@@ -49,8 +35,11 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
}
return true
case "/api/v1/groups":
resph.handle(rh.listGroups())
return true
case "/api/v1/alerts":
resph.handle(rh.list())
resph.handle(rh.listAlerts())
return true
case "/-/reload":
logger.Infof("api config reload was called, sending sighup")
@@ -67,6 +56,37 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
}
}
type listGroupsResponse struct {
Data struct {
Groups []APIGroup `json:"groups"`
} `json:"data"`
Status string `json:"status"`
}
func (rh *requestHandler) listGroups() ([]byte, error) {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
lr := listGroupsResponse{Status: "success"}
for _, g := range rh.m.groups {
lr.Data.Groups = append(lr.Data.Groups, g.toAPI())
}
// sort list of alerts for deterministic output
sort.Slice(lr.Data.Groups, func(i, j int) bool {
return lr.Data.Groups[i].Name < lr.Data.Groups[j].Name
})
b, err := json.Marshal(lr)
if err != nil {
return nil, &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
StatusCode: http.StatusInternalServerError,
}
}
return b, nil
}
type listAlertsResponse struct {
Data struct {
Alerts []*APIAlert `json:"alerts"`
@@ -74,13 +94,18 @@ type listAlertsResponse struct {
Status string `json:"status"`
}
func (rh *requestHandler) list() ([]byte, error) {
func (rh *requestHandler) listAlerts() ([]byte, error) {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
lr := listAlertsResponse{Status: "success"}
for _, g := range rh.m.groups {
for _, r := range g.Rules {
lr.Data.Alerts = append(lr.Data.Alerts, r.AlertsAPI()...)
a, ok := r.(*AlertingRule)
if !ok {
continue
}
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsAPI()...)
}
}
@@ -92,7 +117,7 @@ func (rh *requestHandler) list() ([]byte, error) {
b, err := json.Marshal(lr)
if err != nil {
return nil, &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf(`error encoding list of active alerts: %s`, err),
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
StatusCode: http.StatusInternalServerError,
}
}
@@ -113,11 +138,11 @@ func (rh *requestHandler) alert(path string) ([]byte, error) {
groupID, err := uint64FromPath(parts[0])
if err != nil {
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %s`, err))
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %w`, err))
}
alertID, err := uint64FromPath(parts[1])
if err != nil {
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %s`, err))
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %w`, err))
}
resp, err := rh.m.AlertAPI(groupID, alertID)
if err != nil {

View File

@@ -11,7 +11,7 @@ import (
)
func TestHandler(t *testing.T) {
rule := &Rule{
ar := &AlertingRule{
Name: "alert",
alerts: map[uint64]*notifier.Alert{
0: {},
@@ -19,7 +19,7 @@ func TestHandler(t *testing.T) {
}
g := &Group{
Name: "group",
Rules: []*Rule{rule},
Rules: []Rule{ar},
}
m := &manager{groups: make(map[uint64]*Group)}
m.groups[0] = g
@@ -54,10 +54,17 @@ func TestHandler(t *testing.T) {
t.Errorf("expected 1 alert got %d", length)
}
})
t.Run("/api/v1/groups", func(t *testing.T) {
lr := listGroupsResponse{}
getResp(ts.URL+"/api/v1/groups", &lr, 200)
if length := len(lr.Data.Groups); length != 1 {
t.Errorf("expected 1 group got %d", length)
}
})
t.Run("/api/v1/0/0/status", func(t *testing.T) {
alert := &APIAlert{}
getResp(ts.URL+"/api/v1/0/0/status", alert, 200)
expAlert := rule.newAlertAPI(*rule.alerts[0])
expAlert := ar.newAlertAPI(*ar.alerts[0])
if !reflect.DeepEqual(alert, expAlert) {
t.Errorf("expected %v is equal to %v", alert, expAlert)
}

54
app/vmalert/web_types.go Normal file
View File

@@ -0,0 +1,54 @@
package main
import (
"time"
)
// APIAlert represents an notifier.AlertingRule state
// for WEB view
type APIAlert struct {
ID string `json:"id"`
Name string `json:"name"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
State string `json:"state"`
Value string `json:"value"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
ActiveAt time.Time `json:"activeAt"`
}
// APIGroup represents Group for WEB view
type APIGroup struct {
Name string `json:"name"`
ID string `json:"id"`
File string `json:"file"`
Interval string `json:"interval"`
Concurrency int `json:"concurrency"`
AlertingRules []APIAlertingRule `json:"alerting_rules"`
RecordingRules []APIRecordingRule `json:"recording_rules"`
}
// APIAlertingRule represents AlertingRule for WEB view
type APIAlertingRule struct {
ID string `json:"id"`
Name string `json:"name"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
For string `json:"for"`
LastError string `json:"last_error"`
LastExec time.Time `json:"last_exec"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
}
// APIRecordingRule represents RecordingRule for WEB view
type APIRecordingRule struct {
ID string `json:"id"`
Name string `json:"name"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
LastError string `json:"last_error"`
LastExec time.Time `json:"last_exec"`
Labels map[string]string `json:"labels"`
}

View File

@@ -23,7 +23,7 @@ Docker images for `vmauth` are available [here](https://hub.docker.com/r/victori
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, accounting, limits, etc.
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML, accounting, limits, etc.
### Auth config
@@ -110,11 +110,11 @@ Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker i
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmauth
ROOT_IMAGE=scratch make package-vmauth
```

View File

@@ -36,11 +36,11 @@ type UserInfo struct {
func initAuthConfig() {
if len(*authConfigPath) == 0 {
logger.Panicf("FATAL: missing required `-auth.config` command-line flag")
logger.Fatalf("missing required `-auth.config` command-line flag")
}
m, err := readAuthConfig(*authConfigPath)
if err != nil {
logger.Panicf("FATAL: cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
}
authConfig.Store(m)
stopCh = make(chan struct{})
@@ -63,12 +63,14 @@ func authConfigReloader() {
case <-stopCh:
return
case <-sighupCh:
logger.Infof("SIGHUP received; loading -auth.config=%q", *authConfigPath)
m, err := readAuthConfig(*authConfigPath)
if err != nil {
logger.Errorf("failed to load auth config; using the last successfully loaded config; error: %s", err)
logger.Errorf("failed to load -auth.config=%q; using the last successfully loaded config; error: %s", *authConfigPath, err)
continue
}
authConfig.Store(m)
logger.Infof("Successfully reloaded -auth.config=%q", *authConfigPath)
}
}
}
@@ -80,11 +82,11 @@ var stopCh chan struct{}
func readAuthConfig(path string) (map[string]*UserInfo, error) {
data, err := ioutil.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("cannot read %q: %s", path, err)
return nil, fmt.Errorf("cannot read %q: %w", path, err)
}
m, err := parseAuthConfig(data)
if err != nil {
return nil, fmt.Errorf("cannot parse %q: %s", path, err)
return nil, fmt.Errorf("cannot parse %q: %w", path, err)
}
logger.Infof("Loaded information about %d users from %q", len(m), path)
return m, nil
@@ -93,7 +95,7 @@ func readAuthConfig(path string) (map[string]*UserInfo, error) {
func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
var ac AuthConfig
if err := yaml.UnmarshalStrict(data, &ac); err != nil {
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %s", err)
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %w", err)
}
uis := ac.Users
if len(uis) == 0 {
@@ -113,7 +115,7 @@ func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
// Validate urlPrefix
target, err := url.Parse(urlPrefix)
if err != nil {
return nil, fmt.Errorf("invalid `url_prefix: %q`: %s", urlPrefix, err)
return nil, fmt.Errorf("invalid `url_prefix: %q`: %w", urlPrefix, err)
}
if target.Scheme != "http" && target.Scheme != "https" {
return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, target.Scheme)

View File

@@ -2,6 +2,7 @@ package main
import (
"flag"
"fmt"
"net/http"
"net/http/httputil"
"net/url"
@@ -22,6 +23,7 @@ var (
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
buildinfo.Init()
logger.Init()
@@ -77,6 +79,26 @@ var reverseProxy = &httputil.ReverseProxy{
}
r.URL = target
},
Transport: func() *http.Transport {
tr := http.DefaultTransport.(*http.Transport).Clone()
// Automatic compression must be disabled in order to fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
tr.DisableCompression = true
// Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this).
tr.ForceAttemptHTTP2 = false
return tr
}(),
FlushInterval: time.Second,
ErrorLog: logger.StdErrorLogger(),
}
func usage() {
const s = `
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
}

View File

@@ -89,6 +89,8 @@ or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when
Do not forget removing old snapshots and backups when they are no longer needed for saving storage costs.
See also [vmbackuper tool](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for automating smart backups.
### How does it work?
@@ -121,6 +123,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
* Backups created from [single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md) cannot be restored
at [cluster VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md) and vice versa.
### Advanced usage
@@ -197,9 +201,9 @@ Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` dock
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmbackup
ROOT_IMAGE=scratch make package-vmbackup
```

View File

@@ -110,12 +110,12 @@ func newSrcFS() (*fslocal.FS, error) {
// Verify the snapshot exists.
f, err := os.Open(snapshotPath)
if err != nil {
return nil, fmt.Errorf("cannot open snapshot at %q: %s", snapshotPath, err)
return nil, fmt.Errorf("cannot open snapshot at %q: %w", snapshotPath, err)
}
fi, err := f.Stat()
_ = f.Close()
if err != nil {
return nil, fmt.Errorf("cannot stat %q: %s", snapshotPath, err)
return nil, fmt.Errorf("cannot stat %q: %w", snapshotPath, err)
}
if !fi.IsDir() {
return nil, fmt.Errorf("snapshot %q must be a directory", snapshotPath)
@@ -126,7 +126,7 @@ func newSrcFS() (*fslocal.FS, error) {
MaxBytesPerSecond: *maxBytesPerSecond,
}
if err := fs.Init(); err != nil {
return nil, fmt.Errorf("cannot initialize fs: %s", err)
return nil, fmt.Errorf("cannot initialize fs: %w", err)
}
return fs, nil
}
@@ -134,7 +134,7 @@ func newSrcFS() (*fslocal.FS, error) {
func newDstFS() (common.RemoteFS, error) {
fs, err := actions.NewRemoteFS(*dst)
if err != nil {
return nil, fmt.Errorf("cannot parse `-dst`=%q: %s", *dst, err)
return nil, fmt.Errorf("cannot parse `-dst`=%q: %w", *dst, err)
}
return fs, nil
}
@@ -145,7 +145,7 @@ func newOriginFS() (common.RemoteFS, error) {
}
fs, err := actions.NewRemoteFS(*origin)
if err != nil {
return nil, fmt.Errorf("cannot parse `-origin`=%q: %s", *origin, err)
return nil, fmt.Errorf("cannot parse `-origin`=%q: %w", *origin, err)
}
return fs, nil
}

View File

@@ -122,7 +122,7 @@ func (ctx *InsertCtx) AddLabel(name, value string) {
func (ctx *InsertCtx) FlushBufs() error {
if err := vmstorage.AddRows(ctx.mrs); err != nil {
return &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf("cannot store metrics: %s", err),
Err: fmt.Errorf("cannot store metrics: %w", err),
StatusCode: http.StatusServiceUnavailable,
}
}

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"net/http"
"strings"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite"
@@ -160,4 +161,14 @@ var (
promscrapeTargetsRequests = metrics.NewCounter(`vm_http_requests_total{path="/targets"}`)
promscrapeConfigReloadRequests = metrics.NewCounter(`vm_http_requests_total{path="/-/reload"}`)
_ = metrics.NewGauge(`vm_metrics_with_dropped_labels_total`, func() float64 {
return float64(atomic.LoadUint64(&storage.MetricsWithDroppedLabels))
})
_ = metrics.NewGauge(`vm_too_long_label_names_total`, func() float64 {
return float64(atomic.LoadUint64(&storage.TooLongLabelNames))
})
_ = metrics.NewGauge(`vm_too_long_label_values_total`, func() float64 {
return float64(atomic.LoadUint64(&storage.TooLongLabelValues))
})
)

View File

@@ -98,9 +98,9 @@ Run `make package-vmrestore`. It builds `victoriametrics/vmrestore:<PKG_TAG>` do
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmrestore`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmrestore
ROOT_IMAGE=scratch make package-vmrestore
```

View File

@@ -71,7 +71,7 @@ func newDstFS() (*fslocal.FS, error) {
MaxBytesPerSecond: *maxBytesPerSecond,
}
if err := fs.Init(); err != nil {
return nil, fmt.Errorf("cannot initialize local fs: %s", err)
return nil, fmt.Errorf("cannot initialize local fs: %w", err)
}
return fs, nil
}
@@ -79,7 +79,7 @@ func newDstFS() (*fslocal.FS, error) {
func newSrcFS() (common.RemoteFS, error) {
fs, err := actions.NewRemoteFS(*src)
if err != nil {
return nil, fmt.Errorf("cannot parse `-src`=%q: %s", *src, err)
return nil, fmt.Errorf("cannot parse `-src`=%q: %w", *src, err)
}
return fs, nil
}

View File

@@ -1,6 +1,7 @@
package vmselect
import (
"errors"
"flag"
"fmt"
"net/http"
@@ -240,7 +241,8 @@ func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) {
w.Header().Set("Content-Type", "application/json")
statusCode := http.StatusUnprocessableEntity
if esc, ok := err.(*httpserver.ErrorWithStatusCode); ok {
var esc *httpserver.ErrorWithStatusCode
if errors.As(err, &esc) {
statusCode = esc.StatusCode
}
w.WriteHeader(statusCode)

View File

@@ -7,13 +7,12 @@ import (
"runtime"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
)
@@ -72,6 +71,50 @@ func (rss *Results) mustClose() {
rss.sr = nil
}
var timeseriesWorkCh = make(chan *timeseriesWork, gomaxprocs)
type timeseriesWork struct {
rss *Results
pts *packedTimeseries
f func(rs *Result, workerID uint)
doneCh chan error
rowsProcessed int
}
func init() {
for i := 0; i < gomaxprocs; i++ {
go timeseriesWorker(uint(i))
}
}
func timeseriesWorker(workerID uint) {
var rs Result
var rsLastResetTime uint64
for tsw := range timeseriesWorkCh {
rss := tsw.rss
if time.Until(rss.deadline.Deadline) < 0 {
tsw.doneCh <- fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
continue
}
if err := tsw.pts.Unpack(&rs, rss.tr, rss.fetchData); err != nil {
tsw.doneCh <- fmt.Errorf("error during time series unpacking: %w", err)
continue
}
if len(rs.Timestamps) > 0 || !rss.fetchData {
tsw.f(&rs, workerID)
}
tsw.rowsProcessed = len(rs.Values)
tsw.doneCh <- nil
currentTime := fasttime.UnixTimestamp()
if cap(rs.Values) > 1024*1024 && 4*len(rs.Values) < cap(rs.Values) && currentTime-rsLastResetTime > 10 {
// Reset rs in order to preseve memory usage after processing big time series with millions of rows.
rs = Result{}
rsLastResetTime = currentTime
}
}
}
// RunParallel runs in parallel f for all the results from rss.
//
// f shouldn't hold references to rs after returning.
@@ -81,72 +124,36 @@ func (rss *Results) mustClose() {
func (rss *Results) RunParallel(f func(rs *Result, workerID uint)) error {
defer rss.mustClose()
workersCount := 1 + len(rss.packedTimeseries)/32
if workersCount > gomaxprocs {
workersCount = gomaxprocs
}
if workersCount == 0 {
logger.Panicf("BUG: workersCount cannot be zero")
}
workCh := make(chan *packedTimeseries, workersCount)
doneCh := make(chan error)
// Start workers.
rowsProcessedTotal := uint64(0)
for i := 0; i < workersCount; i++ {
go func(workerID uint) {
rs := getResult()
defer putResult(rs)
maxWorkersCount := gomaxprocs / workersCount
var err error
rowsProcessed := 0
for pts := range workCh {
if time.Until(rss.deadline.Deadline) < 0 {
err = fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
break
}
if err = pts.Unpack(rs, rss.tr, rss.fetchData, maxWorkersCount); err != nil {
break
}
if len(rs.Timestamps) == 0 && rss.fetchData {
// Skip empty blocks.
continue
}
rowsProcessed += len(rs.Values)
f(rs, workerID)
}
atomic.AddUint64(&rowsProcessedTotal, uint64(rowsProcessed))
// Drain the remaining work
for range workCh {
}
doneCh <- err
}(uint(i))
}
// Feed workers with work.
tsws := make([]*timeseriesWork, len(rss.packedTimeseries))
for i := range rss.packedTimeseries {
workCh <- &rss.packedTimeseries[i]
tsw := &timeseriesWork{
rss: rss,
pts: &rss.packedTimeseries[i],
f: f,
doneCh: make(chan error, 1),
}
timeseriesWorkCh <- tsw
tsws[i] = tsw
}
seriesProcessedTotal := len(rss.packedTimeseries)
rss.packedTimeseries = rss.packedTimeseries[:0]
close(workCh)
// Wait until workers finish.
var errors []error
for i := 0; i < workersCount; i++ {
if err := <-doneCh; err != nil {
errors = append(errors, err)
// Wait until work is complete.
var firstErr error
rowsProcessedTotal := 0
for _, tsw := range tsws {
if err := <-tsw.doneCh; err != nil && firstErr == nil {
// Return just the first error, since other errors
// are likely duplicate the first error.
firstErr = err
}
rowsProcessedTotal += tsw.rowsProcessed
}
perQueryRowsProcessed.Update(float64(rowsProcessedTotal))
perQuerySeriesProcessed.Update(float64(seriesProcessedTotal))
if len(errors) > 0 {
// Return just the first error, since other errors
// is likely duplicate the first error.
return errors[0]
}
return nil
return firstErr
}
var perQueryRowsProcessed = metrics.NewHistogram(`vm_per_query_rows_processed_count`)
@@ -159,70 +166,74 @@ type packedTimeseries struct {
brs []storage.BlockRef
}
var unpackWorkCh = make(chan *unpackWork, gomaxprocs)
type unpackWork struct {
br storage.BlockRef
tr storage.TimeRange
fetchData bool
doneCh chan error
sb *sortBlock
}
func init() {
for i := 0; i < gomaxprocs; i++ {
go unpackWorker()
}
}
func unpackWorker() {
for upw := range unpackWorkCh {
sb := getSortBlock()
if err := sb.unpackFrom(upw.br, upw.tr, upw.fetchData); err != nil {
putSortBlock(sb)
upw.doneCh <- fmt.Errorf("cannot unpack block: %w", err)
continue
}
upw.sb = sb
upw.doneCh <- nil
}
}
// Unpack unpacks pts to dst.
func (pts *packedTimeseries) Unpack(dst *Result, tr storage.TimeRange, fetchData bool, maxWorkersCount int) error {
func (pts *packedTimeseries) Unpack(dst *Result, tr storage.TimeRange, fetchData bool) error {
dst.reset()
if err := dst.MetricName.Unmarshal(bytesutil.ToUnsafeBytes(pts.metricName)); err != nil {
return fmt.Errorf("cannot unmarshal metricName %q: %s", pts.metricName, err)
}
workersCount := 1 + len(pts.brs)/32
if workersCount > maxWorkersCount {
workersCount = maxWorkersCount
}
if workersCount == 0 {
logger.Panicf("BUG: workersCount cannot be zero")
}
sbs := make([]*sortBlock, 0, len(pts.brs))
var sbsLock sync.Mutex
workCh := make(chan storage.BlockRef, workersCount)
doneCh := make(chan error)
// Start workers
for i := 0; i < workersCount; i++ {
go func() {
var err error
for br := range workCh {
sb := getSortBlock()
if err = sb.unpackFrom(br, tr, fetchData); err != nil {
break
}
sbsLock.Lock()
sbs = append(sbs, sb)
sbsLock.Unlock()
}
// Drain the remaining work
for range workCh {
}
doneCh <- err
}()
return fmt.Errorf("cannot unmarshal metricName %q: %w", pts.metricName, err)
}
// Feed workers with work
for _, br := range pts.brs {
workCh <- br
upws := make([]*unpackWork, len(pts.brs))
for i, br := range pts.brs {
upw := &unpackWork{
br: br,
tr: tr,
fetchData: fetchData,
doneCh: make(chan error, 1),
}
unpackWorkCh <- upw
upws[i] = upw
}
pts.brs = pts.brs[:0]
close(workCh)
// Wait until workers finish
var errors []error
for i := 0; i < workersCount; i++ {
if err := <-doneCh; err != nil {
errors = append(errors, err)
// Wait until work is complete
sbs := make([]*sortBlock, 0, len(pts.brs))
var firstErr error
for _, upw := range upws {
if err := <-upw.doneCh; err != nil && firstErr == nil {
// Return the first error only, since other errors are likely the same.
firstErr = err
}
if firstErr == nil {
sbs = append(sbs, upw.sb)
} else {
putSortBlock(upw.sb)
}
}
if len(errors) > 0 {
// Return the first error only, since other errors are likely the same.
return errors[0]
if firstErr != nil {
return firstErr
}
// Merge blocks
mergeSortBlocks(dst, sbs)
return nil
}
@@ -318,7 +329,7 @@ func (sb *sortBlock) unpackFrom(br storage.BlockRef, tr storage.TimeRange, fetch
br.MustReadBlock(&sb.b, fetchData)
if fetchData {
if err := sb.b.UnmarshalData(); err != nil {
return fmt.Errorf("cannot unmarshal block: %s", err)
return fmt.Errorf("cannot unmarshal block: %w", err)
}
}
timestamps := sb.b.Timestamps()
@@ -387,7 +398,7 @@ func DeleteSeries(sq *storage.SearchQuery) (int, error) {
func GetLabels(deadline Deadline) ([]string, error) {
labels, err := vmstorage.SearchTagKeys(*maxTagKeysPerSearch)
if err != nil {
return nil, fmt.Errorf("error during labels search: %s", err)
return nil, fmt.Errorf("error during labels search: %w", err)
}
// Substitute "" with "__name__"
@@ -413,7 +424,7 @@ func GetLabelValues(labelName string, deadline Deadline) ([]string, error) {
// Search for tag values
labelValues, err := vmstorage.SearchTagValues([]byte(labelName), *maxTagValuesPerSearch)
if err != nil {
return nil, fmt.Errorf("error during label values search for labelName=%q: %s", labelName, err)
return nil, fmt.Errorf("error during label values search for labelName=%q: %w", labelName, err)
}
// Sort labelValues like Prometheus does
@@ -426,7 +437,7 @@ func GetLabelValues(labelName string, deadline Deadline) ([]string, error) {
func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
labelEntries, err := vmstorage.SearchTagEntries(*maxTagKeysPerSearch, *maxTagValuesPerSearch)
if err != nil {
return nil, fmt.Errorf("error during label entries request: %s", err)
return nil, fmt.Errorf("error during label entries request: %w", err)
}
// Substitute "" with "__name__"
@@ -453,7 +464,7 @@ func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TSDBStatus, error) {
status, err := vmstorage.GetTSDBStatusForDate(date, topN)
if err != nil {
return nil, fmt.Errorf("error during tsdb status request: %s", err)
return nil, fmt.Errorf("error during tsdb status request: %w", err)
}
return status, nil
}
@@ -462,7 +473,7 @@ func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TS
func GetSeriesCount(deadline Deadline) (uint64, error) {
n, err := vmstorage.GetSeriesCount()
if err != nil {
return 0, fmt.Errorf("error during series count request: %s", err)
return 0, fmt.Errorf("error during series count request: %w", err)
}
return n, nil
}
@@ -495,6 +506,9 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
MinTimestamp: sq.MinTimestamp,
MaxTimestamp: sq.MaxTimestamp,
}
if err := vmstorage.CheckTimeRange(tr); err != nil {
return nil, err
}
vmstorage.WG.Add(1)
defer vmstorage.WG.Done()
@@ -518,7 +532,7 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
m[string(metricName)] = append(brs, *sr.MetricBlockRef.BlockRef)
}
if err := sr.Error(); err != nil {
return nil, fmt.Errorf("search error after reading %d data blocks: %s", blocksRead, err)
return nil, fmt.Errorf("search error after reading %d data blocks: %w", blocksRead, err)
}
var rss Results
@@ -537,25 +551,6 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
return &rss, nil
}
func getResult() *Result {
v := rsPool.Get()
if v == nil {
return &Result{}
}
return v.(*Result)
}
func putResult(rs *Result) {
if len(rs.Values) > 8192 {
// Do not pool big results, since they may occupy too much memory.
return
}
rs.reset()
rsPool.Put(rs)
}
var rsPool sync.Pool
func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error) {
tfss := make([]*storage.TagFilters, 0, len(tagFilterss))
for _, tagFilters := range tagFilterss {
@@ -563,7 +558,7 @@ func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error)
for i := range tagFilters {
tf := &tagFilters[i]
if err := tfs.Add(tf.Key, tf.Value, tf.IsNegative, tf.IsRegexp); err != nil {
return nil, fmt.Errorf("cannot parse tag filter %s: %s", tf, err)
return nil, fmt.Errorf("cannot parse tag filter %s: %w", tf, err)
}
}
tfss = append(tfss, tfs)

View File

@@ -46,7 +46,7 @@ const defaultStep = 5 * 60 * 1000
func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
ct := currentTime()
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse request form values: %s", err)
return fmt.Errorf("cannot parse request form values: %w", err)
}
matches := r.Form["match[]"]
if len(matches) == 0 {
@@ -82,7 +82,7 @@ func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request
}
rss, err := netstorage.ProcessSearchQuery(sq, true, deadline)
if err != nil {
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
}
resultsCh := make(chan *quicktemplate.ByteBuffer)
@@ -105,7 +105,7 @@ func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request
err = <-doneCh
if err != nil {
return fmt.Errorf("error during data fetching: %s", err)
return fmt.Errorf("error during data fetching: %w", err)
}
federateDuration.UpdateDuration(startTime)
return nil
@@ -117,7 +117,7 @@ var federateDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/fe
func ExportHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
ct := currentTime()
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse request form values: %s", err)
return fmt.Errorf("cannot parse request form values: %w", err)
}
matches := r.Form["match[]"]
if len(matches) == 0 {
@@ -143,7 +143,7 @@ func ExportHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
end = start + defaultStep
}
if err := exportHandler(w, matches, start, end, format, maxRowsPerLine, deadline); err != nil {
return fmt.Errorf("error when exporting data for queries=%q on the time range (start=%d, end=%d): %s", matches, start, end, err)
return fmt.Errorf("error when exporting data for queries=%q on the time range (start=%d, end=%d): %w", matches, start, end, err)
}
exportDuration.UpdateDuration(startTime)
return nil
@@ -202,7 +202,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
}
rss, err := netstorage.ProcessSearchQuery(sq, true, deadline)
if err != nil {
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
}
resultsCh := make(chan *quicktemplate.ByteBuffer, runtime.GOMAXPROCS(-1))
@@ -227,7 +227,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
}
err = <-doneCh
if err != nil {
return fmt.Errorf("error during data fetching: %s", err)
return fmt.Errorf("error during data fetching: %w", err)
}
return nil
}
@@ -237,7 +237,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
// See https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series
func DeleteHandler(startTime time.Time, r *http.Request) error {
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse request form values: %s", err)
return fmt.Errorf("cannot parse request form values: %w", err)
}
if r.FormValue("start") != "" || r.FormValue("end") != "" {
return fmt.Errorf("start and end aren't supported. Remove these args from the query in order to delete all the matching metrics")
@@ -255,7 +255,7 @@ func DeleteHandler(startTime time.Time, r *http.Request) error {
}
deletedCount, err := netstorage.DeleteSeries(sq)
if err != nil {
return fmt.Errorf("cannot delete time series matching %q: %s", matches, err)
return fmt.Errorf("cannot delete time series matching %q: %w", matches, err)
}
if deletedCount > 0 {
promql.ResetRollupResultCache()
@@ -273,14 +273,14 @@ func LabelValuesHandler(startTime time.Time, labelName string, w http.ResponseWr
deadline := getDeadlineForQuery(r)
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse form values: %s", err)
return fmt.Errorf("cannot parse form values: %w", err)
}
var labelValues []string
if len(r.Form["match[]"]) == 0 && len(r.Form["start"]) == 0 && len(r.Form["end"]) == 0 {
var err error
labelValues, err = netstorage.GetLabelValues(labelName, deadline)
if err != nil {
return fmt.Errorf(`cannot obtain label values for %q: %s`, labelName, err)
return fmt.Errorf(`cannot obtain label values for %q: %w`, labelName, err)
}
} else {
// Extended functionality that allows filtering by label filters and time range
@@ -302,7 +302,7 @@ func LabelValuesHandler(startTime time.Time, labelName string, w http.ResponseWr
}
labelValues, err = labelValuesWithMatches(labelName, matches, start, end, deadline)
if err != nil {
return fmt.Errorf("cannot obtain label values for %q, match[]=%q, start=%d, end=%d: %s", labelName, matches, start, end, err)
return fmt.Errorf("cannot obtain label values for %q, match[]=%q, start=%d, end=%d: %w", labelName, matches, start, end, err)
}
}
@@ -343,7 +343,7 @@ func labelValuesWithMatches(labelName string, matches []string, start, end int64
}
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
if err != nil {
return nil, fmt.Errorf("cannot fetch data for %q: %s", sq, err)
return nil, fmt.Errorf("cannot fetch data for %q: %w", sq, err)
}
m := make(map[string]struct{})
@@ -358,7 +358,7 @@ func labelValuesWithMatches(labelName string, matches []string, start, end int64
mLock.Unlock()
})
if err != nil {
return nil, fmt.Errorf("error when data fetching: %s", err)
return nil, fmt.Errorf("error when data fetching: %w", err)
}
labelValues := make([]string, 0, len(m))
@@ -376,7 +376,7 @@ func LabelsCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ
deadline := getDeadlineForQuery(r)
labelEntries, err := netstorage.GetLabelEntries(deadline)
if err != nil {
return fmt.Errorf(`cannot obtain label entries: %s`, err)
return fmt.Errorf(`cannot obtain label entries: %w`, err)
}
w.Header().Set("Content-Type", "application/json")
WriteLabelsCountResponse(w, labelEntries)
@@ -394,14 +394,14 @@ const secsPerDay = 3600 * 24
func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
deadline := getDeadlineForQuery(r)
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse form values: %s", err)
return fmt.Errorf("cannot parse form values: %w", err)
}
date := fasttime.UnixDate()
dateStr := r.FormValue("date")
if len(dateStr) > 0 {
t, err := time.Parse("2006-01-02", dateStr)
if err != nil {
return fmt.Errorf("cannot parse `date` arg %q: %s", dateStr, err)
return fmt.Errorf("cannot parse `date` arg %q: %w", dateStr, err)
}
date = uint64(t.Unix()) / secsPerDay
}
@@ -410,7 +410,7 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
if len(topNStr) > 0 {
n, err := strconv.Atoi(topNStr)
if err != nil {
return fmt.Errorf("cannot parse `topN` arg %q: %s", topNStr, err)
return fmt.Errorf("cannot parse `topN` arg %q: %w", topNStr, err)
}
if n <= 0 {
n = 1
@@ -422,7 +422,7 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
}
status, err := netstorage.GetTSDBStatusForDate(deadline, date, topN)
if err != nil {
return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %s`, date, topN, err)
return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %w`, date, topN, err)
}
w.Header().Set("Content-Type", "application/json")
WriteTSDBStatusResponse(w, status)
@@ -439,14 +439,14 @@ func LabelsHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
deadline := getDeadlineForQuery(r)
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse form values: %s", err)
return fmt.Errorf("cannot parse form values: %w", err)
}
var labels []string
if len(r.Form["match[]"]) == 0 && len(r.Form["start"]) == 0 && len(r.Form["end"]) == 0 {
var err error
labels, err = netstorage.GetLabels(deadline)
if err != nil {
return fmt.Errorf("cannot obtain labels: %s", err)
return fmt.Errorf("cannot obtain labels: %w", err)
}
} else {
// Extended functionality that allows filtering by label filters and time range
@@ -466,7 +466,7 @@ func LabelsHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
}
labels, err = labelsWithMatches(matches, start, end, deadline)
if err != nil {
return fmt.Errorf("cannot obtain labels for match[]=%q, start=%d, end=%d: %s", matches, start, end, err)
return fmt.Errorf("cannot obtain labels for match[]=%q, start=%d, end=%d: %w", matches, start, end, err)
}
}
@@ -494,7 +494,7 @@ func labelsWithMatches(matches []string, start, end int64, deadline netstorage.D
}
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
if err != nil {
return nil, fmt.Errorf("cannot fetch data for %q: %s", sq, err)
return nil, fmt.Errorf("cannot fetch data for %q: %w", sq, err)
}
m := make(map[string]struct{})
@@ -510,7 +510,7 @@ func labelsWithMatches(matches []string, start, end int64, deadline netstorage.D
mLock.Unlock()
})
if err != nil {
return nil, fmt.Errorf("error when data fetching: %s", err)
return nil, fmt.Errorf("error when data fetching: %w", err)
}
labels := make([]string, 0, len(m))
@@ -528,7 +528,7 @@ func SeriesCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ
deadline := getDeadlineForQuery(r)
n, err := netstorage.GetSeriesCount(deadline)
if err != nil {
return fmt.Errorf("cannot obtain series count: %s", err)
return fmt.Errorf("cannot obtain series count: %w", err)
}
w.Header().Set("Content-Type", "application/json")
WriteSeriesCountResponse(w, n)
@@ -545,7 +545,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
ct := currentTime()
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse form values: %s", err)
return fmt.Errorf("cannot parse form values: %w", err)
}
matches := r.Form["match[]"]
if len(matches) == 0 {
@@ -580,7 +580,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
}
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
if err != nil {
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
}
resultsCh := make(chan *quicktemplate.ByteBuffer)
@@ -605,7 +605,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
}
err = <-doneCh
if err != nil {
return fmt.Errorf("error during data fetching: %s", err)
return fmt.Errorf("error during data fetching: %w", err)
}
seriesDuration.UpdateDuration(startTime)
return nil
@@ -652,17 +652,17 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
if childQuery, windowStr, offsetStr := promql.IsMetricSelectorWithRollup(query); childQuery != "" {
window, err := parsePositiveDuration(windowStr, step)
if err != nil {
return fmt.Errorf("cannot parse window: %s", err)
return fmt.Errorf("cannot parse window: %w", err)
}
offset, err := parseDuration(offsetStr, step)
if err != nil {
return fmt.Errorf("cannot parse offset: %s", err)
return fmt.Errorf("cannot parse offset: %w", err)
}
start -= offset
end := start
start = end - window
if err := exportHandler(w, []string{childQuery}, start, end, "promapi", 0, deadline); err != nil {
return fmt.Errorf("error when exporting data for query=%q on the time range (start=%d, end=%d): %s", childQuery, start, end, err)
return fmt.Errorf("error when exporting data for query=%q on the time range (start=%d, end=%d): %w", childQuery, start, end, err)
}
queryDuration.UpdateDuration(startTime)
return nil
@@ -670,24 +670,24 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
if childQuery, windowStr, stepStr, offsetStr := promql.IsRollup(query); childQuery != "" {
newStep, err := parsePositiveDuration(stepStr, step)
if err != nil {
return fmt.Errorf("cannot parse step: %s", err)
return fmt.Errorf("cannot parse step: %w", err)
}
if newStep > 0 {
step = newStep
}
window, err := parsePositiveDuration(windowStr, step)
if err != nil {
return fmt.Errorf("cannot parse window: %s", err)
return fmt.Errorf("cannot parse window: %w", err)
}
offset, err := parseDuration(offsetStr, step)
if err != nil {
return fmt.Errorf("cannot parse offset: %s", err)
return fmt.Errorf("cannot parse offset: %w", err)
}
start -= offset
end := start
start = end - window
if err := queryRangeHandler(w, childQuery, start, end, step, r, ct); err != nil {
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %s", childQuery, start, end, step, err)
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %w", childQuery, start, end, step, err)
}
queryDuration.UpdateDuration(startTime)
return nil
@@ -702,7 +702,7 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
}
result, err := promql.Exec(&ec, query, true)
if err != nil {
return fmt.Errorf("error when executing query=%q for (time=%d, step=%d): %s", query, start, step, err)
return fmt.Errorf("error when executing query=%q for (time=%d, step=%d): %w", query, start, step, err)
}
w.Header().Set("Content-Type", "application/json")
@@ -750,7 +750,7 @@ func QueryRangeHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
return err
}
if err := queryRangeHandler(w, query, start, end, step, r, ct); err != nil {
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %s", query, start, end, step, err)
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %w", query, start, end, step, err)
}
queryRangeDuration.UpdateDuration(startTime)
return nil
@@ -788,7 +788,7 @@ func queryRangeHandler(w http.ResponseWriter, query string, start, end, step int
}
result, err := promql.Exec(&ec, query, false)
if err != nil {
return fmt.Errorf("cannot execute query: %s", err)
return fmt.Errorf("cannot execute query: %w", err)
}
queryOffset := getLatencyOffsetMilliseconds()
if ct-end < queryOffset {
@@ -897,7 +897,7 @@ func getTime(r *http.Request, argKey string, defaultValue int64) (int64, error)
// Try parsing duration relative to the current time
d, err1 := time.ParseDuration(argValue)
if err1 != nil {
return 0, fmt.Errorf("cannot parse %q=%q: %s", argKey, argValue, err)
return 0, fmt.Errorf("cannot parse %q=%q: %w", argKey, argValue, err)
}
if d > 0 {
d = -d
@@ -939,7 +939,7 @@ func getDuration(r *http.Request, argKey string, defaultValue int64) (int64, err
// Try parsing string format
d, err := time.ParseDuration(argValue)
if err != nil {
return 0, fmt.Errorf("cannot parse %q=%q: %s", argKey, argValue, err)
return 0, fmt.Errorf("cannot parse %q=%q: %w", argKey, argValue, err)
}
secs = d.Seconds()
}
@@ -1001,7 +1001,7 @@ func getTagFilterssFromMatches(matches []string) ([][]storage.TagFilter, error)
for _, match := range matches {
tagFilters, err := promql.ParseMetricSelector(match)
if err != nil {
return nil, fmt.Errorf("cannot parse %q: %s", match, err)
return nil, fmt.Errorf("cannot parse %q: %w", match, err)
}
tagFilterss = append(tagFilterss, tagFilters)
}

View File

@@ -119,7 +119,7 @@ func testIncrementalParallelAggr(iafc *incrementalAggrFuncContext, tssSrc, tssEx
wg.Wait()
tssActual := iafc.finalizeTimeseries()
if err := expectTimeseriesEqual(tssActual, tssExpected); err != nil {
return fmt.Errorf("%s; tssActual=%v, tssExpected=%v", err, tssActual, tssExpected)
return fmt.Errorf("%w; tssActual=%v, tssExpected=%v", err, tssActual, tssExpected)
}
return nil
}
@@ -164,7 +164,7 @@ func expectTsEqual(actual, expected *timeseries) error {
return fmt.Errorf("unexpected timestamps; got %v; want %v", actual.Timestamps, expected.Timestamps)
}
if err := compareValues(actual.Values, expected.Values); err != nil {
return fmt.Errorf("%s; actual %v; expected %v", err, actual.Values, expected.Values)
return fmt.Errorf("%w; actual %v; expected %v", err, actual.Values, expected.Values)
}
return nil
}

View File

@@ -206,7 +206,7 @@ func groupJoin(singleTimeseriesSide string, be *metricsql.BinaryOpExpr, rvsLeft,
resetMetricGroupIfRequired(be, tsLeft)
if len(tssRight) == 1 {
// Easy case - right part contains only a single matching time series.
tsLeft.MetricName.AddMissingTags(joinTags, &tssRight[0].MetricName)
tsLeft.MetricName.SetTags(joinTags, &tssRight[0].MetricName)
rvsLeft = append(rvsLeft, tsLeft)
rvsRight = append(rvsRight, tssRight[0])
continue
@@ -225,7 +225,7 @@ func groupJoin(singleTimeseriesSide string, be *metricsql.BinaryOpExpr, rvsLeft,
for _, tsRight := range tssRight {
var tsCopy timeseries
tsCopy.CopyFromShallowTimestamps(tsLeft)
tsCopy.MetricName.AddMissingTags(joinTags, &tsRight.MetricName)
tsCopy.MetricName.SetTags(joinTags, &tsRight.MetricName)
bb.B = marshalMetricTagsSorted(bb.B[:0], &tsCopy.MetricName)
if tsExisting := m[string(bb.B)]; tsExisting != nil {
// Try merging tsExisting with tsRight if they don't overlap.
@@ -310,9 +310,22 @@ func binaryOpOr(bfa *binaryOpFuncArg) ([]*timeseries, error) {
for _, tss := range mLeft {
rvs = append(rvs, tss...)
}
for k, tss := range mRight {
if mLeft[k] == nil {
rvs = append(rvs, tss...)
for k, tssRight := range mRight {
tssLeft := mLeft[k]
if tssLeft == nil {
rvs = append(rvs, tssRight...)
continue
}
// Fill gaps in tssLeft with values from tssRight as Prometheus does.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/552
valuesRight := tssRight[0].Values
for _, tsLeft := range tssLeft {
valuesLeft := tsLeft.Values
for i, v := range valuesLeft {
if math.IsNaN(v) {
valuesLeft[i] = valuesRight[i]
}
}
}
}
return rvs, nil

View File

@@ -160,14 +160,14 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
}
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, e, re, nil)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, me.AppendString(nil), err)
return nil, fmt.Errorf(`cannot evaluate %q: %w`, me.AppendString(nil), err)
}
return rv, nil
}
if re, ok := e.(*metricsql.RollupExpr); ok {
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, e, re, nil)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, re.AppendString(nil), err)
return nil, fmt.Errorf(`cannot evaluate %q: %w`, re.AppendString(nil), err)
}
return rv, nil
}
@@ -189,7 +189,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
}
rv, err := tf(tfa)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, fe.AppendString(nil), err)
return nil, fmt.Errorf(`cannot evaluate %q: %w`, fe.AppendString(nil), err)
}
return rv, nil
}
@@ -203,7 +203,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
}
rv, err := evalRollupFunc(ec, fe.Name, rf, e, re, nil)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, fe.AppendString(nil), err)
return nil, fmt.Errorf(`cannot evaluate %q: %w`, fe.AppendString(nil), err)
}
return rv, nil
}
@@ -240,7 +240,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
}
rv, err := af(afa)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, ae.AppendString(nil), err)
return nil, fmt.Errorf(`cannot evaluate %q: %w`, ae.AppendString(nil), err)
}
return rv, nil
}
@@ -264,7 +264,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
}
rv, err := bf(bfa)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, be.AppendString(nil), err)
return nil, fmt.Errorf(`cannot evaluate %q: %w`, be.AppendString(nil), err)
}
return rv, nil
}
@@ -375,7 +375,7 @@ func evalRollupFuncArgs(ec *EvalConfig, fe *metricsql.FuncExpr) ([]interface{},
}
ts, err := evalExpr(ec, arg)
if err != nil {
return nil, nil, fmt.Errorf("cannot evaluate arg #%d for %q: %s", i+1, fe.AppendString(nil), err)
return nil, nil, fmt.Errorf("cannot evaluate arg #%d for %q: %w", i+1, fe.AppendString(nil), err)
}
args[i] = ts
}

View File

@@ -1480,7 +1480,7 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`label_replace(mismatch)`, func(t *testing.T) {
t.Run(`label_replace(nonexisting_src)`, func(t *testing.T) {
t.Parallel()
q := `label_replace(time(), "__name__", "x${1}y", "foo", ".+")`
r := netstorage.Result{
@@ -1491,6 +1491,21 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`label_replace(mismatch)`, func(t *testing.T) {
t.Parallel()
q := `label_replace(label_set(time(), "foo", "foobar"), "__name__", "x${1}y", "foo", "bar(.+)")`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
Timestamps: timestampsExpected,
}
r.MetricName.Tags = []storage.Tag{{
Key: []byte("foo"),
Value: []byte("foobar"),
}}
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`label_replace(match)`, func(t *testing.T) {
t.Parallel()
q := `label_replace(time(), "__name__", "x${1}y", "foo", ".*")`
@@ -1849,6 +1864,17 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`scalar or scalar`, func(t *testing.T) {
t.Parallel()
q := `time() > 1400 or 123`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{123, 123, 123, 1600, 1800, 2000},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`timseries-with-tags unless 2`, func(t *testing.T) {
t.Parallel()
q := `label_set(time(), "foo", "bar") unless 2`
@@ -1988,25 +2014,37 @@ func TestExecSuccess(t *testing.T) {
})
t.Run(`scalar * ignoring(foo) group_right vector`, func(t *testing.T) {
t.Parallel()
q := `sort_desc(2 * ignoring(foo) group_right(a,foo) (label_set(time(), "foo", "bar") or label_set(10, "foo", "qwert")))`
q := `sort_desc(label_set(2, "a", "2") * ignoring(foo,a) group_right(a) (label_set(time(), "foo", "bar", "a", "1"), label_set(10, "foo", "qwert")))`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{2000, 2400, 2800, 3200, 3600, 4000},
Timestamps: timestampsExpected,
}
r1.MetricName.Tags = []storage.Tag{{
Key: []byte("foo"),
Value: []byte("bar"),
}}
r1.MetricName.Tags = []storage.Tag{
{
Key: []byte("a"),
Value: []byte("2"),
},
{
Key: []byte("foo"),
Value: []byte("bar"),
},
}
r2 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{20, 20, 20, 20, 20, 20},
Timestamps: timestampsExpected,
}
r2.MetricName.Tags = []storage.Tag{{
Key: []byte("foo"),
Value: []byte("qwert"),
}}
r2.MetricName.Tags = []storage.Tag{
{
Key: []byte("a"),
Value: []byte("2"),
},
{
Key: []byte("foo"),
Value: []byte("qwert"),
},
}
resultExpected := []netstorage.Result{r1, r2}
f(q, resultExpected)
})
@@ -2321,9 +2359,9 @@ func TestExecSuccess(t *testing.T) {
t.Run(`vector + vector on group_left matching`, func(t *testing.T) {
t.Parallel()
q := `sort_desc(
(label_set(time(), "t1", "v123", "t2", "v3") or label_set(10, "t2", "v3", "xxx", "yy"))
(label_set(time(), "t1", "v123", "t2", "v3"), label_set(10, "t2", "v3", "xxx", "yy"))
+ on (foo, t2) group_left (t1, noxxx)
(label_set(100, "t1", "v1") or label_set(time(), "t2", "v3", "noxxx", "aa"))
(label_set(100, "t1", "v1"), label_set(time(), "t2", "v3", "noxxx", "aa"))
)`
r1 := netstorage.Result{
MetricName: metricNameExpected,
@@ -2335,10 +2373,6 @@ func TestExecSuccess(t *testing.T) {
Key: []byte("noxxx"),
Value: []byte("aa"),
},
{
Key: []byte("t1"),
Value: []byte("v123"),
},
{
Key: []byte("t2"),
Value: []byte("v3"),

View File

@@ -285,7 +285,7 @@ func getRollupConfigs(name string, rf rollupFunc, expr metricsql.Expr, start, en
case "aggr_over_time":
aggrFuncNames, err := getRollupAggrFuncNames(expr)
if err != nil {
return nil, nil, fmt.Errorf("invalid args to %s: %s", expr.AppendString(nil), err)
return nil, nil, fmt.Errorf("invalid args to %s: %w", expr.AppendString(nil), err)
}
for _, aggrFuncName := range aggrFuncNames {
if rollupFuncsRemoveCounterResets[aggrFuncName] {

View File

@@ -286,7 +286,7 @@ var (
var buf [8]byte
if _, err := rand.Read(buf[:]); err != nil {
// do not use logger.Panicf, since it isn't initialized yet.
panic(fmt.Errorf("FATAL: cannot read random data for rollupResultCacheKeyPrefix: %s", err))
panic(fmt.Errorf("FATAL: cannot read random data for rollupResultCacheKeyPrefix: %w", err))
}
return encoding.UnmarshalUint64(buf[:])
}()
@@ -414,7 +414,7 @@ func (mi *rollupResultCacheMetainfo) Unmarshal(src []byte) error {
for i := 0; i < entriesLen; i++ {
tail, err := mi.entries[i].Unmarshal(src)
if err != nil {
return fmt.Errorf("cannot unmarshal entry #%d: %s", i, err)
return fmt.Errorf("cannot unmarshal entry #%d: %w", i, err)
}
src = tail
}

View File

@@ -217,7 +217,7 @@ func (ts *timeseries) unmarshalFastNoTimestamps(src []byte) ([]byte, error) {
tail, err := unmarshalMetricNameFast(&ts.MetricName, src)
if err != nil {
return tail, fmt.Errorf("cannot unmarshal MetricName: %s", err)
return tail, fmt.Errorf("cannot unmarshal MetricName: %w", err)
}
src = tail
@@ -275,7 +275,7 @@ func unmarshalMetricNameFast(mn *storage.MetricName, src []byte) ([]byte, error)
tail, metricGroup, err := unmarshalBytesFast(src)
if err != nil {
return tail, fmt.Errorf("cannot unmarshal MetricGroup: %s", err)
return tail, fmt.Errorf("cannot unmarshal MetricGroup: %w", err)
}
src = tail
mn.MetricGroup = metricGroup[:len(metricGroup):len(metricGroup)]
@@ -292,13 +292,13 @@ func unmarshalMetricNameFast(mn *storage.MetricName, src []byte) ([]byte, error)
for i := range mn.Tags {
tail, key, err := unmarshalBytesFast(src)
if err != nil {
return tail, fmt.Errorf("cannot unmarshal key for tag[%d]: %s", i, err)
return tail, fmt.Errorf("cannot unmarshal key for tag[%d]: %w", i, err)
}
src = tail
tail, value, err := unmarshalBytesFast(src)
if err != nil {
return tail, fmt.Errorf("cannot unmarshal value for tag[%d]: %s", i, err)
return tail, fmt.Errorf("cannot unmarshal value for tag[%d]: %w", i, err)
}
src = tail

View File

@@ -414,7 +414,7 @@ func transformHistogramShare(tfa *transformFuncArg) ([]*timeseries, error) {
}
les, err := getScalar(args[0], 0)
if err != nil {
return nil, fmt.Errorf("cannot parse le: %s", err)
return nil, fmt.Errorf("cannot parse le: %w", err)
}
// Convert buckets with `vmrange` labels to buckets with `le` labels.
@@ -425,7 +425,7 @@ func transformHistogramShare(tfa *transformFuncArg) ([]*timeseries, error) {
if len(args) > 2 {
s, err := getString(args[2], 2)
if err != nil {
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %s", err)
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %w", err)
}
boundsLabel = s
}
@@ -513,7 +513,7 @@ func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
}
phis, err := getScalar(args[0], 0)
if err != nil {
return nil, fmt.Errorf("cannot parse phi: %s", err)
return nil, fmt.Errorf("cannot parse phi: %w", err)
}
// Convert buckets with `vmrange` labels to buckets with `le` labels.
@@ -524,7 +524,7 @@ func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
if len(args) > 2 {
s, err := getString(args[2], 2)
if err != nil {
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %s", err)
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %w", err)
}
boundsLabel = s
}
@@ -1034,7 +1034,7 @@ func transformLabelMap(tfa *transformFuncArg) ([]*timeseries, error) {
}
label, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot read label name: %s", err)
return nil, fmt.Errorf("cannot read label name: %w", err)
}
srcValues, dstValues, err := getStringPairs(args[2:])
if err != nil {
@@ -1179,7 +1179,7 @@ func transformLabelTransform(tfa *transformFuncArg) ([]*timeseries, error) {
r, err := metricsql.CompileRegexp(regex)
if err != nil {
return nil, fmt.Errorf(`cannot compile regex %q: %s`, regex, err)
return nil, fmt.Errorf(`cannot compile regex %q: %w`, regex, err)
}
return labelReplace(args[0], label, r, label, replacement)
}
@@ -1208,7 +1208,7 @@ func transformLabelReplace(tfa *transformFuncArg) ([]*timeseries, error) {
r, err := metricsql.CompileRegexpAnchored(regex)
if err != nil {
return nil, fmt.Errorf(`cannot compile regex %q: %s`, regex, err)
return nil, fmt.Errorf(`cannot compile regex %q: %w`, regex, err)
}
return labelReplace(args[0], srcLabel, r, dstLabel, replacement)
}
@@ -1219,6 +1219,9 @@ func labelReplace(tss []*timeseries, srcLabel string, r *regexp.Regexp, dstLabel
mn := &ts.MetricName
dstValue := getDstValue(mn, dstLabel)
srcValue := mn.GetTagValue(srcLabel)
if !r.Match(srcValue) {
continue
}
b := r.ReplaceAll(srcValue, replacementBytes)
*dstValue = append((*dstValue)[:0], b...)
if len(b) == 0 {
@@ -1235,7 +1238,7 @@ func transformLabelValue(tfa *transformFuncArg) ([]*timeseries, error) {
}
labelName, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot get label name: %s", err)
return nil, fmt.Errorf("cannot get label name: %w", err)
}
rvs := args[0]
for _, ts := range rvs {
@@ -1262,15 +1265,15 @@ func transformLabelMatch(tfa *transformFuncArg) ([]*timeseries, error) {
}
labelName, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot get label name: %s", err)
return nil, fmt.Errorf("cannot get label name: %w", err)
}
labelRe, err := getString(args[2], 2)
if err != nil {
return nil, fmt.Errorf("cannot get regexp: %s", err)
return nil, fmt.Errorf("cannot get regexp: %w", err)
}
r, err := metricsql.CompileRegexpAnchored(labelRe)
if err != nil {
return nil, fmt.Errorf(`cannot compile regexp %q: %s`, labelRe, err)
return nil, fmt.Errorf(`cannot compile regexp %q: %w`, labelRe, err)
}
tss := args[0]
rvs := tss[:0]
@@ -1290,15 +1293,15 @@ func transformLabelMismatch(tfa *transformFuncArg) ([]*timeseries, error) {
}
labelName, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot get label name: %s", err)
return nil, fmt.Errorf("cannot get label name: %w", err)
}
labelRe, err := getString(args[2], 2)
if err != nil {
return nil, fmt.Errorf("cannot get regexp: %s", err)
return nil, fmt.Errorf("cannot get regexp: %w", err)
}
r, err := metricsql.CompileRegexpAnchored(labelRe)
if err != nil {
return nil, fmt.Errorf(`cannot compile regexp %q: %s`, labelRe, err)
return nil, fmt.Errorf(`cannot compile regexp %q: %w`, labelRe, err)
}
tss := args[0]
rvs := tss[:0]
@@ -1398,7 +1401,7 @@ func newTransformFuncSortByLabel(isDesc bool) transformFunc {
}
label, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot parse label name for sorting: %s", err)
return nil, fmt.Errorf("cannot parse label name for sorting: %w", err)
}
rvs := args[0]
sort.SliceStable(rvs, func(i, j int) bool {

View File

@@ -9,6 +9,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@@ -28,8 +29,27 @@ var (
bigMergeConcurrency = flag.Int("bigMergeConcurrency", 0, "The maximum number of CPU cores to use for big merges. Default value is used if set to 0")
smallMergeConcurrency = flag.Int("smallMergeConcurrency", 0, "The maximum number of CPU cores to use for small merges. Default value is used if set to 0")
denyQueriesOutsideRetention = flag.Bool("denyQueriesOutsideRetention", false, "Whether to deny queries outside of the configured -retentionPeriod. "+
"When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. "+
"This may be useful when multiple data sources with distinct retentions are hidden behind query-tee")
)
// CheckTimeRange returns true if the given tr is denied for querying.
func CheckTimeRange(tr storage.TimeRange) error {
if !*denyQueriesOutsideRetention {
return nil
}
minAllowedTimestamp := (int64(fasttime.UnixTimestamp()) - int64(*retentionPeriod)*3600*24*30) * 1000
if tr.MinTimestamp > minAllowedTimestamp {
return nil
}
return &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf("the given time range %s is outside the allowed retention of %d months according to -denyQueriesOutsideRetention", &tr, *retentionPeriod),
StatusCode: http.StatusServiceUnavailable,
}
}
// Init initializes vmstorage.
func Init() {
InitWithoutMetrics()
@@ -171,7 +191,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
w.Header().Set("Content-Type", "application/json")
snapshotPath, err := Storage.CreateSnapshot()
if err != nil {
err = fmt.Errorf("cannot create snapshot: %s", err)
err = fmt.Errorf("cannot create snapshot: %w", err)
jsonResponseError(w, err)
return true
}
@@ -185,7 +205,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
w.Header().Set("Content-Type", "application/json")
snapshots, err := Storage.ListSnapshots()
if err != nil {
err = fmt.Errorf("cannot list snapshots: %s", err)
err = fmt.Errorf("cannot list snapshots: %w", err)
jsonResponseError(w, err)
return true
}
@@ -202,7 +222,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
w.Header().Set("Content-Type", "application/json")
snapshotName := r.FormValue("snapshot")
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
err = fmt.Errorf("cannot delete snapshot %q: %s", snapshotName, err)
err = fmt.Errorf("cannot delete snapshot %q: %w", snapshotName, err)
jsonResponseError(w, err)
return true
}
@@ -212,13 +232,13 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
w.Header().Set("Content-Type", "application/json")
snapshots, err := Storage.ListSnapshots()
if err != nil {
err = fmt.Errorf("cannot list snapshots: %s", err)
err = fmt.Errorf("cannot list snapshots: %w", err)
jsonResponseError(w, err)
return true
}
for _, snapshotName := range snapshots {
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
err = fmt.Errorf("cannot delete snapshot %q: %s", snapshotName, err)
err = fmt.Errorf("cannot delete snapshot %q: %w", snapshotName, err)
jsonResponseError(w, err)
return true
}

View File

@@ -1,20 +1,11 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__inputs": [],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "6.7.2"
"version": "7.0.3"
},
{
"type": "panel",
@@ -36,8 +27,8 @@
},
{
"type": "panel",
"id": "table",
"name": "Table",
"id": "table-old",
"name": "Table (old)",
"version": ""
},
{
@@ -65,7 +56,7 @@
"gnetId": 10229,
"graphTooltip": 0,
"id": null,
"iteration": 1589923637424,
"iteration": 1593345560631,
"links": [
{
"icon": "doc",
@@ -96,7 +87,7 @@
"panels": [
{
"collapsed": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"gridPos": {
"h": 1,
"w": 24,
@@ -110,8 +101,14 @@
},
{
"content": "<div style=\"text-align: center; font-size: 2em\">$version</div>",
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 8,
@@ -134,8 +131,14 @@
},
{
"columns": [],
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Run VM with `-help` flag to see all the available flags with description and default values",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fontSize": "100%",
"gridPos": {
"h": 9,
@@ -216,7 +219,7 @@
"timeShift": null,
"title": "Flags",
"transform": "table",
"type": "table"
"type": "table-old"
},
{
"cacheTimeout": null,
@@ -227,8 +230,14 @@
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "How many datapoints are in storage",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "short",
"gauge": {
"maxValue": 100,
@@ -311,8 +320,14 @@
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "How many entries inverted index contains. This value is proportional to the number of unique timeseries in storage(cardinality).",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "short",
"gauge": {
"maxValue": 100,
@@ -395,7 +410,13 @@
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "s",
"gauge": {
"maxValue": 100,
@@ -444,7 +465,7 @@
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"tableColumn": "vm_app_uptime_seconds{instance=\"victoriametrics:8428\", job=\"victoriametrics\"}",
"targets": [
{
"expr": "vm_app_uptime_seconds{job=\"$job\"}",
@@ -470,7 +491,7 @@
},
{
"collapsed": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"gridPos": {
"h": 1,
"w": 24,
@@ -487,8 +508,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -581,8 +608,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "The less time it takes is better.\n* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -675,8 +708,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. \n\nSee following link for details:",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -772,8 +811,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with -`memory.allowedPercent` flag. Line `max allowed` shows max allowed memory size for cache.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -877,8 +922,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Shows how many ongoing insertions are taking place.\n* `max` - equal to number of CPU * 2\n* `current` - current number of goroutines busy with inserting rows into storage\n\nWhen `current` hits `max` constantly, it means storage is overloaded and require more CPU.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -987,8 +1038,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1078,7 +1135,7 @@
},
{
"collapsed": true,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"gridPos": {
"h": 1,
"w": 24,
@@ -1092,8 +1149,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "How many datapoints are inserted into storage per second",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1192,8 +1255,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*<ingestion_rate>`, since VictoriaMetrics pushes pending data to persistent storage every second.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1298,8 +1367,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1349,7 +1424,7 @@
"refId": "A"
},
{
"expr": "sum(vm_rows{job=\"$job\", type != \"indexdb\"}) / sum(vm_data_size_bytes{job=\"$job\", type!=\"indexdb\"})",
"expr": "sum(vm_data_size_bytes{job=\"$job\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", type != \"indexdb\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
@@ -1404,8 +1479,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1495,8 +1576,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1597,8 +1684,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Shows amount of on-disk space occupied by inverted index.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1687,8 +1780,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1776,8 +1875,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "The number of rows merged per second by storage nodes.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1865,8 +1970,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -1958,8 +2069,14 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -2052,7 +2169,7 @@
},
{
"collapsed": true,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"gridPos": {
"h": 1,
"w": 24,
@@ -2066,15 +2183,21 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Shows how many of new time-series are created every second. High churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 13
"y": 4
},
"hiddenSeries": false,
"id": 66,
@@ -2154,15 +2277,21 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "Slow queries rate according to `search.logSlowQueryDuration` flag, which is `5s` by default.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 13
"y": 4
},
"hiddenSeries": false,
"id": 60,
@@ -2247,15 +2376,21 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "The percentage of slow inserts comparing to total insertion rate during the last 5 minutes. \n\nThe less value is better. If percentage remains high (>50%) during extended periods of time, then it is likely more RAM is needed for optimal handling of the current number of active time series. \n\nIn general, VictoriaMetrics requires ~1KB or RAM per active time series, so it should be easy calculating the required amounts of RAM for the current workload according to capacity planning docs. But the resulting number may be far from the real number because the required amounts of memory depends on may other factors such as the number of labels per time series and the length of label values.",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 21
"y": 12
},
"hiddenSeries": false,
"id": 68,
@@ -2342,7 +2477,7 @@
},
{
"collapsed": true,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"gridPos": {
"h": 1,
"w": 24,
@@ -2356,15 +2491,21 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 44
"y": 5
},
"hiddenSeries": false,
"id": 44,
@@ -2464,14 +2605,20 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 44
"y": 5
},
"hiddenSeries": false,
"id": 57,
@@ -2555,14 +2702,20 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 52
"y": 13
},
"hiddenSeries": false,
"id": 47,
@@ -2646,14 +2799,20 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 52
"y": 13
},
"hiddenSeries": false,
"id": 42,
@@ -2736,14 +2895,20 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 60
"y": 21
},
"hiddenSeries": false,
"id": 48,
@@ -2827,15 +2992,21 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 60
"y": 21
},
"hiddenSeries": false,
"id": 37,
@@ -2920,15 +3091,21 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 68
"y": 29
},
"hiddenSeries": false,
"id": 49,
@@ -3014,24 +3191,42 @@
}
],
"refresh": "30s",
"schemaVersion": 22,
"schemaVersion": 25,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "label_values(vm_app_version, job)",
"current": {
"selected": false,
"text": "VictoriaMetrics",
"value": "VictoriaMetrics"
},
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": null,
"current": {},
"datasource": "$datasource",
"definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
"hide": 0,
"includeAll": false,
"index": -1,
"label": null,
"multi": false,
"name": "job",
"options": [],
"query": "label_values(vm_app_version, job)",
"query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
@@ -3045,11 +3240,10 @@
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"datasource": "$datasource",
"definition": "label_values(vm_app_version{job=\"$job\"}, version)",
"hide": 2,
"includeAll": false,
"index": -1,
"label": null,
"multi": false,
"name": "version",
@@ -3073,7 +3267,6 @@
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
@@ -3099,8 +3292,5 @@
"timezone": "",
"title": "VictoriaMetrics",
"uid": "wNf0q_kZk",
"variables": {
"list": []
},
"version": 1
}

View File

@@ -2,9 +2,9 @@
DOCKER_NAMESPACE := victoriametrics
ROOT_IMAGE ?= scratch
CERTS_IMAGE := alpine:3.11
GO_BUILDER_IMAGE := golang:1.14.3
ROOT_IMAGE ?= alpine:3.12
CERTS_IMAGE := alpine:3.12
GO_BUILDER_IMAGE := golang:1.14.4
BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr : _)
BASE_IMAGE := local/base:1.1.1-$(shell echo $(ROOT_IMAGE) | tr : _)-$(shell echo $(CERTS_IMAGE) | tr : _)

View File

@@ -2,7 +2,7 @@
#### Docker compose
To spin-up setup of VictoriaMetrics, Prometheus and Grafana run following command:
To spin-up setup of VictoriaMetrics, vmagent and Grafana run following command:
`docker-compose up`
@@ -13,11 +13,11 @@ VictoriaMetrics opens following ports:
* `--opentsdbListenAddr=:4242`
* `--httpListenAddr=:8428`
##### Prometheus
##### vmagent
To access service open following [link](http://localhost:9090).
Prometheus is already configured to use VictoriaMetrics as remote storage.
vmagent is used for scraping and pushing timeseries to
VictoriaMetrics instance. It accepts Prometheus-compatible
configuration `prometheus.yml` with listed targets for scraping.
##### Grafana

View File

@@ -1,18 +1,18 @@
version: '3.5'
services:
prometheus:
container_name: prometheus
image: prom/prometheus:v2.17.2
vmagent:
container_name: vmagent
image: victoriametrics/vmagent
depends_on:
- "victoriametrics"
ports:
- 9090:9090
- 8429:8429
volumes:
- promdata:/prometheus
- vmagentdata:/vmagentdata
- ./prometheus.yml:/etc/prometheus/prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--promscrape.config=/etc/prometheus/prometheus.yml'
- '--remoteWrite.url=http://victoriametrics:8428/api/v1/write'
networks:
- vm_net
restart: always
@@ -35,13 +35,7 @@ services:
restart: always
grafana:
container_name: grafana
image: grafana/grafana:6.7.2
entrypoint: >
/bin/sh -c "
cd /var/lib/grafana &&
mkdir -p dashboards &&
sed 's/$${DS_PROMETHEUS}/Prometheus/g' vm.json > dashboards/vm.json &&
/run.sh"
image: grafana/grafana:7.0.3
depends_on:
- "victoriametrics"
ports:
@@ -49,12 +43,12 @@ services:
volumes:
- grafanadata:/var/lib/grafana
- ./provisioning/:/etc/grafana/provisioning/
- ./../../dashboards/victoriametrics.json:/var/lib/grafana/vm.json
- ./../../dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json
networks:
- vm_net
restart: always
volumes:
promdata: {}
vmagentdata: {}
vmdata: {}
grafanadata: {}
networks:

View File

@@ -1,16 +1,10 @@
global:
scrape_interval: 10s
evaluation_interval: 10s
remote_write:
- url: "http://victoriametrics:8428/api/v1/write"
queue_config:
max_samples_per_send: 10000
scrape_configs:
- job_name: 'prometheus'
- job_name: 'vmagent'
static_configs:
- targets: ['prometheus:9090']
- targets: ['vmagent:8429']
- job_name: 'victoriametrics'
static_configs:
- targets: ['victoriametrics:8428']

View File

@@ -1,14 +1,8 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
- name: VictoriaMetrics
type: prometheus
access: proxy
url: http://victoriametrics:8428
isDefault: false
isDefault: true

View File

@@ -37,3 +37,4 @@
* [Benchmarking time series workloads on Apache Kudu using TSBS](https://blog.cloudera.com/benchmarking-time-series-workloads-on-apache-kudu-using-tsbs/)
* [What are Open Source Time Series Databases?](https://www.iunera.com/kraken/fabric/time-series-database/)
* [Evaluating performance and correctness](https://www.robustperception.io/evaluating-performance-and-correctness)
* [Running VictoriaMetrics on Raspberry PI](https://stas.starikevich.com/posts/raspberry-pi-4-prometheus/)

View File

@@ -1,7 +1,7 @@
<img alt="Victoria Metrics" src="logo.png">
# Cluster version
<img alt="Victoria Metrics" src="logo.png">
VictoriaMetrics is fast, cost-effective and scalable time series database. It can be used as a long-term remote storage for Prometheus.
It is recommended using [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics) instead of cluster version
@@ -112,11 +112,12 @@ Run `make package`. It will build the following docker images locally:
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package`.
By default images are built on top of `scratch` image. It is possible to build on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds images on top of `alpine:3.11` image:
By default images are built on top of [alpine](https://hub.docker.com/_/scratch) image in order to improve debuggability.
It is possible to build an image on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
For example, the following command builds images on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package
ROOT_IMAGE=scratch make package
```
## Operation
@@ -225,15 +226,6 @@ Steps to add `vmstorage` node:
3. Gradually restart all the `vminsert` nodes with new `-storageNode` arg containing `<new_vmstorage_host>:8400`.
### Cluster availability
* HTTP load balancer must stop routing requests to unavailable `vminsert` and `vmselect` nodes.
* The cluster remains available if at least a single `vmstorage` node exists:
- `vminsert` re-routes incoming data from unavailable `vmstorage` nodes to healthy `vmstorage` nodes
- `vmselect` continues serving partial responses if at least a single `vmstorage` node is available.
### Updating / reconfiguring cluster nodes
All the node types - `vminsert`, `vmselect` and `vmstorage` - may be updated via graceful shutdown.
@@ -244,6 +236,17 @@ Cluster should remain in working state if at least a single node of each type re
the update process. See [cluster availability](#cluster-availability) section for details.
### Cluster availability
* HTTP load balancer must stop routing requests to unavailable `vminsert` and `vmselect` nodes.
* The cluster remains available if at least a single `vmstorage` node exists:
- `vminsert` re-routes incoming data from unavailable `vmstorage` nodes to healthy `vmstorage` nodes
- `vmselect` continues serving partial responses if at least a single `vmstorage` node is available.
Data replication can be used for increasing storage durability. See [these docs](#replication-and-data-safety) for details.
### Capacity planning
Each instance type - `vminsert`, `vmselect` and `vmstorage` - can run on the most suitable hardware.
@@ -276,6 +279,18 @@ In general it is recommended increasing the number of vCPU cores and RAM per `vm
while adding new `vmselect` nodes only when old nodes are overloaded with incoming query stream.
### High availability
It is recommended to run all the components for a single cluster in the same subnetwork with high bandwidth, low latency and low error rates.
This improves cluster performance and availability.
It isn't recommended spreading components for a single cluster across multiple availability zones, since cross-AZ network usually has lower bandwidth, higher latency
and higher error rates comparing the network inside AZ.
If you need multi-AZ setup, then it is recommended running independed clusters in each AZ and setting up
[vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) in front of these clusters, so it could replicate incoming data
into all the cluster. Then [promxy](https://github.com/jacksontj/promxy) could be used for querying the data from multiple clusters.
### Helm
Helm chart simplifies managing cluster version of VictoriaMetrics in Kubernetes.
@@ -286,6 +301,17 @@ Upgrade follows `Cluster resizing procedure` under the hood.
### Replication and data safety
In order to enable application-level replication, `-replicationFactor=N` command-line flag must be passed to `vminsert`.
This guarantees that all the data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable.
For example, when `-replicationFactor=3` is passed to `vminsert`, then it replicates all the ingested data to 3 distinct `vmstorage` nodes.
When the replication is enabled, `-dedup.minScrapeInterval=1ms` command-line flag must be passed to `vmselect`
in order to de-duplicate replicated data during queries. It is OK if `-dedup.minScrapeInterval` exceeds 1ms
when [deduplication](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#deduplication) is used additionally to replication.
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
so it is recommended performing regular backups. See [these docs](#backups) for details.
By default VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
It is recommended storing data on [Google Compute Engine persistent disks](https://cloud.google.com/compute/docs/disks/#pdspecs),
since they are protected from data loss and data corruption. They also provide consistently high performance
@@ -294,14 +320,6 @@ HDD-based persistent disks should be enough for the majority of use cases.
It is recommended using durable replicated persistent volumes in Kubernetes.
If `-replicationFactor=N` command-line flag is passed to `vminsert`, then `vminsert` puts `N` copies of the ingested data to distinct `vmstorage` nodes.
This guarantees that all the data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable. Note that `-dedup.minScrapeInterval=1ms` command-line
flag must be passed to `vmselect` if `-replicationFactor` exceeds 1 in order to de-duplicate replicated data during queries.
It is OK if `-dedup.minScrapeInterval` exceeds 1ms.
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
so it is recommended performing regular backups. See [these docs](#backups) for details.
### Backups
@@ -341,8 +359,7 @@ Due to `KISS` cluster version of VictoriaMetrics has no the following "features"
- Fragile gossip protocols. See [failed attempt in Thanos](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md).
- Hard-to-understand-and-implement-properly [Paxos protocols](https://www.quora.com/In-distributed-systems-what-is-a-simple-explanation-of-the-Paxos-algorithm).
- Complex replication schemes, which may go nuts in unforesseen edge cases. The replication is offloaded to the underlying durable replicated storage
such as [persistent disks in Google Compute Engine](https://cloud.google.com/compute/docs/disks/#pdspecs).
- Complex replication schemes, which may go nuts in unforesseen edge cases. See [replication docs](#replication-and-data-safety) for details.
- Automatic data reshuffling between storage nodes, which may hurt cluster performance and availability.
- Automatic cluster resizing, which may cost you a lot of money if improperly configured.
- Automatic discovering and addition of new nodes in the cluster, which may mix data between dev and prod clusters :)

View File

@@ -2,64 +2,64 @@
### What is the main purpose of VictoriaMetrics?
To provide the best long-term [remote storage](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) solution for [Prometheus](https://prometheus.io/).
To provide the best monitoring solution.
### Who uses VictoriaMetrics?
See [case studies](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies).
### Which features does VictoriaMetrics have?
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
Additionally, VictoriaMetrics extends PromQL with opt-in [useful features](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL).
* High performance and good scalability for both [inserts](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b)
and [selects](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4).
[Outperforms InfluxDB and TimescaleDB by up to 20x](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
* [Uses 10x less RAM than InfluxDB](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) when working with millions of unique time series (aka high cardinality).
* High data compression, so [up to 70x more data points](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4)
may be crammed into a limited storage comparing to TimescaleDB.
* Optimized for storage with high-latency IO and low iops (HDD and network storage in AWS, Google Cloud, Microsoft Azure, etc). See [graphs from these benchmarks](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b).
* A single-node VictoriaMetrics may substitute moderately sized clusters built with competing solutions such as Thanos, M3DB, Cortex, InfluxDB or TimescaleDB.
See [vertical scalability benchmarks](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae)
and [comparing Thanos to VictoriaMetrics](https://medium.com/@valyala/comparing-thanos-to-victoriametrics-cluster-b193bea1683).
* Easy operation:
* VictoriaMetrics consists of a single executable without external dependencies.
* All the configuration is done via explicit command-line flags with reasonable defaults.
* All the data is stored in a single directory pointed by `-storageDataPath` flag.
* Easy backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
* Storage is protected from corruption on unclean shutdown (i.e. hardware reset or `kill -9`) thanks to [the storage architecture](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
* Supports metrics' ingestion and backfilling via the following protocols:
* [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
* [InfluxDB line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/)
* [Graphite plaintext protocol](https://graphite.readthedocs.io/en/latest/feeding-carbon.html) with [tags](https://graphite.readthedocs.io/en/latest/tags.html#carbon)
if `-graphiteListenAddr` is set.
* [OpenTSDB put message](http://opentsdb.net/docs/build/html/api_telnet/put.html) if `-opentsdbListenAddr` is set.
* Ideally works with big amounts of time series data from IoT sensors, connected car sensors and industrial sensors.
* Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
### Which clients do you target?
The following Prometheus users may be interested in VictoriaMetrics:
- Users who don't want to bother with Prometheus' local storage operational burden - backups, replication, capacity planning, scalability, etc.
- Users with multiple Prometheus instances who want performing arbitrary queries over all the metrics collected by their Prometheus instances (aka `global querying view`).
- Users who want reducing costs for storing huge amounts of time series data.
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#prominent-features).
### How to start using VictoriaMetrics?
Start with [single-node version](Single-server-VictoriaMetrics). It is easy to configure and operate. It should fit the majority of use cases.
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Quick-Start).
### Is it safe to enable [remote write storage](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) in Prometheus?
### What is the difference between vmagent and Prometheus?
Yes. Prometheus continues writing data to local storage after enabling remote storage write, so all the existing local storage data
While both [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Prometheus may scrape Prometheus targets (aka `/metrics` pages)
according to the provided Prometheus-compatible [scrape configs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config)
and send data to multiple remote storage systems, vmagent has the following additional features:
- vmagent usually requires lower amounts of CPU, RAM and disk IO comparing to Prometheus when scraping big number of targets (more than 1000)
or targets with big number of exposed metrics.
- vmagent provides independent disk-backed buffers per each configured remote storage (aka `-remoteWrite.url`). This means that slow or temporarily unavailable storage
doesn't prevent from sending data to healthy storage in parallel. Prometheus uses a single shared buffer for all the configured remote storage systems (aka `remote_write->url`)
with the hardcoded retention of 2 hours.
- vmagent may accept, relabel and filter data obtained via multiple data ingestion protocols additionally to data scraped from Prometheus targets.
I.e. it supports both `pull` and `push` protocols for data ingestion.
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#features) for details.
- vmagent may be used in different use cases:
- [IoT and edge monitoring](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#iot-and-edge-monitoring)
- [Drop-in replacement for Prometheus](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#drop-in-replacement-for-prometheus)
- [Replication and High Availability](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#replication-and-high-availability)
- [Relabeling and Filtering](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#relabeling-and-filtering)
- [Splitting data streams among multiple systems](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#splitting-data-streams-among-multiple-systems)
- [Prometheus remote_write proxy](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#prometheus-remote_write-proxy)
### Is it safe to enable [remote write](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) in Prometheus?
Yes. Prometheus continues writing data to local storage after enabling remote write, so all the existing local storage data
and new data is available for querying via Prometheus as usual.
It is recommended using [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) for scraping Prometheus targets
and writing data to VictoriaMetrics.
### How does VictoriaMetrics compare to other remote storage solutions for Prometheus such as [M3 from Uber](https://eng.uber.com/m3/), [Thanos](https://github.com/thanos-io/thanos), [Cortex](https://github.com/cortexproject/cortex), etc.?
VictoriaMetrics is simpler, faster, more cost-effective and it provides [MetricsQL with useful extensions for PromQL](MetricsQL). The simplicity is twofold:
- It is simpler to configure and operate. There is no need in configuring third-party [sidecars](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md)
or fighting with [gossip protocol](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md).
- VictoriaMetrics has simpler architecture, which means less bugs and more useful features in the long run comparing to competing TSDBs.
VictoriaMetrics is simpler, faster, more cost-effective and it provides [MetricsQL query language](MetricsQL) based on PromQL. The simplicity is twofold:
- It is simpler to configure and operate. There is no need in configuring [sidecars](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md),
fighting [gossip protocol](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md)
or setting up third-party systems such as [Consul](https://github.com/cortexproject/cortex/issues/157), [Cassandra](https://cortexmetrics.io/docs/production/cassandra/),
[DynamoDB](https://cortexmetrics.io/docs/production/aws/) or [Memcached](https://cortexmetrics.io/docs/production/caching/).
- VictoriaMetrics has simpler architecture. This means less bugs and more useful features in the long run comparing to competing TSDBs.
See [comparing Thanos to VictoriaMetrics cluster](https://medium.com/@valyala/comparing-thanos-to-victoriametrics-cluster-b193bea1683)
and [Remote Write Storage Wars](https://promcon.io/2019-munich/talks/remote-write-storage-wars/) talk from [PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
@@ -70,55 +70,68 @@ VictoriaMetrics also [uses less RAM than Thanos components](https://github.com/t
### What is the difference between VictoriaMetrics and [Cortex](https://github.com/cortexproject/cortex)?
VictoriaMetrics is similar to Cortex in the following aspects:
- Both systems accept data from Prometheus via standard [remote_write API](https://prometheus.io/docs/practices/remote_write/),
i.e. there is no need in running sidecars unlike in [Thanos](https://github.com/thanos-io/thanos) case.
- Both systems support multi-tenancy out of the box. See [the corresponding docs for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format).
- Both systems accept data from [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) or Prometheus
via standard [remote_write API](https://prometheus.io/docs/practices/remote_write/), i.e. there is no need in running sidecars
unlike in [Thanos](https://github.com/thanos-io/thanos) case.
- Both systems support multi-tenancy out of the box. See [the corresponding docs for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#multitenancy).
- Both systems support data replication. See [replication in Cortex](https://github.com/cortexproject/cortex/blob/fe56f1420099aa1bf1ce09316c186e05bddee879/docs/architecture.md#hashing) and [replication in VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety).
- Both systems scale horizontally to multiple nodes. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#cluster-resizing-and-scalability) for details.
- Both systems support alerting and recording rules via the corresponding tools such as [vmalert](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md).
The main differences between Cortex and VictoriaMetrics:
- Cortex re-uses Prometheus source code, while VictoriaMetrics is written from scratch.
- Cortex provides [Ruler](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#ruler) and [Alertmanager](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#alertmanager) components,
which are currently missing in VictoriaMetrics. However, these components can be substituted by [Promxy](https://github.com/jacksontj/promxy#how-do-i-use-alertingrecording-rules-in-promxy).
- Cortex heavily relies on third-party services such as Consul, Memcache, DynamoDB, BigTable, Cassandra, etc.
This may increase operational complexity and reduce system reliability comparing to VictoriaMetrics' case,
which doesn't use any external services. Compare [Cortex Architecture](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md)
to [VictoriaMetrics architecture](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#architecture-overview).
- VictoriaMetrics provides [production-ready single-node solution](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md),
which is much easier to setup and operate than Cortex cluster.
- Cortex may lose up to 12 hours of recent data on Ingestor failure - see [the corresponding docs](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#ingesters-failure-and-data-loss).
- Cortex may lose up to 12 hours of recent data on Ingestor failure - see [the corresponding docs](https://github.com/cortexproject/cortex/blob/fe56f1420099aa1bf1ce09316c186e05bddee879/docs/architecture.md#ingesters-failure-and-data-loss).
VictoriaMetrics may lose only a few seconds of recent data, which isn't synced to persistent storage yet.
See [this article for details](https://medium.com/@valyala/wal-usage-looks-broken-in-modern-time-series-databases-b62a627ab704).
- Cortex is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
- Cortex is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/) and [other case studies](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies).
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to Prometheus remote_write protocol - InfluxDB, OpenTSDB, Graphite, CSV.
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
### What is the difference between VictoriaMetrics and [Thanos](https://github.com/thanos-io/thanos)?
- Thanos re-uses Prometheus source code, while VictoriaMetrics is written from scratch.
- Thanos provides [Ruler component](https://github.com/thanos-io/thanos/blob/master/docs/components/rule.md),
while VictoriaMetrics relies on [Promxy for alerting and recording rules](https://github.com/jacksontj/promxy#how-do-i-use-alertingrecording-rules-in-promxy).
- VictoriaMetrics accepts data via [standard remote_write API for Prometheus](https://prometheus.io/docs/practices/remote_write/),
while Thanos uses non-standard [Sidecar](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md), which must run alongside each Prometheus instance.
- Thanos Sidecar requires disabling data compaction in Prometheus, which may hurt Prometheus performance and increase RAM usage.
- Thanos stores data on object storage (Amazon S3 or Google GCS), while VictoriaMetrics stores data on block storage (GCP persistent disks, Amazon EBS or bare metal HDD).
- Thanos Sidecar requires disabling data compaction in Prometheus, which may hurt Prometheus performance and increase RAM usage. See [these docs](https://thanos.io/components/sidecar.md/) for more details.
- Thanos stores data in object storage (Amazon S3 or Google GCS), while VictoriaMetrics stores data in block storage
([GCP persistent disks](https://cloud.google.com/compute/docs/disks#pdspecs), Amazon EBS or bare metal HDD).
While object storage is usually less expensive, block storage provides much lower latencies and higher throughput.
VictoriaMetrics works perfectly with HDD-based block storage - there is no need in using more expensive SSD or NVMe disks in most cases.
- Thanos may lose up to 2 hours of recent data, which wasn't uploaded yet to object storage. VictoriaMetrics may lose only a few seconds of recent data,
which isn't synced to persistent storage yet. See [this article for details](https://medium.com/@valyala/wal-usage-looks-broken-in-modern-time-series-databases-b62a627ab704).
- VictoriaMetrics provides [production-ready single-node solution](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md),
which is much easier to setup and operate than Thanos components.
- Thanos may be harder to setup and operate comparing to VictoriaMetrics, since it has more moving parts, which can be connected with less reliable networks.
See [this article for details](https://medium.com/faun/comparing-thanos-to-victoriametrics-cluster-b193bea1683).
- Thanos is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to Prometheus remote_write protocol - InfluxDB, OpenTSDB, Graphite, CSV.
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
### How does VictoriaMetrics compare to [InfluxDB](https://www.influxdata.com/time-series-platform/influxdb/)?
VictoriaMetrics requires [10x less RAM](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) and it [works faster](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
It is easier to configure and operate. It provides [better query language](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085) than InfluxQL or Flux.
- VictoriaMetrics requires [10x less RAM](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) and it [works faster](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
- VictoriaMetrics provides [better query language](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085) than InfluxQL or Flux.
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to InfluxDB - Prometheus remote_write, OpenTSDB, Graphite, CSV.
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
### How does VictoriaMetrics compare to [TimescaleDB](https://www.timescale.com/)?
TimescaleDB insists on using SQL as a query language. While SQL is more powerful than PromQL, this power is rarely required during typical TSDB usage. Real-world queries usually [look clearer and simpler when written in PromQL than in SQL](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085).
Additionally, VictoriaMetrics requires [up to 70x less storage space comparing to TimescaleDB](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4) for storing the same amount of time series data.
- TimescaleDB insists on using SQL as a query language. While SQL is more powerful than PromQL, this power is rarely required during typical TSDB usage. Real-world queries usually [look clearer and simpler when written in PromQL than in SQL](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085).
- VictoriaMetrics requires [up to 70x less storage space comparing to TimescaleDB](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4) for storing the same amount of time series data. The gap in storage space usage can be lowered from 70x to 3x if [compression in TimescaleDB is properly configured](https://docs.timescale.com/latest/using-timescaledb/compression) (it isn't an easy task in general case :)).
- VictoriaMetrics accepts data in multiple popular data ingestion protocols - InfluxDB, OpenTSDB, Graphite, CSV, while TimescaleDB supports only SQL inserts.
### Does VictoriaMetrics use Prometheus technologies like other clustered TSDBs built on top of Prometheus such as [Thanos](https://github.com/thanos-io/thanos), [Cortex](https://github.com/cortexproject/cortex)?
### Does VictoriaMetrics use Prometheus technologies like other clustered TSDBs built on top of Prometheus such as [Thanos](https://github.com/thanos-io/thanos) or [Cortex](https://github.com/cortexproject/cortex)?
No. VictoriaMetrics core is written in Go from scratch by [fasthttp](https://github.com/valyala/fasthttp) [author](https://github.com/valyala).
The architecture is [optimized for storing and querying large amounts of time series data with high cardinality](https://medium.com/devopslinks/victoriametrics-creating-the-best-remote-storage-for-prometheus-5d92d66787ac). VictoriaMetrics storage uses [certain ideas from ClickHouse](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282). Special thanks to [Alexey Milovidov](https://github.com/alexey-milovidov).
@@ -151,7 +164,7 @@ The following commercial versions of VictoriaMetrics are planned:
* Managed cluster in the Cloud.
* SaaS version.
[Contact us](mailto:info@victoriametrics.com) for more information and for the pricing.
[Contact us](mailto:info@victoriametrics.com) for more information on our plans.
### Why VictoriaMetrics doesn't support [Prometheus remote read API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#%3Cremote_read%3E)?

View File

@@ -0,0 +1,74 @@
# Sample size calculations
These calculations are for the “Lowest sample size” graph at https://victoriametrics.com/ .
How many metrics can be stored in 2tb disk for 2 years?
Seconds in 2 years:
2 years * 365 days * 24 hours * 60 minutes * 60 seconds = 63072000 seconds
Resolution = 1 point per 10 second
That means each metric will contain 6307200 points.
2tb disk contains
2 (tb) * 1024 (gb) * 1024 (mb) * 1024 (kb) * 1024 (b) = 2199023255552 bytes
# VictoriaMetrics
Based on production data from our customers, sample size is 0.4 byte
That means one metric with 10 seconds resolution will need
6307200 points * 0.4 bytes/point = 2522880 bytes or 2.4 megabytes.
Calculation for number of metrics can be stored in 2 tb disk:
2199023255552 (disk size) / 2522880 (one metric for 2 year) = 871632 metrics
So in 2tb we can store 871 632 metrics
# Graphite
Based on https://m30m.github.io/whisper-calculator/ sample size of graphite metrics is 12b + 28b for each metric
That means, one metric with 10 second resolution will need 75686428 bytes or 72.18 megabytes
Calculation for number of metrics can be stored in 2 tb disk:
2199023255552 / 75686428 = 29 054 metrics
# OpenTSDB
Let's check official openTSDB site
http://opentsdb.net/faq.html
16 bytes of HBase overhead, 3 bytes for the metric, 4 bytes for the timestamp, 6 bytes per tag, 2 bytes of OpenTSDB overhead, up to 8 bytes for the value. Integers are stored with variable length encoding and can consume 1, 2, 4 or 8 bytes.
That means, one metric with 10 second resolution will need
6307200 * (1 + 4) + 3 + 16 + 2 = 31536021 bytes or 30 megabytes in the best scenario and
6307200 * (8 + 4) + 3 + 16 + 2 = 75686421 bytes or 72 megabytes in the worst scenario.
Calculation for number of metrics can be stored in 2 tb disk:
2199023255552 / 31536021 = 69 730 metrics for best scenario
2199023255552 / 75686421 = 29 054 metrics for worst scenario
Also, openTSDB allows to use compression
" LZO is able to achieve a compression factor of 4.2x "
So, let's multiply numbers on 4.2
69 730 * 4,2 = 292 866 metrics for best scenario
29 054 * 4,2 = 122 026 metrics for worst scenario
# m3db
Let's look at official m3db site https://m3db.github.io/m3/m3db/architecture/engine/
They can achieve a sample size of 1.45 bytes/datapoint
That means, one metric with 10 second resolution will need 9145440 bytes or 8,72177124 megabytes
Calculation for number of metrics can be stored in 2 tb disk:
2199023255552 / 9145440 = 240 450 metrics
# InfluxDB
Based on official influxDB site https://docs.influxdata.com/influxdb/v1.8/guides/hardware_sizing/#bytes-and-compression
"Non-string values require approximately three bytes". That means, one metric with 10 second resolution will need
6307200 * 3 = 18921600 bytes or 18 megabytes
Calculation for number of metrics can be stored in 2 tb disk:
2199023255552 / 18921600 = 116 217 metrics
# Prometheus
Let's check official site: https://prometheus.io/docs/prometheus/latest/storage/
"On average, Prometheus uses only around 1-2 bytes per sample."
That means, one metric with 10 second resolution will need
6307200 * 1 = 6307200 bytes in best scenario
6307200 * 2 = 12614400 bytes in worst scenario.
Calculation for number of metrics can be stored in 2 tb disk:
2199023255552 / 6307200 = 348 652 metrics for the best case
2199023255552 / 12614400 = 174 326 metrics for the worst cases

View File

@@ -21,6 +21,7 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
See our [Wiki](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki) for additional documentation.
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
See [features available for enterprise customers](https://github.com/VictoriaMetrics/VictoriaMetrics/issues?q=is%3Aissue+label%3Aenterprise).
## Case studies and talks
@@ -149,9 +150,9 @@ The following command-line flags are used the most:
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in current working directory.
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. Default period is 1 month.
* `-httpListenAddr` - TCP address to listen to for http requests. By default, it listens port `8428` on all the network interfaces.
Other flags have good enough default values, so set them only if you really need this.
VictoriaMetrics accepts [Prometheus querying API requests](#prometheus-querying-api-usage) on port `8428` by default.
Pass `-help` to see all the available flags with description and default values.
@@ -582,17 +583,18 @@ Run `make package-victoria-metrics`. It builds `victoriametrics/victoria-metrics
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-victoria-metrics`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image for improved debuggability.
It is possible to build the package on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-victoria-metrics
ROOT_IMAGE=scratch make package-victoria-metrics
```
### Start with docker-compose
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
helps to spin up VictoriaMetrics, Prometheus and Grafana with one command.
helps to spin up VictoriaMetrics, [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Grafana with one command.
More details may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#folder-contains-basic-images-and-tools-for-building-and-running-victoria-metrics-in-docker).
### Setting up service
@@ -774,7 +776,13 @@ The required resources for query path:
### High availability
1) Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
2) Add addresses of these instances to `remote_write` section in Prometheus config:
2) Pass addresses of these instances to [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) via `-remoteWrite.url` command-line flag:
```bash
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
```
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
```yml
remote_write:
@@ -834,6 +842,11 @@ Just start multiple VictoriaMetrics instances with distinct values for the follo
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
Then set up [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md) in front of VictoriaMetrics instances,
so it could route requests from particular user to VictoriaMetrics with the desired retention.
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
### Downsampling
There is no downsampling support at the moment, but:
@@ -981,6 +994,10 @@ The most interesting metrics are:
VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` args on this page. By default `date` equals to the current date,
while `topN` equals to 10.
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
### Backfilling
@@ -1000,15 +1017,12 @@ for data with timestamps close to the current time.
### Replication
Single-node VictoriaMetrics relies on replicated durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs)
or [Amazon EBS](https://aws.amazon.com/ebs/). It is also recommended making periodic backups,
since [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
See [backup docs](#backups) for details.
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
Cluster version of VictoriaMetrics supports replication. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety)
for details.
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
See also [high availability docs](#high-availability) and [docs about cluster version of VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
See also [high availability docs](#high-availability) and [backup docs](#backups).
### Backups

View File

@@ -170,6 +170,8 @@ Additionally it provides the following extra actions:
* `replace_all`: replaces all the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
* `labelmap_all`: replaces all the occurences of `regex` in all the label names with the `replacement`.
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
The relabeling can be defined in the following places:
@@ -210,6 +212,14 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports.
Just add the following relabeling rule to `relabel_configs` section in order to filter out targets with unneeded ports:
```yml
- action: keep_if_equal
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
```
### How to build from sources
@@ -234,11 +244,11 @@ Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmagent
ROOT_IMAGE=scratch make package-vmagent
```

View File

@@ -1,19 +1,27 @@
## VM Alert
## vmalert
`vmalert` executes a list of given MetricsQL expressions (rules) and
sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
rules against configured address.
### Features:
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
expressions validation;
support and expressions validation;
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
support;
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
* Lightweight without extra dependencies.
### TODO:
* Support recording rules.
### Limitations:
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
may fail;
* by default, rules execution is sequential within one group, but persisting of execution results to remote
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
recording rule is reused in next one;
* there is no `query` function support in templates yet;
* `vmalert` has no UI, just an API for getting groups and rules statuses.
### QuickStart
@@ -26,10 +34,12 @@ make vmalert
The build binary will be placed to `VictoriaMetrics/bin` folder.
To start using `vmalert` you will need the following things:
* list of alert rules - PromQL/MetricsQL expressions to execute;
* list of rules - PromQL/MetricsQL expressions to execute;
* datasource address - reachable VictoriaMetrics instance for rules execution;
* notifier address - reachable Alertmanager instance for processing,
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
aggregating alerts and sending notifications.
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
compatible storage address for storing recording rules results and alerts state in for of timeseries. This is optional.
Then configure `vmalert` accordingly:
```
@@ -38,22 +48,106 @@ Then configure `vmalert` accordingly:
-notifier.url=http://localhost:9093
```
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata/rules0-good.rules)
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
similar to Prometheus rules and configured using YAML. Configuration examples may be found
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
```yaml
groups:
[ - <rule_group> ]
```
`vmalert` runs evaluation for every group in a separate goroutine.
Rules in group evaluated one-by-one sequentially.
#### Groups
`vmalert` also runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
Each group has following attributes:
```yaml
# The name of the group. Must be unique within a file.
name: <string>
# How often rules in the group are evaluated.
[ interval: <duration> | default = global.evaluation_interval ]
# How many rules execute at once. Increasing concurrency may speed
# up round execution speed.
[ concurrency: <integer> | default = 1 ]
rules:
[ - <rule> ... ]
```
#### Rules
There are two types of Rules:
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
Alerting rules allows to define alert conditions via [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
Recording rules allow you to precompute frequently needed or computationally expensive expressions
and save their result as a new set of time series.
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
within one group.
##### Alerting rules
The syntax for alerting rule is following:
```yaml
# The name of the alert. Must be a valid metric name.
alert: <string>
# The MetricsQL expression to evaluate.
expr: <string>
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
[ for: <duration> | default = 0s ]
# Labels to add or overwrite for each alert.
labels:
[ <labelname>: <tmpl_string> ]
# Annotations to add to each alert.
annotations:
[ <labelname>: <tmpl_string> ]
```
`vmalert` has no local storage and alerts state is stored in process memory. Hence, after reloading of `vmalert` process
alerts state will be lost. To avoid this situation, `vmalert` may be configured via following flags:
* `-remoteWrite.url` - URL to Victoria Metrics or VMInsert. `vmalert` will persist alerts state into the configured
address in form of timeseries with name `ALERTS` via remote-write protocol.
* `-remoteRead.url` - URL to Victoria Metrics or VMSelect. `vmalert` will try to restore alerts state from configured
address by querying `ALERTS` timeseries.
##### Recording rules
The syntax for recording rules is following:
```yaml
# The name of the time series to output to. Must be a valid metric name.
record: <string>
# The MetricsQL expression to evaluate.
expr: <string>
# Labels to add or overwrite before storing the result.
labels:
[ <labelname>: <labelvalue> ]
```
For recording rules to work `-remoteWrite.url` must specified.
#### WEB
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
`vmalert` may be configured with `-remoteWrite` flag to write alerts state in form of timeseries
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
may be used to recover alerts state on `vmalert` restarts if `-remoteRead` is configured.
### Configuration
@@ -64,22 +158,36 @@ Usage of vmalert:
Optional basic auth password for -datasource.url
-datasource.basicAuth.username string
Optional basic auth username for -datasource.url
-datasource.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used.
-datasource.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -datasource.url.
-datasource.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -datasource.url
-datasource.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -datasource.url.
-datasource.tlsServerName value
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used.
-datasource.url string
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
-evaluationInterval duration
How often to evaluate the rules. Default 1m (default 1m0s)
How often to evaluate the rules (default 1m0s)
-external.url string
External URL is used as alert's source for sent alerts to the notifier
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
Address to listen for http connections (default ":8880")
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-notifier.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used.
-notifier.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -notifier.url.
-notifier.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -notifier.url
-notifier.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -notifier.url.
-notifier.tlsServerName value
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used.
-notifier.url string
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
-remoteRead.basicAuth.password string
@@ -88,14 +196,40 @@ Usage of vmalert:
Optional basic auth username for -remoteRead.url
-remoteRead.lookback duration
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteRead.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used.
-remoteRead.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url.
-remoteRead.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteRead.url
-remoteRead.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url.
-remoteRead.tlsServerName value
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used.
-remoteRead.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remoteWrite.basicAuth.password string
Optional basic auth password for -remoteWrite.url
-remoteWrite.basicAuth.username string
Optional basic auth username for -remoteWrite.url
-remoteWrite.maxQueueSize
Defines the max number of pending datapoints to remote write endpoint
-remoteWrite.concurrency int
Defines number of readers that concurrently write into remote storage (default 1)
-remoteWrite.flushInterval duration
Defines interval of flushes to remote write endpoint (default 5s)
-remoteWrite.maxBatchSize int
Defines defines max number of timeseries to be flushed at once (default 1000)
-remoteWrite.maxQueueSize int
Defines the max number of pending datapoints to remote write endpoint (default 100000)
-remoteWrite.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used.
-remoteWrite.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url.
-remoteWrite.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteWrite.url
-remoteWrite.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url.
-remoteWrite.tlsServerName value
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used.
-remoteWrite.url string
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
-rule value
@@ -105,8 +239,10 @@ Usage of vmalert:
-rule /path/to/file. Path to a single file with alerting rules
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
-rule.validateExpressions
Whether to validate rules expressions via MetricsQL engine (default true)
-rule.validateTemplates
Indicates to validate annotation and label templates (default true)
Whether to validate annotation and label templates (default true)
```
Pass `-help` to `vmalert` in order to see the full list of supported

View File

@@ -23,7 +23,7 @@ Docker images for `vmauth` are available [here](https://hub.docker.com/r/victori
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, accounting, limits, etc.
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML, accounting, limits, etc.
### Auth config
@@ -110,11 +110,11 @@ Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker i
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmauth
ROOT_IMAGE=scratch make package-vmauth
```

View File

@@ -89,6 +89,8 @@ or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when
Do not forget removing old snapshots and backups when they are no longer needed for saving storage costs.
See also [vmbackuper tool](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for automating smart backups.
### How does it work?
@@ -121,6 +123,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
* Backups created from [single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md) cannot be restored
at [cluster VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md) and vice versa.
### Advanced usage
@@ -197,9 +201,9 @@ Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` dock
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmbackup
ROOT_IMAGE=scratch make package-vmbackup
```

View File

@@ -98,9 +98,9 @@ Run `make package-vmrestore`. It builds `victoriametrics/vmrestore:<PKG_TAG>` do
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmrestore`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmrestore
ROOT_IMAGE=scratch make package-vmrestore
```

24
go.mod
View File

@@ -1,7 +1,8 @@
module github.com/VictoriaMetrics/VictoriaMetrics
require (
cloud.google.com/go/storage v1.8.0
cloud.google.com/go v0.60.0 // indirect
cloud.google.com/go/storage v1.10.0
github.com/VictoriaMetrics/fastcache v1.5.7
// Do not use the original github.com/valyala/fasthttp because of issues
@@ -9,25 +10,24 @@ require (
github.com/VictoriaMetrics/fasthttp v1.0.1
github.com/VictoriaMetrics/metrics v1.11.3
github.com/VictoriaMetrics/metricsql v0.2.3
github.com/aws/aws-sdk-go v1.31.5
github.com/aws/aws-sdk-go v1.32.13
github.com/cespare/xxhash/v2 v2.1.1
github.com/golang/protobuf v1.4.2 // indirect
github.com/golang/snappy v0.0.1
github.com/klauspost/compress v1.10.6
github.com/valyala/fastjson v1.5.1
github.com/klauspost/compress v1.10.10
github.com/valyala/fastjson v1.5.3
github.com/valyala/fastrand v1.0.0
github.com/valyala/gozstd v1.7.0
github.com/valyala/histogram v1.0.1
github.com/valyala/quicktemplate v1.5.0
golang.org/x/mod v0.3.0 // indirect
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2 // indirect
go.opencensus.io v0.22.4 // indirect
golang.org/x/net v0.0.0-20200625001655-4c5254603344 // indirect
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
golang.org/x/sys v0.0.0-20200523222454-059865788121
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5 // indirect
google.golang.org/api v0.25.0
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece // indirect
golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae
golang.org/x/text v0.3.3 // indirect
golang.org/x/tools v0.0.0-20200630154851-b2d8b0336632 // indirect
google.golang.org/api v0.28.0
google.golang.org/grpc v1.30.0 // indirect
gopkg.in/yaml.v2 v2.3.0
honnef.co/go/tools v0.0.1-2020.1.4 // indirect
)
go 1.13

55
go.sum
View File

@@ -13,6 +13,8 @@ cloud.google.com/go v0.56.0 h1:WRz29PgAsVEyPSDHyk+0fpEkwEFyfhHn+JbksT6gIL4=
cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk=
cloud.google.com/go v0.57.0 h1:EpMNVUorLiZIELdMZbCYX/ByTFCdoYopYAGxaGVz9ms=
cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs=
cloud.google.com/go v0.60.0 h1:R+tDlceO7Ss+zyvtsdhTxacDyZ1k99xwskQ4FT7ruoM=
cloud.google.com/go v0.60.0/go.mod h1:yw2G51M9IfRboUH61Us8GqCeF1PzPblB823Mn2q2eAU=
cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o=
cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE=
cloud.google.com/go/bigquery v1.4.0 h1:xE3CPsOgttP4ACBePh79zTKALtXwn/Edhcr16R5hMWU=
@@ -20,6 +22,8 @@ cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvf
cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg=
cloud.google.com/go/bigquery v1.7.0 h1:a/O/bK/vWrYGOTFtH8di4rBxMZnmkjy+Y5LxpDwo+dA=
cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc=
cloud.google.com/go/bigquery v1.8.0 h1:PQcPefKFdaIzjQFbiyOgAqyx8q5djaE7x9Sqe712DPA=
cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ=
cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
cloud.google.com/go/datastore v1.1.0 h1:/May9ojXjRkPBNVrq+oWLqmWCkr4OU5uRY29bu0mRyQ=
cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk=
@@ -35,6 +39,8 @@ cloud.google.com/go/storage v1.6.0 h1:UDpwYIwla4jHGzZJaEJYx1tOejbgSoNqsAfHAUYe2r
cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk=
cloud.google.com/go/storage v1.8.0 h1:86K1Gel7BQ9/WmNWn7dTKMvTLFzwtBe5FNqYbi9X35g=
cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs=
cloud.google.com/go/storage v1.10.0 h1:STgFzyU5/8miMl0//zKh2aQeTyeaUH3WN9bSUiJ09bA=
cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
@@ -51,8 +57,8 @@ github.com/VictoriaMetrics/metricsql v0.2.3 h1:xGscDmLoeIV7+8qX/mdHnOY0vu4m+wHIV
github.com/VictoriaMetrics/metricsql v0.2.3/go.mod h1:UIjd9S0W1UnTWlJdM0wLS+2pfuPqjwqKoK8yTos+WyE=
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8=
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM=
github.com/aws/aws-sdk-go v1.31.5 h1:DFA7BzTydO4etqsTja+x7UfkOKQUv1xzEluLvNk81L0=
github.com/aws/aws-sdk-go v1.31.5/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0=
github.com/aws/aws-sdk-go v1.32.13 h1:zzyXF7SUxJcJa3hTcYCl1/Ey+kh2N8TjK5tWnL0wieo=
github.com/aws/aws-sdk-go v1.32.13/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
@@ -108,6 +114,10 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.4.1 h1:/exdXoGamhu5ONeUJH0deniYLWYvQwW66yvlfiiKTu0=
github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w=
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPgecwXBIDzw5no=
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
@@ -116,6 +126,7 @@ github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hf
github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/pprof v0.0.0-20200507031123-427632fa3b1c/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM=
@@ -132,8 +143,8 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.10.5 h1:7q6vHIqubShURwQz8cQK6yIe/xC3IF0Vm7TGfqjewrc=
github.com/klauspost/compress v1.10.5/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.10.6 h1:SP6zavvTG3YjOosWePXFDlExpKIWMTO4SE/Y8MZB2vI=
github.com/klauspost/compress v1.10.6/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.10.10 h1:a/y8CglcM7gLGYmlbP/stPE5sR3hbhFRUjCBfd/0B3I=
github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@@ -152,8 +163,8 @@ github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.12.0/go.mod h1:229t1eWu9UXTPmoUkbpN/fctKPBY4IJoFXQnxHGXy6E=
github.com/valyala/fastjson v1.5.1 h1:SXaQZVSwLjZOVhDEhjiCcDtnX0Feu7Z7A1+C5atpoHM=
github.com/valyala/fastjson v1.5.1/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
github.com/valyala/fastjson v1.5.3 h1:z4Z1Bll4WaXo+FXJoiCdW8ss7sKY2d/jYfE2ZzoT284=
github.com/valyala/fastjson v1.5.3/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
github.com/valyala/fastrand v1.0.0 h1:LUKT9aKer2dVQNUi3waewTbKV+7H17kvWFNKs2ObdkI=
github.com/valyala/fastrand v1.0.0/go.mod h1:HWqCzkrkg6QXT8V2EXWvXCoow7vLwOFN002oeRzjapQ=
github.com/valyala/gozstd v1.7.0 h1:Ljh5c9zboqLhwTI33al32R72iCZfn0mCbVGcFWbGwRQ=
@@ -170,10 +181,13 @@ go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.opencensus.io v0.22.3 h1:8sGtKOrtQqkN1bp2AtX+misvLIlOmsEsNd+9NIcPEm8=
go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.opencensus.io v0.22.4 h1:LYy1Hy3MJdrCdMwwzxA/dRok4ejH+RwNGbuoD9fCjto=
go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
@@ -229,8 +243,11 @@ golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5 h1:WQ8q63x+f/zpC8Ac1s9wLElVo
golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f h1:QBjCr1Fz5kw158VqdE9JfI9cJnl/ymnJWAdMuinqL7Y=
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2 h1:eDrdRpKgkcCqKZQwyZRyeFZgfqt37SL7Kv3tok06cKE=
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200625001655-4c5254603344 h1:vGXIOMxbNfDTk/aXCmfdLgkrSV+Z2tcbze+pEc3v5W4=
golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@@ -268,13 +285,18 @@ golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25 h1:OKbAoGs4fGM5cPLlVQLZGYkFC8OnOfgo6tt0Smf9XhM=
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200523222454-059865788121 h1:rITEj+UZHYC927n8GT97eC3zrpzXdb/voyeOuVKS46o=
golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae h1:Ih9Yo4hSPImZOpfGuA4bR/ORKTAbhZo2AbWNRCnevdo=
golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
@@ -313,8 +335,11 @@ golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjs
golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8=
golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5 h1:3KBjmg2slvQXATWW9cQJ6tsRc8hj1gsnwWyi1IzYk3o=
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200626171337-aa94e735be7f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200630154851-b2d8b0336632 h1:fxWeLV4ol1icb47+btwfzuf8gOkJE/PPo9bLjSvZzA8=
golang.org/x/tools v0.0.0-20200630154851-b2d8b0336632/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
@@ -333,8 +358,8 @@ google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/
google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
google.golang.org/api v0.24.0 h1:cG03eaksBzhfSIk7JRGctfp3lanklcOM/mTGvow7BbQ=
google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
google.golang.org/api v0.25.0 h1:LodzhlzZEUfhXzNUMIfVlf9Gr6Ua5MMtoFWh7+f47qA=
google.golang.org/api v0.25.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
google.golang.org/api v0.28.0 h1:jMF5hhVfMkTZwHW1SDpKq5CkgWLXOb31Foaca9Zr3oM=
google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
@@ -367,9 +392,11 @@ google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84 h1:pSLkPbrjnPyLDYU
google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380 h1:xriR1EgvKfkKxIoU2uUvrMVl+H26359loFFUleSMXFo=
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece h1:1YM0uhfumvoDu9sx8+RyWwTI63zoCQvI23IYFRlvte0=
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA=
google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA=
google.golang.org/genproto v0.0.0-20200626011028-ee7919e894b5 h1:a/Sqq5B3dGnmxhuJZIHFsIxhEkqElErr5TaU6IqBAj0=
google.golang.org/genproto v0.0.0-20200626011028-ee7919e894b5/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
@@ -381,6 +408,8 @@ google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8
google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60=
google.golang.org/grpc v1.29.1 h1:EC2SB8S04d2r73uptxphDSUG+kTKVgjRPF+N3xpxRB4=
google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk=
google.golang.org/grpc v1.30.0 h1:M5a8xTlYTxwMn5ZFkwhRabsygDY5G8TYLyQDBxJNAxE=
google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
@@ -394,6 +423,8 @@ google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA=
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
google.golang.org/protobuf v1.25.0 h1:Ejskq+SyPohKW+1uil0JJMtmHCgJPJ/qWTxr8qp+R4c=
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

View File

@@ -55,13 +55,13 @@ func (b *Backup) Run() error {
}
if err := dst.DeleteFile(fscommon.BackupCompleteFilename); err != nil {
return fmt.Errorf("cannot delete `backup complete` file at %s: %s", dst, err)
return fmt.Errorf("cannot delete `backup complete` file at %s: %w", dst, err)
}
if err := runBackup(src, dst, origin, concurrency); err != nil {
return err
}
if err := dst.CreateFile(fscommon.BackupCompleteFilename, []byte("ok")); err != nil {
return fmt.Errorf("cannot create `backup complete` file at %s: %s", dst, err)
return fmt.Errorf("cannot create `backup complete` file at %s: %w", dst, err)
}
return nil
}
@@ -74,17 +74,17 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
logger.Infof("obtaining list of parts at %s", src)
srcParts, err := src.ListParts()
if err != nil {
return fmt.Errorf("cannot list src parts: %s", err)
return fmt.Errorf("cannot list src parts: %w", err)
}
logger.Infof("obtaining list of parts at %s", dst)
dstParts, err := dst.ListParts()
if err != nil {
return fmt.Errorf("cannot list dst parts: %s", err)
return fmt.Errorf("cannot list dst parts: %w", err)
}
logger.Infof("obtaining list of parts at %s", origin)
originParts, err := origin.ListParts()
if err != nil {
return fmt.Errorf("cannot list origin parts: %s", err)
return fmt.Errorf("cannot list origin parts: %w", err)
}
backupSize := getPartsSize(srcParts)
@@ -97,7 +97,7 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
err = runParallel(concurrency, partsToDelete, func(p common.Part) error {
logger.Infof("deleting %s from %s", &p, dst)
if err := dst.DeletePart(p); err != nil {
return fmt.Errorf("cannot delete %s from %s: %s", &p, dst, err)
return fmt.Errorf("cannot delete %s from %s: %w", &p, dst, err)
}
atomic.AddUint64(&deletedParts, 1)
return nil
@@ -109,7 +109,7 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
return err
}
if err := dst.RemoveEmptyDirs(); err != nil {
return fmt.Errorf("cannot remove empty directories at %s: %s", dst, err)
return fmt.Errorf("cannot remove empty directories at %s: %w", dst, err)
}
}
@@ -122,7 +122,7 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
err = runParallel(concurrency, originCopyParts, func(p common.Part) error {
logger.Infof("server-side copying %s from %s to %s", &p, origin, dst)
if err := dst.CopyPart(origin, p); err != nil {
return fmt.Errorf("cannot copy %s from %s to %s: %s", &p, origin, dst, err)
return fmt.Errorf("cannot copy %s from %s to %s: %w", &p, origin, dst, err)
}
atomic.AddUint64(&copiedParts, 1)
return nil
@@ -144,17 +144,17 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
logger.Infof("uploading %s from %s to %s", &p, src, dst)
rc, err := src.NewReadCloser(p)
if err != nil {
return fmt.Errorf("cannot create reader for %s from %s: %s", &p, src, err)
return fmt.Errorf("cannot create reader for %s from %s: %w", &p, src, err)
}
sr := &statReader{
r: rc,
bytesRead: &bytesUploaded,
}
if err := dst.UploadPart(p, sr); err != nil {
return fmt.Errorf("cannot upload %s to %s: %s", &p, dst, err)
return fmt.Errorf("cannot upload %s to %s: %w", &p, dst, err)
}
if err = rc.Close(); err != nil {
return fmt.Errorf("cannot close reader for %s from %s: %s", &p, src, err)
return fmt.Errorf("cannot close reader for %s from %s: %w", &p, src, err)
}
return nil
}, func(elapsed time.Duration) {

View File

@@ -43,11 +43,11 @@ func (r *Restore) Run() error {
// Make sure VictoriaMetrics doesn't run during the restore process.
if err := fs.MkdirAllIfNotExist(r.Dst.Dir); err != nil {
return fmt.Errorf("cannot create dir %q: %s", r.Dst.Dir, err)
return fmt.Errorf("cannot create dir %q: %w", r.Dst.Dir, err)
}
flockF, err := fs.CreateFlockFile(r.Dst.Dir)
if err != nil {
return fmt.Errorf("cannot create lock file in %q; make sure VictoriaMetrics doesn't use the dir; error: %s", r.Dst.Dir, err)
return fmt.Errorf("cannot create lock file in %q; make sure VictoriaMetrics doesn't use the dir; error: %w", r.Dst.Dir, err)
}
defer fs.MustClose(flockF)
@@ -71,12 +71,12 @@ func (r *Restore) Run() error {
logger.Infof("obtaining list of parts at %s", src)
srcParts, err := src.ListParts()
if err != nil {
return fmt.Errorf("cannot list src parts: %s", err)
return fmt.Errorf("cannot list src parts: %w", err)
}
logger.Infof("obtaining list of parts at %s", dst)
dstParts, err := dst.ListParts()
if err != nil {
return fmt.Errorf("cannot list dst parts: %s", err)
return fmt.Errorf("cannot list dst parts: %w", err)
}
backupSize := getPartsSize(srcParts)
@@ -129,7 +129,7 @@ func (r *Restore) Run() error {
logger.Infof("deleting %s from %s", path, dst)
size, err := dst.DeletePath(path)
if err != nil {
return fmt.Errorf("cannot delete %s from %s: %s", path, dst, err)
return fmt.Errorf("cannot delete %s from %s: %w", path, dst, err)
}
deleteSize += size
}
@@ -137,14 +137,14 @@ func (r *Restore) Run() error {
return err
}
if err := dst.RemoveEmptyDirs(); err != nil {
return fmt.Errorf("cannot remove empty directories at %s: %s", dst, err)
return fmt.Errorf("cannot remove empty directories at %s: %w", dst, err)
}
}
// Re-read dstParts, since additional parts may be removed on the previous step.
dstParts, err = dst.ListParts()
if err != nil {
return fmt.Errorf("cannot list dst parts after the deletion: %s", err)
return fmt.Errorf("cannot list dst parts after the deletion: %w", err)
}
partsToCopy := common.PartsDifference(srcParts, dstParts)
@@ -166,17 +166,17 @@ func (r *Restore) Run() error {
logger.Infof("downloading %s from %s to %s", &p, src, dst)
wc, err := dst.NewWriteCloser(p)
if err != nil {
return fmt.Errorf("cannot create writer for %q to %s: %s", &p, dst, err)
return fmt.Errorf("cannot create writer for %q to %s: %w", &p, dst, err)
}
sw := &statWriter{
w: wc,
bytesWritten: &bytesDownloaded,
}
if err := src.DownloadPart(p, sw); err != nil {
return fmt.Errorf("cannot download %s to %s: %s", &p, dst, err)
return fmt.Errorf("cannot download %s to %s: %w", &p, dst, err)
}
if err := wc.Close(); err != nil {
return fmt.Errorf("cannot close reader from %s from %s: %s", &p, src, err)
return fmt.Errorf("cannot close reader from %s from %s: %w", &p, src, err)
}
}
return nil

Some files were not shown because too many files have changed in this diff Show More