mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-07 10:56:50 +03:00
Compare commits
94 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8da3f773ae | ||
|
|
9d5f5b6878 | ||
|
|
9a2ba5b6d1 | ||
|
|
b277ba8121 | ||
|
|
84a37098ed | ||
|
|
56ccfa5218 | ||
|
|
7c2c8b2981 | ||
|
|
d5dddb0953 | ||
|
|
586c5be404 | ||
|
|
1cd01b5359 | ||
|
|
88538df267 | ||
|
|
63e5ee0d29 | ||
|
|
eba4e92994 | ||
|
|
82ecfa3b32 | ||
|
|
dc4e3f0e0b | ||
|
|
8f2e88234f | ||
|
|
423825695f | ||
|
|
5dc0bf6d3d | ||
|
|
7eb171182b | ||
|
|
05d754d7bb | ||
|
|
8dec17470d | ||
|
|
5e35b87c3d | ||
|
|
c85d926569 | ||
|
|
f0cef4761b | ||
|
|
774f7ca1c1 | ||
|
|
a560b4788e | ||
|
|
8141541e61 | ||
|
|
e65b4cb6b1 | ||
|
|
7209d58fbd | ||
|
|
72c90bfd8b | ||
|
|
2a39ba639d | ||
|
|
8f0bcec6cc | ||
|
|
a13cd60c6f | ||
|
|
c970cb912c | ||
|
|
b5206ce33f | ||
|
|
4c7f216dfe | ||
|
|
530f7a21e8 | ||
|
|
7532dbcdf5 | ||
|
|
7ec6711f06 | ||
|
|
e149019c00 | ||
|
|
7bf2cbad32 | ||
|
|
6ff821c70d | ||
|
|
a43be95e83 | ||
|
|
f689164711 | ||
|
|
7976ec8bb1 | ||
|
|
9f3e3a4d7a | ||
|
|
57acbf5491 | ||
|
|
5820c0ffb7 | ||
|
|
ac3700ed1e | ||
|
|
b542e50680 | ||
|
|
818abca8f1 | ||
|
|
50c0d8c17d | ||
|
|
88e1b7d144 | ||
|
|
08495360b0 | ||
|
|
a12364ad37 | ||
|
|
e91d758831 | ||
|
|
3d63a79b91 | ||
|
|
e53ee763f9 | ||
|
|
ae1cc0fc4b | ||
|
|
e426434770 | ||
|
|
3e277020a5 | ||
|
|
ffa75c423d | ||
|
|
0bba630f55 | ||
|
|
2382053d32 | ||
|
|
69a647b0d2 | ||
|
|
f5dd2a71a6 | ||
|
|
4b98e436ef | ||
|
|
4e8d6b80e0 | ||
|
|
d120197676 | ||
|
|
4cb3af1a36 | ||
|
|
0d92abfbf6 | ||
|
|
ff1a725a56 | ||
|
|
05ae1472e3 | ||
|
|
4fd3f6f991 | ||
|
|
6281549f31 | ||
|
|
9f4e86ac2f | ||
|
|
af49c5bdf6 | ||
|
|
a47a05dfd2 | ||
|
|
3d4008263f | ||
|
|
72ff05255f | ||
|
|
a99d606220 | ||
|
|
f8692a1d43 | ||
|
|
78b28a03b6 | ||
|
|
854f40acf2 | ||
|
|
6d059b28bf | ||
|
|
f26b94cfb6 | ||
|
|
937338abdf | ||
|
|
f2b04f2efe | ||
|
|
ff624c9125 | ||
|
|
13b1358c07 | ||
|
|
f7ff809c1e | ||
|
|
ab6e994bab | ||
|
|
d2f30e8d79 | ||
|
|
270552fde4 |
8
Makefile
8
Makefile
@@ -145,3 +145,11 @@ golangci-lint: install-golangci-lint
|
||||
|
||||
install-golangci-lint:
|
||||
which golangci-lint || GO111MODULE=off go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
|
||||
|
||||
docs-sync:
|
||||
cp app/vmagent/README.md docs/vmagent.md
|
||||
cp app/vmalert/README.md docs/vmalert.md
|
||||
cp app/vmauth/README.md docs/vmauth.md
|
||||
cp app/vmbackup/README.md docs/vmbackup.md
|
||||
cp app/vmrestore/README.md docs/vmrestore.md
|
||||
cp README.md docs/Single-server-VictoriaMetrics.md
|
||||
|
||||
40
README.md
40
README.md
@@ -21,6 +21,7 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
See our [Wiki](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki) for additional documentation.
|
||||
|
||||
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
|
||||
See [features available for enterprise customers](https://github.com/VictoriaMetrics/VictoriaMetrics/issues?q=is%3Aissue+label%3Aenterprise).
|
||||
|
||||
|
||||
## Case studies and talks
|
||||
@@ -149,9 +150,9 @@ The following command-line flags are used the most:
|
||||
|
||||
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in current working directory.
|
||||
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. Default period is 1 month.
|
||||
* `-httpListenAddr` - TCP address to listen to for http requests. By default, it listens port `8428` on all the network interfaces.
|
||||
|
||||
Other flags have good enough default values, so set them only if you really need this.
|
||||
VictoriaMetrics accepts [Prometheus querying API requests](#prometheus-querying-api-usage) on port `8428` by default.
|
||||
|
||||
Pass `-help` to see all the available flags with description and default values.
|
||||
|
||||
@@ -582,17 +583,18 @@ Run `make package-victoria-metrics`. It builds `victoriametrics/victoria-metrics
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-victoria-metrics`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image for improved debuggability.
|
||||
It is possible to build the package on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
|
||||
For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-victoria-metrics
|
||||
ROOT_IMAGE=scratch make package-victoria-metrics
|
||||
```
|
||||
|
||||
### Start with docker-compose
|
||||
|
||||
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
|
||||
helps to spin up VictoriaMetrics, Prometheus and Grafana with one command.
|
||||
helps to spin up VictoriaMetrics, [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Grafana with one command.
|
||||
More details may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#folder-contains-basic-images-and-tools-for-building-and-running-victoria-metrics-in-docker).
|
||||
|
||||
### Setting up service
|
||||
@@ -774,7 +776,13 @@ The required resources for query path:
|
||||
### High availability
|
||||
|
||||
1) Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
|
||||
2) Add addresses of these instances to `remote_write` section in Prometheus config:
|
||||
2) Pass addresses of these instances to [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) via `-remoteWrite.url` command-line flag:
|
||||
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
|
||||
```
|
||||
|
||||
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
|
||||
|
||||
```yml
|
||||
remote_write:
|
||||
@@ -834,6 +842,11 @@ Just start multiple VictoriaMetrics instances with distinct values for the follo
|
||||
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
|
||||
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
|
||||
|
||||
Then set up [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md) in front of VictoriaMetrics instances,
|
||||
so it could route requests from particular user to VictoriaMetrics with the desired retention.
|
||||
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
|
||||
|
||||
|
||||
### Downsampling
|
||||
|
||||
There is no downsampling support at the moment, but:
|
||||
@@ -981,6 +994,10 @@ The most interesting metrics are:
|
||||
VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` args on this page. By default `date` equals to the current date,
|
||||
while `topN` equals to 10.
|
||||
|
||||
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
|
||||
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
|
||||
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
|
||||
|
||||
|
||||
### Backfilling
|
||||
|
||||
@@ -1000,15 +1017,12 @@ for data with timestamps close to the current time.
|
||||
|
||||
### Replication
|
||||
|
||||
Single-node VictoriaMetrics relies on replicated durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs)
|
||||
or [Amazon EBS](https://aws.amazon.com/ebs/). It is also recommended making periodic backups,
|
||||
since [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
|
||||
See [backup docs](#backups) for details.
|
||||
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
|
||||
|
||||
Cluster version of VictoriaMetrics supports replication. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety)
|
||||
for details.
|
||||
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
|
||||
|
||||
See also [high availability docs](#high-availability) and [docs about cluster version of VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
|
||||
See also [high availability docs](#high-availability) and [backup docs](#backups).
|
||||
|
||||
|
||||
### Backups
|
||||
|
||||
@@ -170,6 +170,8 @@ Additionally it provides the following extra actions:
|
||||
|
||||
* `replace_all`: replaces all the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
|
||||
* `labelmap_all`: replaces all the occurences of `regex` in all the label names with the `replacement`.
|
||||
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
|
||||
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
|
||||
|
||||
The relabeling can be defined in the following places:
|
||||
|
||||
@@ -210,6 +212,14 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
|
||||
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
|
||||
|
||||
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports.
|
||||
Just add the following relabeling rule to `relabel_configs` section in order to filter out targets with unneeded ports:
|
||||
|
||||
```yml
|
||||
- action: keep_if_equal
|
||||
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
|
||||
```
|
||||
|
||||
|
||||
### How to build from sources
|
||||
|
||||
@@ -234,11 +244,11 @@ Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmagent
|
||||
ROOT_IMAGE=scratch make package-vmagent
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -54,6 +54,7 @@ var (
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
@@ -205,3 +206,15 @@ var (
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
|
||||
)
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
@@ -162,7 +162,7 @@ func getTLSConfig(argIdx int) (*tls.Config, error) {
|
||||
}
|
||||
cfg, err := promauth.NewConfig(".", nil, "", "", tlsConfig)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot populate TLS config: %s", err)
|
||||
return nil, fmt.Errorf("cannot populate TLS config: %w", err)
|
||||
}
|
||||
tlsCfg := cfg.NewTLSConfig()
|
||||
return tlsCfg, nil
|
||||
|
||||
@@ -33,7 +33,7 @@ func loadRelabelConfigs() (*relabelConfigs, error) {
|
||||
if *relabelConfigPathGlobal != "" {
|
||||
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
|
||||
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
rcs.global = global
|
||||
}
|
||||
@@ -45,7 +45,7 @@ func loadRelabelConfigs() (*relabelConfigs, error) {
|
||||
for i, path := range *relabelConfigPaths {
|
||||
prc, err := promrelabel.LoadRelabelConfigs(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", path, err)
|
||||
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
|
||||
}
|
||||
rcs.perURL[i] = prc
|
||||
}
|
||||
|
||||
@@ -55,11 +55,13 @@ test-vmalert:
|
||||
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
|
||||
go test -v -race -cover ./app/vmalert/datasource
|
||||
go test -v -race -cover ./app/vmalert/notifier
|
||||
go test -v -race -cover ./app/vmalert/config
|
||||
|
||||
run-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/testdata/rules0-good.rules \
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-notifier.url=http://localhost:9093 \
|
||||
-notifier.url=http://127.0.0.1:9093 \
|
||||
-remoteWrite.url=http://localhost:8428 \
|
||||
-remoteRead.url=http://localhost:8428 \
|
||||
-evaluationInterval=3s
|
||||
|
||||
@@ -1,19 +1,27 @@
|
||||
## VM Alert
|
||||
## vmalert
|
||||
|
||||
`vmalert` executes a list of given MetricsQL expressions (rules) and
|
||||
sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
|
||||
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
rules against configured address.
|
||||
|
||||
### Features:
|
||||
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
|
||||
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
expressions validation;
|
||||
support and expressions validation;
|
||||
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
|
||||
support;
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
|
||||
* Lightweight without extra dependencies.
|
||||
|
||||
### TODO:
|
||||
* Support recording rules.
|
||||
### Limitations:
|
||||
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
|
||||
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
|
||||
may fail;
|
||||
* by default, rules execution is sequential within one group, but persisting of execution results to remote
|
||||
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
|
||||
recording rule is reused in next one;
|
||||
* there is no `query` function support in templates yet;
|
||||
* `vmalert` has no UI, just an API for getting groups and rules statuses.
|
||||
|
||||
### QuickStart
|
||||
|
||||
@@ -26,10 +34,12 @@ make vmalert
|
||||
The build binary will be placed to `VictoriaMetrics/bin` folder.
|
||||
|
||||
To start using `vmalert` you will need the following things:
|
||||
* list of alert rules - PromQL/MetricsQL expressions to execute;
|
||||
* list of rules - PromQL/MetricsQL expressions to execute;
|
||||
* datasource address - reachable VictoriaMetrics instance for rules execution;
|
||||
* notifier address - reachable Alertmanager instance for processing,
|
||||
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
aggregating alerts and sending notifications.
|
||||
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
|
||||
compatible storage address for storing recording rules results and alerts state in for of timeseries. This is optional.
|
||||
|
||||
Then configure `vmalert` accordingly:
|
||||
```
|
||||
@@ -38,22 +48,106 @@ Then configure `vmalert` accordingly:
|
||||
-notifier.url=http://localhost:9093
|
||||
```
|
||||
|
||||
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata/rules0-good.rules)
|
||||
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
|
||||
similar to Prometheus rules and configured using YAML. Configuration examples may be found
|
||||
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
|
||||
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
|
||||
```yaml
|
||||
groups:
|
||||
[ - <rule_group> ]
|
||||
```
|
||||
|
||||
`vmalert` runs evaluation for every group in a separate goroutine.
|
||||
Rules in group evaluated one-by-one sequentially.
|
||||
#### Groups
|
||||
|
||||
`vmalert` also runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
Each group has following attributes:
|
||||
```yaml
|
||||
# The name of the group. Must be unique within a file.
|
||||
name: <string>
|
||||
|
||||
# How often rules in the group are evaluated.
|
||||
[ interval: <duration> | default = global.evaluation_interval ]
|
||||
|
||||
# How many rules execute at once. Increasing concurrency may speed
|
||||
# up round execution speed.
|
||||
[ concurrency: <integer> | default = 1 ]
|
||||
|
||||
rules:
|
||||
[ - <rule> ... ]
|
||||
```
|
||||
|
||||
#### Rules
|
||||
|
||||
There are two types of Rules:
|
||||
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
|
||||
Alerting rules allows to define alert conditions via [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
|
||||
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
|
||||
Recording rules allow you to precompute frequently needed or computationally expensive expressions
|
||||
and save their result as a new set of time series.
|
||||
|
||||
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
|
||||
within one group.
|
||||
|
||||
##### Alerting rules
|
||||
|
||||
The syntax for alerting rule is following:
|
||||
```yaml
|
||||
# The name of the alert. Must be a valid metric name.
|
||||
alert: <string>
|
||||
|
||||
# The MetricsQL expression to evaluate.
|
||||
expr: <string>
|
||||
|
||||
# Alerts are considered firing once they have been returned for this long.
|
||||
# Alerts which have not yet fired for long enough are considered pending.
|
||||
[ for: <duration> | default = 0s ]
|
||||
|
||||
# Labels to add or overwrite for each alert.
|
||||
labels:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
|
||||
# Annotations to add to each alert.
|
||||
annotations:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
```
|
||||
|
||||
`vmalert` has no local storage and alerts state is stored in process memory. Hence, after reloading of `vmalert` process
|
||||
alerts state will be lost. To avoid this situation, `vmalert` may be configured via following flags:
|
||||
* `-remoteWrite.url` - URL to Victoria Metrics or VMInsert. `vmalert` will persist alerts state into the configured
|
||||
address in form of timeseries with name `ALERTS` via remote-write protocol.
|
||||
* `-remoteRead.url` - URL to Victoria Metrics or VMSelect. `vmalert` will try to restore alerts state from configured
|
||||
address by querying `ALERTS` timeseries.
|
||||
|
||||
|
||||
##### Recording rules
|
||||
|
||||
The syntax for recording rules is following:
|
||||
```yaml
|
||||
# The name of the time series to output to. Must be a valid metric name.
|
||||
record: <string>
|
||||
|
||||
# The MetricsQL expression to evaluate.
|
||||
expr: <string>
|
||||
|
||||
# Labels to add or overwrite before storing the result.
|
||||
labels:
|
||||
[ <labelname>: <labelvalue> ]
|
||||
```
|
||||
|
||||
For recording rules to work `-remoteWrite.url` must specified.
|
||||
|
||||
|
||||
#### WEB
|
||||
|
||||
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
|
||||
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
|
||||
Used as alert source in AlertManager.
|
||||
* `http://<vmalert-addr>/metrics` - application metrics.
|
||||
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
|
||||
|
||||
`vmalert` may be configured with `-remoteWrite` flag to write alerts state in form of timeseries
|
||||
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
|
||||
may be used to recover alerts state on `vmalert` restarts if `-remoteRead` is configured.
|
||||
|
||||
|
||||
### Configuration
|
||||
|
||||
@@ -64,22 +158,36 @@ Usage of vmalert:
|
||||
Optional basic auth password for -datasource.url
|
||||
-datasource.basicAuth.username string
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used.
|
||||
-datasource.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url.
|
||||
-datasource.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
-datasource.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url.
|
||||
-datasource.tlsServerName value
|
||||
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used.
|
||||
-datasource.url string
|
||||
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules. Default 1m (default 1m0s)
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
Address to listen for http connections (default ":8880")
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-notifier.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used.
|
||||
-notifier.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -notifier.url.
|
||||
-notifier.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -notifier.url
|
||||
-notifier.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -notifier.url.
|
||||
-notifier.tlsServerName value
|
||||
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used.
|
||||
-notifier.url string
|
||||
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
|
||||
-remoteRead.basicAuth.password string
|
||||
@@ -88,14 +196,40 @@ Usage of vmalert:
|
||||
Optional basic auth username for -remoteRead.url
|
||||
-remoteRead.lookback duration
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
-remoteRead.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used.
|
||||
-remoteRead.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url.
|
||||
-remoteRead.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteRead.url
|
||||
-remoteRead.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url.
|
||||
-remoteRead.tlsServerName value
|
||||
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used.
|
||||
-remoteRead.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remoteWrite.basicAuth.password string
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.username string
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
-remoteWrite.maxQueueSize
|
||||
Defines the max number of pending datapoints to remote write endpoint
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of readers that concurrently write into remote storage (default 1)
|
||||
-remoteWrite.flushInterval duration
|
||||
Defines interval of flushes to remote write endpoint (default 5s)
|
||||
-remoteWrite.maxBatchSize int
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
-remoteWrite.maxQueueSize int
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
-remoteWrite.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used.
|
||||
-remoteWrite.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url.
|
||||
-remoteWrite.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url.
|
||||
-remoteWrite.tlsServerName value
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used.
|
||||
-remoteWrite.url string
|
||||
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
|
||||
-rule value
|
||||
@@ -105,8 +239,10 @@ Usage of vmalert:
|
||||
-rule /path/to/file. Path to a single file with alerting rules
|
||||
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
-rule.validateExpressions
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
-rule.validateTemplates
|
||||
Indicates to validate annotation and label templates (default true)
|
||||
Whether to validate annotation and label templates (default true)
|
||||
```
|
||||
|
||||
Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
|
||||
374
app/vmalert/alerting.go
Normal file
374
app/vmalert/alerting.go
Normal file
@@ -0,0 +1,374 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// AlertingRule is basic alert entity
|
||||
type AlertingRule struct {
|
||||
RuleID uint64
|
||||
Name string
|
||||
Expr string
|
||||
For time.Duration
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
GroupID uint64
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
func newAlertingRule(gID uint64, cfg config.Rule) *AlertingRule {
|
||||
return &AlertingRule{
|
||||
RuleID: cfg.ID,
|
||||
Name: cfg.Alert,
|
||||
Expr: cfg.Expr,
|
||||
For: cfg.For,
|
||||
Labels: cfg.Labels,
|
||||
Annotations: cfg.Annotations,
|
||||
GroupID: gID,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
}
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
func (ar *AlertingRule) String() string {
|
||||
return ar.Name
|
||||
}
|
||||
|
||||
// ID returns unique Rule ID
|
||||
// within the parent Group.
|
||||
func (ar *AlertingRule) ID() uint64 {
|
||||
return ar.RuleID
|
||||
}
|
||||
|
||||
// Exec executes AlertingRule expression via the given Querier.
|
||||
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
||||
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := q.Query(ctx, ar.Expr)
|
||||
ar.mu.Lock()
|
||||
defer ar.mu.Unlock()
|
||||
|
||||
ar.lastExecError = err
|
||||
ar.lastExecTime = time.Now()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||
}
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
if a.State == notifier.StateInactive {
|
||||
delete(ar.alerts, h)
|
||||
}
|
||||
}
|
||||
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
h := hash(m)
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
if a.Value != m.Value {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Value
|
||||
// and re-exec template since Value can be used
|
||||
// in templates
|
||||
err = ar.template(a)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := ar.newAlert(m, ar.lastExecTime)
|
||||
if err != nil {
|
||||
ar.lastExecError = err
|
||||
return nil, fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
ar.alerts[h] = a
|
||||
}
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// if alert wasn't updated in this iteration
|
||||
// means it is resolved already
|
||||
if _, ok := updated[h]; !ok {
|
||||
if a.State == notifier.StatePending {
|
||||
// alert was in Pending state - it is not
|
||||
// active anymore
|
||||
delete(ar.alerts, h)
|
||||
continue
|
||||
}
|
||||
a.State = notifier.StateInactive
|
||||
continue
|
||||
}
|
||||
if a.State == notifier.StatePending && time.Since(a.Start) >= ar.For {
|
||||
a.State = notifier.StateFiring
|
||||
alertsFired.Inc()
|
||||
}
|
||||
}
|
||||
if series {
|
||||
return ar.toTimeSeries(ar.lastExecTime), nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
ts := ar.alertToTimeSeries(a, timestamp)
|
||||
tss = append(tss, ts...)
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (ar *AlertingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
||||
}
|
||||
ar.Expr = nr.Expr
|
||||
ar.For = nr.For
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(m datasource.Metric) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := m.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifier.Alert, error) {
|
||||
a := ¬ifier.Alert{
|
||||
GroupID: ar.GroupID,
|
||||
Name: ar.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Value,
|
||||
Start: start,
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
a.Labels[l.Name] = l.Value
|
||||
}
|
||||
return a, ar.template(a)
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) template(a *notifier.Alert) error {
|
||||
// 1. template rule labels with data labels
|
||||
rLabels, err := a.ExecTemplate(ar.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 2. merge data labels and rule labels
|
||||
// metric labels may be overridden by
|
||||
// rule labels
|
||||
for k, v := range rLabels {
|
||||
a.Labels[k] = v
|
||||
}
|
||||
|
||||
// 3. template merged labels
|
||||
a.Labels, err = a.ExecTemplate(a.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
a.Annotations, err = a.ExecTemplate(ar.Annotations)
|
||||
return err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIAlertingRule
|
||||
func (ar *AlertingRule) RuleAPI() APIAlertingRule {
|
||||
var lastErr string
|
||||
if ar.lastExecError != nil {
|
||||
lastErr = ar.lastExecError.Error()
|
||||
}
|
||||
return APIAlertingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
Name: ar.Name,
|
||||
Expression: ar.Expr,
|
||||
For: ar.For.String(),
|
||||
LastError: lastErr,
|
||||
LastExec: ar.lastExecTime,
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
}
|
||||
}
|
||||
|
||||
// AlertsAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.mu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.mu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
return &APIAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: ar.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.Start,
|
||||
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
// AlertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
// AlertForStateMetricName is the metric name for 'for' state of alert.
|
||||
alertForStateMetricName = "ALERTS_FOR_STATE"
|
||||
|
||||
// AlertNameLabel is the label name indicating the name of an alert.
|
||||
alertNameLabel = "alertname"
|
||||
// AlertStateLabel is the label name indicating the state of an alert.
|
||||
alertStateLabel = "alertstate"
|
||||
)
|
||||
|
||||
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(ar.Name, a, timestamp))
|
||||
if ar.For > 0 {
|
||||
tss = append(tss, alertForToTimeSeries(ar.Name, a, timestamp))
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertMetricName
|
||||
labels[alertNameLabel] = name
|
||||
labels[alertStateLabel] = a.State.String()
|
||||
return newTimeSeries(1, labels, timestamp)
|
||||
}
|
||||
|
||||
// alertForToTimeSeries returns a timeseries that represents
|
||||
// state of active alerts, where value is time when alert become active
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
labels[alertNameLabel] = name
|
||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||
// to be updated on next Exec, as well as Value field.
|
||||
// Only rules with For > 0 will be restored.
|
||||
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
if q == nil {
|
||||
return fmt.Errorf("querier is nil")
|
||||
}
|
||||
// Get the last datapoint in range via MetricsQL `last_over_time`.
|
||||
// We don't use plain PromQL since Prometheus doesn't support
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
|
||||
alertForStateMetricName, ar.Name, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
labels := m.Labels
|
||||
m.Labels = make([]datasource.Label, 0)
|
||||
// drop all extra labels, so hash key will
|
||||
// be identical to timeseries received in Exec
|
||||
for _, l := range labels {
|
||||
if l.Name == alertNameLabel {
|
||||
continue
|
||||
}
|
||||
// drop all overridden labels
|
||||
if _, ok := ar.Labels[l.Name]; ok {
|
||||
continue
|
||||
}
|
||||
m.Labels = append(m.Labels, l)
|
||||
}
|
||||
|
||||
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
a.ID = hash(m)
|
||||
a.State = notifier.StatePending
|
||||
ar.alerts[a.ID] = a
|
||||
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -2,7 +2,6 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -11,30 +10,15 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestRule_Validate(t *testing.T) {
|
||||
if err := (&Rule{}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty name error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert"}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty expr error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert", Expr: "test{"}).Validate(); err == nil {
|
||||
t.Errorf("exptected invalid expr error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert", Expr: "test>0"}).Validate(); err != nil {
|
||||
t.Errorf("exptected valid rule got %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
alert *notifier.Alert
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
newTestRule("instant", 0),
|
||||
newTestAlertingRule("instant", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -45,7 +29,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("instant extra labels", 0),
|
||||
newTestAlertingRule("instant extra labels", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
||||
"job": "foo",
|
||||
"instance": "bar",
|
||||
@@ -61,7 +45,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("instant labels override", 0),
|
||||
newTestAlertingRule("instant labels override", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
||||
alertStateLabel: "foo",
|
||||
"__name__": "bar",
|
||||
@@ -75,7 +59,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for", time.Second),
|
||||
newTestAlertingRule("for", time.Second),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -90,7 +74,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for pending", 10*time.Second),
|
||||
newTestAlertingRule("for pending", 10*time.Second),
|
||||
¬ifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -107,58 +91,29 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
tss := tc.rule.AlertToTimeSeries(tc.alert, timestamp)
|
||||
if len(tc.expTS) != len(tss) {
|
||||
t.Fatalf("expected number of timeseries %d; got %d", len(tc.expTS), len(tss))
|
||||
}
|
||||
for i := range tc.expTS {
|
||||
expTS, gotTS := tc.expTS[i], tss[i]
|
||||
if len(expTS.Samples) != len(gotTS.Samples) {
|
||||
t.Fatalf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
|
||||
}
|
||||
for i, exp := range expTS.Samples {
|
||||
got := gotTS.Samples[i]
|
||||
if got.Value != exp.Value {
|
||||
t.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
|
||||
}
|
||||
if got.Timestamp != exp.Timestamp {
|
||||
t.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
|
||||
}
|
||||
}
|
||||
if len(expTS.Labels) != len(gotTS.Labels) {
|
||||
t.Fatalf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
|
||||
}
|
||||
for i, exp := range expTS.Labels {
|
||||
got := gotTS.Labels[i]
|
||||
if got.Name != exp.Name {
|
||||
t.Errorf("expected label name %q; got %q", exp.Name, got.Name)
|
||||
}
|
||||
if got.Value != exp.Value {
|
||||
t.Errorf("expected label value %q; got %q", exp.Value, got.Value)
|
||||
}
|
||||
}
|
||||
tc.rule.alerts[tc.alert.ID] = tc.alert
|
||||
tss := tc.rule.toTimeSeries(timestamp)
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func newTestRule(name string, waitFor time.Duration) *Rule {
|
||||
return &Rule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
|
||||
}
|
||||
|
||||
func TestRule_Exec(t *testing.T) {
|
||||
func TestAlertingRule_Exec(t *testing.T) {
|
||||
const defaultStep = 5 * time.Millisecond
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
steps [][]datasource.Metric
|
||||
expAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
{
|
||||
newTestRule("empty", 0),
|
||||
newTestAlertingRule("empty", 0),
|
||||
[][]datasource.Metric{},
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestRule("empty labels", 0),
|
||||
newTestAlertingRule("empty labels", 0),
|
||||
[][]datasource.Metric{
|
||||
{datasource.Metric{}},
|
||||
},
|
||||
@@ -167,7 +122,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing", 0),
|
||||
newTestAlertingRule("single-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
@@ -176,7 +131,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive", 0),
|
||||
newTestAlertingRule("single-firing=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -186,7 +141,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -197,7 +152,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -209,7 +164,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -220,7 +175,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -234,7 +189,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("multiple-firing", 0),
|
||||
newTestAlertingRule("multiple-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{
|
||||
metricWithLabels(t, "name", "foo"),
|
||||
@@ -249,7 +204,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("multiple-steps-firing", 0),
|
||||
newTestAlertingRule("multiple-steps-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo1")},
|
||||
@@ -264,7 +219,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("duplicate", 0),
|
||||
newTestAlertingRule("duplicate", 0),
|
||||
[][]datasource.Metric{
|
||||
{
|
||||
// metrics with the same labelset should result in one alert
|
||||
@@ -277,7 +232,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending", time.Minute),
|
||||
newTestAlertingRule("for-pending", time.Minute),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
@@ -286,7 +241,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-fired", time.Millisecond),
|
||||
newTestAlertingRule("for-fired", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -296,7 +251,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>empty", time.Second),
|
||||
newTestAlertingRule("for-pending=>empty", time.Second),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -306,7 +261,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>firing=>inactive", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -318,7 +273,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive=>pending", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>firing=>inactive=>pending", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -331,7 +286,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive=>pending=>firing", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -349,15 +304,15 @@ func TestRule_Exec(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.group = fakeGroup
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
for _, step := range tc.steps {
|
||||
fq.reset()
|
||||
fq.add(step...)
|
||||
if err := tc.rule.Exec(context.TODO(), fq); err != nil {
|
||||
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
// artificial delay between applying steps
|
||||
time.Sleep(time.Millisecond)
|
||||
time.Sleep(defaultStep)
|
||||
}
|
||||
if len(tc.rule.alerts) != len(tc.expAlerts) {
|
||||
t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
|
||||
@@ -375,49 +330,9 @@ func TestRule_Exec(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
type fakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []datasource.Metric
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) reset() {
|
||||
fq.Lock()
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
|
||||
fq.Lock()
|
||||
cpy := make([]datasource.Metric, len(fq.metrics))
|
||||
copy(cpy, fq.metrics)
|
||||
fq.Unlock()
|
||||
return cpy, nil
|
||||
}
|
||||
|
||||
func TestRule_Restore(t *testing.T) {
|
||||
func TestAlertingRule_Restore(t *testing.T) {
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
metrics []datasource.Metric
|
||||
expAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
@@ -502,7 +417,7 @@ func TestRule_Restore(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.group = fakeGroup
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
fq.add(tc.metrics...)
|
||||
if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
@@ -526,8 +441,8 @@ func TestRule_Restore(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func newTestRuleWithLabels(name string, labels ...string) *Rule {
|
||||
r := newTestRule(name, 0)
|
||||
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
|
||||
r := newTestAlertingRule(name, 0)
|
||||
r.Labels = make(map[string]string)
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
r.Labels[labels[i]] = labels[i+1]
|
||||
@@ -535,9 +450,6 @@ func newTestRuleWithLabels(name string, labels ...string) *Rule {
|
||||
return r
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Value = value
|
||||
return m
|
||||
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
||||
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
|
||||
}
|
||||
@@ -1,74 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gopkg.in/yaml.v2"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
// Parse parses rule configs from given file patterns
|
||||
func Parse(pathPatterns []string, validateAnnotations bool) ([]Group, error) {
|
||||
var fp []string
|
||||
for _, pattern := range pathPatterns {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading file patther %s:%v", pattern, err)
|
||||
}
|
||||
fp = append(fp, matches...)
|
||||
}
|
||||
var groups []Group
|
||||
for _, file := range fp {
|
||||
groupsNames := map[string]struct{}{}
|
||||
gr, err := parseFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("file %s: %w", file, err)
|
||||
}
|
||||
for _, g := range gr {
|
||||
if _, ok := groupsNames[g.Name]; ok {
|
||||
return nil, fmt.Errorf("one file can not contain groups with the same name %s, filepath:%s", g.Name, file)
|
||||
}
|
||||
g.File = file
|
||||
g.doneCh = make(chan struct{})
|
||||
g.finishedCh = make(chan struct{})
|
||||
g.updateCh = make(chan Group)
|
||||
|
||||
groupsNames[g.Name] = struct{}{}
|
||||
for _, rule := range g.Rules {
|
||||
if err = rule.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("invalid rule filepath: %s, group %s: %w", file, g.Name, err)
|
||||
}
|
||||
if validateAnnotations {
|
||||
if err = notifier.ValidateTemplates(rule.Annotations); err != nil {
|
||||
return nil, fmt.Errorf("invalid annotations filepath: %s, group %s: %w", file, g.Name, err)
|
||||
}
|
||||
if err = notifier.ValidateTemplates(rule.Labels); err != nil {
|
||||
return nil, fmt.Errorf("invalid labels filepath: %s, group %s: %w", file, g.Name, err)
|
||||
}
|
||||
}
|
||||
rule.group = g
|
||||
rule.alerts = make(map[uint64]*notifier.Alert)
|
||||
}
|
||||
groups = append(groups, g)
|
||||
}
|
||||
}
|
||||
if len(groups) < 1 {
|
||||
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
|
||||
}
|
||||
return groups, nil
|
||||
}
|
||||
|
||||
func parseFile(path string) ([]Group, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading alert rule file: %w", err)
|
||||
}
|
||||
g := struct {
|
||||
Groups []Group `yaml:"groups"`
|
||||
}{}
|
||||
err = yaml.Unmarshal(data, &g)
|
||||
return g.Groups, err
|
||||
}
|
||||
195
app/vmalert/config/config.go
Normal file
195
app/vmalert/config/config.go
Normal file
@@ -0,0 +1,195 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
// Group contains list of Rules grouped into
|
||||
// entity with one name and evaluation interval
|
||||
type Group struct {
|
||||
File string
|
||||
Name string `yaml:"name"`
|
||||
Interval time.Duration `yaml:"interval,omitempty"`
|
||||
Rules []Rule `yaml:"rules"`
|
||||
Concurrency int `yaml:"concurrency"`
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// Validate check for internal Group or Rule configuration errors
|
||||
func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
|
||||
if g.Name == "" {
|
||||
return fmt.Errorf("group name must be set")
|
||||
}
|
||||
if len(g.Rules) == 0 {
|
||||
return fmt.Errorf("group %q can't contain no rules", g.Name)
|
||||
}
|
||||
uniqueRules := map[uint64]struct{}{}
|
||||
for _, r := range g.Rules {
|
||||
ruleName := r.Record
|
||||
if r.Alert != "" {
|
||||
ruleName = r.Alert
|
||||
}
|
||||
if _, ok := uniqueRules[r.ID]; ok {
|
||||
return fmt.Errorf("rule %q duplicate", ruleName)
|
||||
}
|
||||
uniqueRules[r.ID] = struct{}{}
|
||||
if err := r.Validate(); err != nil {
|
||||
return fmt.Errorf("invalid rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
if validateExpressions {
|
||||
if _, err := metricsql.Parse(r.Expr); err != nil {
|
||||
return fmt.Errorf("invalid expression for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
}
|
||||
if validateAnnotations {
|
||||
if err := notifier.ValidateTemplates(r.Annotations); err != nil {
|
||||
return fmt.Errorf("invalid annotations for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
if err := notifier.ValidateTemplates(r.Labels); err != nil {
|
||||
return fmt.Errorf("invalid labels for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return checkOverflow(g.XXX, fmt.Sprintf("group %q", g.Name))
|
||||
}
|
||||
|
||||
// Rule describes entity that represent either
|
||||
// recording rule or alerting rule.
|
||||
type Rule struct {
|
||||
ID uint64
|
||||
Record string `yaml:"record,omitempty"`
|
||||
Alert string `yaml:"alert,omitempty"`
|
||||
Expr string `yaml:"expr"`
|
||||
For time.Duration `yaml:"for,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
type rule Rule
|
||||
if err := unmarshal((*rule)(r)); err != nil {
|
||||
return err
|
||||
}
|
||||
r.ID = HashRule(*r)
|
||||
return nil
|
||||
}
|
||||
|
||||
// HashRule hashes significant Rule fields into
|
||||
// unique hash value
|
||||
func HashRule(r Rule) uint64 {
|
||||
h := fnv.New64a()
|
||||
h.Write([]byte(r.Expr))
|
||||
if r.Record != "" {
|
||||
h.Write([]byte("recording"))
|
||||
h.Write([]byte(r.Record))
|
||||
} else {
|
||||
h.Write([]byte("alerting"))
|
||||
h.Write([]byte(r.Alert))
|
||||
}
|
||||
type item struct {
|
||||
key, value string
|
||||
}
|
||||
var kv []item
|
||||
for k, v := range r.Labels {
|
||||
kv = append(kv, item{key: k, value: v})
|
||||
}
|
||||
sort.Slice(kv, func(i, j int) bool {
|
||||
return kv[i].key < kv[j].key
|
||||
})
|
||||
for _, i := range kv {
|
||||
h.Write([]byte(i.key))
|
||||
h.Write([]byte(i.value))
|
||||
h.Write([]byte("\xff"))
|
||||
}
|
||||
return h.Sum64()
|
||||
}
|
||||
|
||||
// Validate check for Rule configuration errors
|
||||
func (r *Rule) Validate() error {
|
||||
if (r.Record == "" && r.Alert == "") || (r.Record != "" && r.Alert != "") {
|
||||
return fmt.Errorf("either `record` or `alert` must be set")
|
||||
}
|
||||
if r.Expr == "" {
|
||||
return fmt.Errorf("expression can't be empty")
|
||||
}
|
||||
return checkOverflow(r.XXX, "rule")
|
||||
}
|
||||
|
||||
// Parse parses rule configs from given file patterns
|
||||
func Parse(pathPatterns []string, validateAnnotations, validateExpressions bool) ([]Group, error) {
|
||||
var fp []string
|
||||
for _, pattern := range pathPatterns {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading file pattern %s: %w", pattern, err)
|
||||
}
|
||||
fp = append(fp, matches...)
|
||||
}
|
||||
var groups []Group
|
||||
for _, file := range fp {
|
||||
uniqueGroups := map[string]struct{}{}
|
||||
gr, err := parseFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse file %q: %w", file, err)
|
||||
}
|
||||
for _, g := range gr {
|
||||
if err := g.Validate(validateAnnotations, validateExpressions); err != nil {
|
||||
return nil, fmt.Errorf("invalid group %q in file %q: %w", g.Name, file, err)
|
||||
}
|
||||
if _, ok := uniqueGroups[g.Name]; ok {
|
||||
return nil, fmt.Errorf("group name %q duplicate in file %q", g.Name, file)
|
||||
}
|
||||
uniqueGroups[g.Name] = struct{}{}
|
||||
g.File = file
|
||||
groups = append(groups, g)
|
||||
}
|
||||
}
|
||||
if len(groups) < 1 {
|
||||
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
|
||||
}
|
||||
return groups, nil
|
||||
}
|
||||
|
||||
func parseFile(path string) ([]Group, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading alert rule file: %w", err)
|
||||
}
|
||||
g := struct {
|
||||
Groups []Group `yaml:"groups"`
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}{}
|
||||
err = yaml.Unmarshal(data, &g)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return g.Groups, checkOverflow(g.XXX, "config")
|
||||
}
|
||||
|
||||
func checkOverflow(m map[string]interface{}, ctx string) error {
|
||||
if len(m) > 0 {
|
||||
var keys []string
|
||||
for k := range m {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
return fmt.Errorf("unknown fields in %s: %s", ctx, strings.Join(keys, ", "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
326
app/vmalert/config/config_test.go
Normal file
326
app/vmalert/config/config_test.go
Normal file
@@ -0,0 +1,326 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestParseGood(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true, true); err != nil {
|
||||
t.Errorf("error parsing files %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBad(t *testing.T) {
|
||||
testCases := []struct {
|
||||
path []string
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
[]string{"testdata/rules0-bad.rules"},
|
||||
"unexpected token",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules0-bad.rules"},
|
||||
"error parsing annotation",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules1-bad.rules"},
|
||||
"duplicate in file",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules2-bad.rules"},
|
||||
"function \"value\" not defined",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules3-bad.rules"},
|
||||
"either `record` or `alert` must be set",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules4-bad.rules"},
|
||||
"either `record` or `alert` must be set",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/*.yaml"},
|
||||
"no groups found",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
_, err := Parse(tc.path, true, true)
|
||||
if err == nil {
|
||||
t.Errorf("expected to get error")
|
||||
return
|
||||
}
|
||||
if !strings.Contains(err.Error(), tc.expErr) {
|
||||
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRule_Validate(t *testing.T) {
|
||||
if err := (&Rule{}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty name error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert"}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty expr error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert", Expr: "test>0"}).Validate(); err != nil {
|
||||
t.Errorf("exptected valid rule; got %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroup_Validate(t *testing.T) {
|
||||
testCases := []struct {
|
||||
group *Group
|
||||
rules []Rule
|
||||
validateAnnotations bool
|
||||
validateExpressions bool
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
group: &Group{},
|
||||
expErr: "group name must be set",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test"},
|
||||
expErr: "contain no rules",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Record: "record",
|
||||
Expr: "up | 0",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Record: "record",
|
||||
Expr: "up | 0",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "invalid expression",
|
||||
validateExpressions: true,
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "error parsing annotation",
|
||||
validateAnnotations: true,
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
},
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Record: "record", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Record: "record", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"description": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Record: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
err := tc.group.Validate(tc.validateAnnotations, tc.validateExpressions)
|
||||
if err == nil {
|
||||
if tc.expErr != "" {
|
||||
t.Errorf("expected to get err %q; got nil insted", tc.expErr)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if !strings.Contains(err.Error(), tc.expErr) {
|
||||
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHashRule(t *testing.T) {
|
||||
testCases := []struct {
|
||||
a, b Rule
|
||||
equal bool
|
||||
}{
|
||||
{
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
"foo": "bar",
|
||||
}},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", For: time.Minute},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 2"},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
"foo": "baz",
|
||||
}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
false,
|
||||
},
|
||||
}
|
||||
for i, tc := range testCases {
|
||||
aID, bID := HashRule(tc.a), HashRule(tc.b)
|
||||
if tc.equal != (aID == bID) {
|
||||
t.Fatalf("missmatch for rule %d", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -9,5 +9,3 @@ groups:
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
|
||||
|
||||
5
app/vmalert/config/testdata/dir/rules3-bad.rules
vendored
Normal file
5
app/vmalert/config/testdata/dir/rules3-bad.rules
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- for: 5m
|
||||
expr: vm_rows > 0
|
||||
7
app/vmalert/config/testdata/dir/rules4-bad.rules
vendored
Normal file
7
app/vmalert/config/testdata/dir/rules4-bad.rules
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: rows
|
||||
record: record
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
7
app/vmalert/config/testdata/dir/rules5-bad.rules
vendored
Normal file
7
app/vmalert/config/testdata/dir/rules5-bad.rules
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: rows
|
||||
expr: vm_rows > 0
|
||||
- record: rows
|
||||
expr: sum(vm_rows)
|
||||
1727
app/vmalert/config/testdata/kube-good.rules
vendored
Normal file
1727
app/vmalert/config/testdata/kube-good.rules
vendored
Normal file
File diff suppressed because it is too large
Load Diff
35
app/vmalert/config/testdata/rules2-good.rules
vendored
Normal file
35
app/vmalert/config/testdata/rules2-good.rules
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
groups:
|
||||
- name: TestGroup
|
||||
interval: 2s
|
||||
concurrency: 2
|
||||
rules:
|
||||
- alert: Conns
|
||||
expr: sum(vm_tcplistener_conns) by(instance) > 1
|
||||
for: 3m
|
||||
annotations:
|
||||
summary: "Too high connection number for {{$labels.instance}}"
|
||||
description: "It is {{ $value }} connections for {{$labels.instance}}"
|
||||
- alert: ExampleAlertAlwaysFiring
|
||||
expr: sum by(job)
|
||||
(up == 1)
|
||||
- record: handler:requests:rate5m
|
||||
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
|
||||
labels:
|
||||
recording: true
|
||||
- record: code:requests:rate5m
|
||||
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
|
||||
labels:
|
||||
env: dev
|
||||
recording: true
|
||||
- record: code:requests:rate5m
|
||||
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
|
||||
labels:
|
||||
env: staging
|
||||
recording: true
|
||||
- record: successful_requests:ratio_rate5m
|
||||
labels:
|
||||
recording: true
|
||||
expr: |2
|
||||
sum(code:requests:rate5m{code="200"})
|
||||
/
|
||||
sum(code:requests:rate5m)
|
||||
@@ -1,39 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestParseGood(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true); err != nil {
|
||||
t.Errorf("error parsing files %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBad(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/rules0-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected syntaxt error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules0-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected template annotation error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules1-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected same group error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules2-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected template label error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/*.yaml"}, true); err == nil {
|
||||
t.Errorf("expected empty group")
|
||||
}
|
||||
}
|
||||
38
app/vmalert/datasource/init.go
Normal file
38
app/vmalert/datasource/init.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package datasource
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
|
||||
tlsCertFile = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
|
||||
tlsKeyFile = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
|
||||
tlsCAFile = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
|
||||
"By default the server name from -datasource.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Querier from provided flag values.
|
||||
func Init() (Querier, error) {
|
||||
if *addr == "" {
|
||||
flag.PrintDefaults()
|
||||
return nil, fmt.Errorf("datasource.url is empty")
|
||||
}
|
||||
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &http.Client{Transport: tr}
|
||||
return NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, c), nil
|
||||
}
|
||||
@@ -32,7 +32,7 @@ func (r response) metrics() ([]Metric, error) {
|
||||
for i, res := range r.Data.Result {
|
||||
f, err = strconv.ParseFloat(res.TV[1].(string), 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %s", res, res.TV[1], err)
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
|
||||
}
|
||||
m.Labels = nil
|
||||
for k, v := range r.Data.Result[i].Labels {
|
||||
@@ -49,9 +49,10 @@ const queryPath = "/api/v1/query?query="
|
||||
|
||||
// VMStorage represents vmstorage entity with ability to read and write metrics
|
||||
type VMStorage struct {
|
||||
c *http.Client
|
||||
queryURL string
|
||||
basicAuthUser, basicAuthPass string
|
||||
c *http.Client
|
||||
queryURL string
|
||||
basicAuthUser string
|
||||
basicAuthPass string
|
||||
}
|
||||
|
||||
// NewVMStorage is a constructor for VMStorage
|
||||
@@ -79,25 +80,25 @@ func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
|
||||
}
|
||||
resp, err := s.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting response from %s:%s", req.URL, err)
|
||||
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := ioutil.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("datasource returns unxeprected response code %d for %s with err %s. Reponse body %s", resp.StatusCode, req.URL, err, body)
|
||||
return nil, fmt.Errorf("datasource returns unexpected response code %d for %s with err %w. Reponse body %s", resp.StatusCode, req.URL, err, body)
|
||||
}
|
||||
r := &response{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return nil, fmt.Errorf("error parsing metrics for %s:%s", req.URL, err)
|
||||
return nil, fmt.Errorf("error parsing metrics for %s: %w", req.URL, err)
|
||||
}
|
||||
if r.Status == statusError {
|
||||
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
|
||||
}
|
||||
if r.Status != statusSuccess {
|
||||
return nil, fmt.Errorf("unkown status:%s, Expected success or error ", r.Status)
|
||||
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
|
||||
}
|
||||
if r.Data.ResultType != rtVector {
|
||||
return nil, fmt.Errorf("unkown restul type:%s. Expected vector", r.Data.ResultType)
|
||||
return nil, fmt.Errorf("unknown restul type:%s. Expected vector", r.Data.ResultType)
|
||||
}
|
||||
return r.metrics()
|
||||
}
|
||||
|
||||
@@ -4,26 +4,63 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
type Group struct {
|
||||
Name string
|
||||
File string
|
||||
Rules []*Rule
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Interval time.Duration
|
||||
Concurrency int
|
||||
|
||||
doneCh chan struct{}
|
||||
finishedCh chan struct{}
|
||||
// channel accepts new Group obj
|
||||
// which supposed to update current group
|
||||
updateCh chan Group
|
||||
updateCh chan *Group
|
||||
}
|
||||
|
||||
func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
|
||||
g := &Group{
|
||||
Name: cfg.Name,
|
||||
File: cfg.File,
|
||||
Interval: cfg.Interval,
|
||||
Concurrency: cfg.Concurrency,
|
||||
doneCh: make(chan struct{}),
|
||||
finishedCh: make(chan struct{}),
|
||||
updateCh: make(chan *Group),
|
||||
}
|
||||
if g.Interval == 0 {
|
||||
g.Interval = defaultInterval
|
||||
}
|
||||
if g.Concurrency < 1 {
|
||||
g.Concurrency = 1
|
||||
}
|
||||
rules := make([]Rule, len(cfg.Rules))
|
||||
for i, r := range cfg.Rules {
|
||||
rules[i] = g.newRule(r)
|
||||
}
|
||||
g.Rules = rules
|
||||
return g
|
||||
}
|
||||
|
||||
func (g *Group) newRule(rule config.Rule) Rule {
|
||||
if rule.Alert != "" {
|
||||
return newAlertingRule(g.ID(), rule)
|
||||
}
|
||||
return newRecordingRule(g.ID(), rule)
|
||||
}
|
||||
|
||||
// ID return unique group ID that consists of
|
||||
@@ -36,48 +73,49 @@ func (g *Group) ID() uint64 {
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
// Restore restores alerts state for all group rules with For > 0
|
||||
// Restore restores alerts state for group rules
|
||||
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
for _, rule := range g.Rules {
|
||||
if rule.For == 0 {
|
||||
return nil
|
||||
rr, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if err := rule.Restore(ctx, q, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %s", rule.Name, err)
|
||||
if rr.For < 1 {
|
||||
continue
|
||||
}
|
||||
if err := rr.Restore(ctx, q, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateWith updates existing group with
|
||||
// passed group object.
|
||||
// passed group object. This function ignores group
|
||||
// evaluation interval change. It supposed to be updated
|
||||
// in group.start function.
|
||||
// Not thread-safe.
|
||||
func (g *Group) updateWith(newGroup Group) {
|
||||
rulesRegistry := make(map[string]*Rule)
|
||||
func (g *Group) updateWith(newGroup *Group) error {
|
||||
rulesRegistry := make(map[uint64]Rule)
|
||||
for _, nr := range newGroup.Rules {
|
||||
rulesRegistry[nr.id()] = nr
|
||||
rulesRegistry[nr.ID()] = nr
|
||||
}
|
||||
|
||||
for i, or := range g.Rules {
|
||||
nr, ok := rulesRegistry[or.id()]
|
||||
nr, ok := rulesRegistry[or.ID()]
|
||||
if !ok {
|
||||
// old rule is not present in the new list
|
||||
// so we mark it for removing
|
||||
g.Rules[i] = nil
|
||||
continue
|
||||
}
|
||||
|
||||
// copy all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
or.For = nr.For
|
||||
or.Expr = nr.Expr
|
||||
or.Labels = nr.Labels
|
||||
or.Annotations = nr.Annotations
|
||||
delete(rulesRegistry, nr.id())
|
||||
if err := or.UpdateWith(nr); err != nil {
|
||||
return err
|
||||
}
|
||||
delete(rulesRegistry, nr.ID())
|
||||
}
|
||||
|
||||
var newRules []*Rule
|
||||
var newRules []Rule
|
||||
for _, r := range g.Rules {
|
||||
if r == nil {
|
||||
// skip nil rules
|
||||
@@ -89,7 +127,9 @@ func (g *Group) updateWith(newGroup Group) {
|
||||
for _, nr := range rulesRegistry {
|
||||
newRules = append(newRules, nr)
|
||||
}
|
||||
g.Concurrency = newGroup.Concurrency
|
||||
g.Rules = newRules
|
||||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
@@ -116,82 +156,145 @@ func (g *Group) close() {
|
||||
<-g.finishedCh
|
||||
}
|
||||
|
||||
func (g *Group) start(ctx context.Context, interval time.Duration,
|
||||
querier datasource.Querier, nr notifier.Notifier, rw *remotewrite.Client) {
|
||||
logger.Infof("group %q started", g.Name)
|
||||
t := time.NewTicker(interval)
|
||||
func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []notifier.Notifier, rw *remotewrite.Client) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
e := &executor{querier, nts, rw}
|
||||
t := time.NewTicker(g.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Infof("group %q: context cancelled", g.Name)
|
||||
close(g.finishedCh)
|
||||
return
|
||||
case <-g.doneCh:
|
||||
logger.Infof("group %q: received stop signal", g.Name)
|
||||
close(g.finishedCh)
|
||||
return
|
||||
case ng := <-g.updateCh:
|
||||
g.updateWith(ng)
|
||||
g.mu.Lock()
|
||||
err := g.updateWith(ng)
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: failed to update: %s", g.Name, err)
|
||||
g.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
if g.Interval != ng.Interval {
|
||||
g.Interval = ng.Interval
|
||||
t.Stop()
|
||||
t = time.NewTicker(g.Interval)
|
||||
}
|
||||
g.mu.Unlock()
|
||||
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
case <-t.C:
|
||||
iterationTotal.Inc()
|
||||
iterationStart := time.Now()
|
||||
for _, rule := range g.Rules {
|
||||
execTotal.Inc()
|
||||
|
||||
execStart := time.Now()
|
||||
err := rule.Exec(ctx, querier)
|
||||
execDuration.UpdateDuration(execStart)
|
||||
|
||||
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
execErrors.Inc()
|
||||
logger.Errorf("failed to execute rule %q.%q: %s", g.Name, rule.Name, err)
|
||||
continue
|
||||
}
|
||||
|
||||
var alertsToSend []notifier.Alert
|
||||
for _, a := range rule.alerts {
|
||||
switch a.State {
|
||||
case notifier.StateFiring:
|
||||
// set End to execStart + 3 intervals
|
||||
// so notifier can resolve it automatically if `vmalert`
|
||||
// won't be able to send resolve for some reason
|
||||
a.End = execStart.Add(3 * interval)
|
||||
alertsToSend = append(alertsToSend, *a)
|
||||
pushToRW(rw, rule, a, execStart)
|
||||
case notifier.StatePending:
|
||||
pushToRW(rw, rule, a, execStart)
|
||||
case notifier.StateInactive:
|
||||
// set End to execStart to notify
|
||||
// that it was just resolved
|
||||
a.End = execStart
|
||||
alertsToSend = append(alertsToSend, *a)
|
||||
}
|
||||
}
|
||||
if len(alertsToSend) == 0 {
|
||||
continue
|
||||
}
|
||||
alertsSent.Add(len(alertsToSend))
|
||||
if err := nr.Send(ctx, alertsToSend); err != nil {
|
||||
alertsSendErrors.Inc()
|
||||
logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
|
||||
logger.Errorf("group %q: %s", g.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
iterationDuration.UpdateDuration(iterationStart)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func pushToRW(rw *remotewrite.Client, rule *Rule, a *notifier.Alert, timestamp time.Time) {
|
||||
if rw == nil {
|
||||
return
|
||||
type executor struct {
|
||||
querier datasource.Querier
|
||||
notifiers []notifier.Notifier
|
||||
rw *remotewrite.Client
|
||||
}
|
||||
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
|
||||
res := make(chan error, len(rules))
|
||||
var returnSeries bool
|
||||
if e.rw != nil {
|
||||
returnSeries = true
|
||||
}
|
||||
tss := rule.AlertToTimeSeries(a, timestamp)
|
||||
remoteWriteSent.Add(len(tss))
|
||||
for _, ts := range tss {
|
||||
if err := rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
|
||||
|
||||
if concurrency == 1 {
|
||||
// fast path
|
||||
for _, rule := range rules {
|
||||
res <- e.exec(ctx, rule, returnSeries, interval)
|
||||
}
|
||||
close(res)
|
||||
return res
|
||||
}
|
||||
|
||||
sem := make(chan struct{}, concurrency)
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
for _, rule := range rules {
|
||||
sem <- struct{}{}
|
||||
wg.Add(1)
|
||||
go func(r Rule) {
|
||||
res <- e.exec(ctx, r, returnSeries, interval)
|
||||
<-sem
|
||||
wg.Done()
|
||||
}(rule)
|
||||
}
|
||||
wg.Wait()
|
||||
close(res)
|
||||
}()
|
||||
return res
|
||||
}
|
||||
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, interval time.Duration) error {
|
||||
execTotal.Inc()
|
||||
execStart := time.Now()
|
||||
defer func() {
|
||||
execDuration.UpdateDuration(execStart)
|
||||
}()
|
||||
|
||||
tss, err := rule.Exec(ctx, e.querier, returnSeries)
|
||||
if err != nil {
|
||||
execErrors.Inc()
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
|
||||
}
|
||||
|
||||
if len(tss) > 0 && e.rw != nil {
|
||||
remoteWriteSent.Add(len(tss))
|
||||
for _, ts := range tss {
|
||||
if err := e.rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
return fmt.Errorf("rule %q: remote write failure: %w", rule, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
var alerts []notifier.Alert
|
||||
for _, a := range ar.alerts {
|
||||
switch a.State {
|
||||
case notifier.StateFiring:
|
||||
// set End to execStart + 3 intervals
|
||||
// so notifier can resolve it automatically if `vmalert`
|
||||
// won't be able to send resolve for some reason
|
||||
a.End = time.Now().Add(3 * interval)
|
||||
alerts = append(alerts, *a)
|
||||
case notifier.StateInactive:
|
||||
// set End to execStart to notify
|
||||
// that it was just resolved
|
||||
a.End = time.Now()
|
||||
alerts = append(alerts, *a)
|
||||
}
|
||||
}
|
||||
if len(alerts) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
alertsSent.Add(len(alerts))
|
||||
errGr := new(utils.ErrGroup)
|
||||
for _, nt := range e.notifiers {
|
||||
if err := nt.Send(ctx, alerts); err != nil {
|
||||
alertsSendErrors.Inc()
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
|
||||
}
|
||||
}
|
||||
return errGr.Err()
|
||||
}
|
||||
|
||||
@@ -2,33 +2,31 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestUpdateWith(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
currentRules []*Rule
|
||||
// rules must be sorted by name
|
||||
newRules []*Rule
|
||||
currentRules []config.Rule
|
||||
newRules []config.Rule
|
||||
}{
|
||||
{
|
||||
"new rule",
|
||||
[]*Rule{},
|
||||
[]*Rule{{Name: "bar"}},
|
||||
nil,
|
||||
[]config.Rule{{Alert: "bar"}},
|
||||
},
|
||||
{
|
||||
"update rule",
|
||||
[]*Rule{{
|
||||
Name: "foo",
|
||||
Expr: "up > 0",
|
||||
For: time.Second,
|
||||
"update alerting rule",
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 0",
|
||||
For: time.Second,
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
@@ -37,10 +35,10 @@ func TestUpdateWith(t *testing.T) {
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}},
|
||||
[]*Rule{{
|
||||
Name: "bar",
|
||||
Expr: "up > 10",
|
||||
For: time.Second,
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 10",
|
||||
For: time.Second,
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
@@ -49,56 +47,96 @@ func TestUpdateWith(t *testing.T) {
|
||||
},
|
||||
}},
|
||||
},
|
||||
{
|
||||
"update recording rule",
|
||||
[]config.Rule{{
|
||||
Record: "foo",
|
||||
Expr: "max(up)",
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
}},
|
||||
[]config.Rule{{
|
||||
Record: "foo",
|
||||
Expr: "min(up)",
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
}},
|
||||
},
|
||||
{
|
||||
"empty rule",
|
||||
[]*Rule{{Name: "foo"}},
|
||||
[]*Rule{},
|
||||
[]config.Rule{{Alert: "foo"}, {Record: "bar"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"multiple rules",
|
||||
[]*Rule{{Name: "bar"}, {Name: "baz"}, {Name: "foo"}},
|
||||
[]*Rule{{Name: "baz"}, {Name: "foo"}},
|
||||
[]config.Rule{
|
||||
{Alert: "bar"},
|
||||
{Alert: "baz"},
|
||||
{Alert: "foo"},
|
||||
},
|
||||
[]config.Rule{
|
||||
{Alert: "baz"},
|
||||
{Record: "foo"},
|
||||
},
|
||||
},
|
||||
{
|
||||
"replace rule",
|
||||
[]*Rule{{Name: "foo1"}},
|
||||
[]*Rule{{Name: "foo2"}},
|
||||
[]config.Rule{{Alert: "foo1"}},
|
||||
[]config.Rule{{Alert: "foo2"}},
|
||||
},
|
||||
{
|
||||
"replace multiple rules",
|
||||
[]*Rule{{Name: "foo1"}, {Name: "foo2"}},
|
||||
[]*Rule{{Name: "foo3"}, {Name: "foo4"}},
|
||||
[]config.Rule{
|
||||
{Alert: "foo1"},
|
||||
{Record: "foo2"},
|
||||
{Alert: "foo3"},
|
||||
},
|
||||
[]config.Rule{
|
||||
{Alert: "foo3"},
|
||||
{Alert: "foo4"},
|
||||
{Record: "foo5"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
g := &Group{Rules: tc.currentRules}
|
||||
g.updateWith(Group{Rules: tc.newRules})
|
||||
g := &Group{Name: "test"}
|
||||
for _, r := range tc.currentRules {
|
||||
r.ID = config.HashRule(r)
|
||||
g.Rules = append(g.Rules, g.newRule(r))
|
||||
}
|
||||
|
||||
ng := &Group{Name: "test"}
|
||||
for _, r := range tc.newRules {
|
||||
r.ID = config.HashRule(r)
|
||||
ng.Rules = append(ng.Rules, ng.newRule(r))
|
||||
}
|
||||
|
||||
err := g.updateWith(ng)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(g.Rules) != len(tc.newRules) {
|
||||
t.Fatalf("expected to have %d rules; got: %d",
|
||||
len(g.Rules), len(tc.newRules))
|
||||
}
|
||||
sort.Slice(g.Rules, func(i, j int) bool {
|
||||
return g.Rules[i].Name < g.Rules[j].Name
|
||||
return g.Rules[i].ID() < g.Rules[j].ID()
|
||||
})
|
||||
sort.Slice(ng.Rules, func(i, j int) bool {
|
||||
return ng.Rules[i].ID() < ng.Rules[j].ID()
|
||||
})
|
||||
for i, r := range g.Rules {
|
||||
got, want := r, tc.newRules[i]
|
||||
if got.Name != want.Name {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.Name, got.Name)
|
||||
got, want := r, ng.Rules[i]
|
||||
if got.ID() != want.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want, got)
|
||||
}
|
||||
if got.Expr != want.Expr {
|
||||
t.Fatalf("expected to have expression %q; got %q", want.Expr, got.Expr)
|
||||
}
|
||||
if got.For != want.For {
|
||||
t.Fatalf("expected to have for %q; got %q", want.For, got.For)
|
||||
}
|
||||
if !reflect.DeepEqual(got.Annotations, want.Annotations) {
|
||||
t.Fatalf("expected to have annotations %#v; got %#v", want.Annotations, got.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(got.Labels, want.Labels) {
|
||||
t.Fatalf("expected to have labels %#v; got %#v", want.Labels, got.Labels)
|
||||
if err := compareRules(t, got, want); err != nil {
|
||||
t.Fatalf("comparsion error: %s", err)
|
||||
}
|
||||
}
|
||||
})
|
||||
@@ -107,11 +145,13 @@ func TestUpdateWith(t *testing.T) {
|
||||
|
||||
func TestGroupStart(t *testing.T) {
|
||||
// TODO: make parsing from string instead of file
|
||||
groups, err := Parse([]string{"testdata/rules1-good.rules"}, true)
|
||||
groups, err := config.Parse([]string{"config/testdata/rules1-good.rules"}, true, true)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
g := groups[0]
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], evalInterval)
|
||||
g.Concurrency = 2
|
||||
|
||||
fn := &fakeNotifier{}
|
||||
fs := &fakeQuerier{}
|
||||
@@ -120,27 +160,26 @@ func TestGroupStart(t *testing.T) {
|
||||
m1 := metricWithLabels(t, "instance", inst1, "job", job)
|
||||
m2 := metricWithLabels(t, "instance", inst2, "job", job)
|
||||
|
||||
r := g.Rules[0]
|
||||
alert1, err := r.newAlert(m1)
|
||||
r := g.Rules[0].(*AlertingRule)
|
||||
alert1, err := r.newAlert(m1, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
alert1.State = notifier.StateFiring
|
||||
alert1.ID = hash(m1)
|
||||
|
||||
alert2, err := r.newAlert(m2)
|
||||
alert2, err := r.newAlert(m2, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
alert2.State = notifier.StateFiring
|
||||
alert2.ID = hash(m2)
|
||||
|
||||
const evalInterval = time.Millisecond
|
||||
finished := make(chan struct{})
|
||||
fs.add(m1)
|
||||
fs.add(m2)
|
||||
go func() {
|
||||
g.start(context.Background(), evalInterval, fs, fn, nil)
|
||||
g.start(context.Background(), fs, []notifier.Notifier{fn}, nil)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
@@ -166,52 +205,3 @@ func TestGroupStart(t *testing.T) {
|
||||
g.close()
|
||||
<-finished
|
||||
}
|
||||
|
||||
func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
|
||||
t.Helper()
|
||||
if len(as) != len(bs) {
|
||||
t.Fatalf("expected to have length %d; got %d", len(as), len(bs))
|
||||
}
|
||||
sort.Slice(as, func(i, j int) bool {
|
||||
return as[i].ID < as[j].ID
|
||||
})
|
||||
sort.Slice(bs, func(i, j int) bool {
|
||||
return bs[i].ID < bs[j].ID
|
||||
})
|
||||
for i := range as {
|
||||
a, b := as[i], bs[i]
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected t have Name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.State != b.State {
|
||||
t.Fatalf("expected t have State %q; got %q", a.State, b.State)
|
||||
}
|
||||
if a.Value != b.Value {
|
||||
t.Fatalf("expected t have Value %f; got %f", a.Value, b.Value)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
|
||||
t.Fatalf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
t.Fatalf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type fakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []notifier.Alert
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
232
app/vmalert/helpers_test.go
Normal file
232
app/vmalert/helpers_test.go
Normal file
@@ -0,0 +1,232 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
type fakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []datasource.Metric
|
||||
err error
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) setErr(err error) {
|
||||
fq.Lock()
|
||||
fq.err = err
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) reset() {
|
||||
fq.Lock()
|
||||
fq.err = nil
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
return nil, fq.err
|
||||
}
|
||||
cp := make([]datasource.Metric, len(fq.metrics))
|
||||
copy(cp, fq.metrics)
|
||||
return cp, nil
|
||||
}
|
||||
|
||||
type fakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []notifier.Alert
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Value = value
|
||||
return m
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func compareGroups(t *testing.T, a, b *Group) {
|
||||
t.Helper()
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.File != b.File {
|
||||
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
|
||||
}
|
||||
if a.Interval != b.Interval {
|
||||
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
|
||||
}
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if a.ID() != b.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
|
||||
}
|
||||
if err := compareRules(t, want, got); err != nil {
|
||||
t.Fatalf("comparsion error: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func compareRules(t *testing.T, a, b Rule) error {
|
||||
t.Helper()
|
||||
switch v := a.(type) {
|
||||
case *AlertingRule:
|
||||
br, ok := b.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("rule %q supposed to be of type AlertingRule", b.ID())
|
||||
}
|
||||
return compareAlertingRules(t, v, br)
|
||||
case *RecordingRule:
|
||||
br, ok := b.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("rule %q supposed to be of type RecordingRule", b.ID())
|
||||
}
|
||||
return compareRecordingRules(t, v, br)
|
||||
default:
|
||||
return fmt.Errorf("unexpected rule type received %T", a)
|
||||
}
|
||||
}
|
||||
|
||||
func compareRecordingRules(t *testing.T, a, b *RecordingRule) error {
|
||||
t.Helper()
|
||||
if a.Expr != b.Expr {
|
||||
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
|
||||
t.Helper()
|
||||
if a.Expr != b.Expr {
|
||||
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
|
||||
}
|
||||
if a.For != b.For {
|
||||
return fmt.Errorf("expected to have for %q; got %q", a.For, b.For)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
|
||||
return fmt.Errorf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
|
||||
t.Helper()
|
||||
if len(a) != len(b) {
|
||||
return fmt.Errorf("expected number of timeseries %d; got %d", len(a), len(b))
|
||||
}
|
||||
for i := range a {
|
||||
expTS, gotTS := a[i], b[i]
|
||||
if len(expTS.Samples) != len(gotTS.Samples) {
|
||||
return fmt.Errorf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
|
||||
}
|
||||
for i, exp := range expTS.Samples {
|
||||
got := gotTS.Samples[i]
|
||||
if got.Value != exp.Value {
|
||||
return fmt.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
|
||||
}
|
||||
// timestamp validation isn't always correct for now.
|
||||
// this must be improved with time mock.
|
||||
/*if got.Timestamp != exp.Timestamp {
|
||||
return fmt.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
|
||||
}*/
|
||||
}
|
||||
if len(expTS.Labels) != len(gotTS.Labels) {
|
||||
return fmt.Errorf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
|
||||
}
|
||||
for i, exp := range expTS.Labels {
|
||||
got := gotTS.Labels[i]
|
||||
if got.Name != exp.Name {
|
||||
return fmt.Errorf("expected label name %q; got %q", exp.Name, got.Name)
|
||||
}
|
||||
if got.Value != exp.Value {
|
||||
return fmt.Errorf("expected label value %q; got %q", exp.Value, got.Value)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
|
||||
t.Helper()
|
||||
if len(as) != len(bs) {
|
||||
t.Fatalf("expected to have length %d; got %d", len(as), len(bs))
|
||||
}
|
||||
sort.Slice(as, func(i, j int) bool {
|
||||
return as[i].ID < as[j].ID
|
||||
})
|
||||
sort.Slice(bs, func(i, j int) bool {
|
||||
return bs[i].ID < bs[j].ID
|
||||
})
|
||||
for i := range as {
|
||||
a, b := as[i], bs[i]
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected t have Name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.State != b.State {
|
||||
t.Fatalf("expected t have State %q; got %q", a.State, b.State)
|
||||
}
|
||||
if a.Value != b.Value {
|
||||
t.Fatalf("expected t have Value %f; got %f", a.Value, b.Value)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
|
||||
t.Fatalf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
t.Fatalf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,14 +4,15 @@ import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
@@ -30,72 +31,34 @@ Examples:
|
||||
-rule /path/to/file. Path to a single file with alerting rules
|
||||
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.`)
|
||||
validateTemplates = flag.Bool("rule.validateTemplates", true, "Indicates to validate annotation and label templates")
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
||||
|
||||
datasourceURL = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
|
||||
|
||||
remoteWriteURL = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
|
||||
" in form of timeseries. E.g. http://127.0.0.1:8428")
|
||||
remoteWriteUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
|
||||
remoteWritePassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
|
||||
remoteWriteMaxQueueSize = flag.Int("remoteWrite.maxQueueSize", 10e3, "Defines the max number of pending datapoints to remote write endpoint")
|
||||
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
|
||||
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
|
||||
|
||||
remoteReadURL = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
|
||||
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
remoteReadUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
|
||||
remoteReadPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules. Default 1m")
|
||||
notifierURL = flag.String("notifier.url", "", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
checkFlags()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
|
||||
manager, err := newManager(ctx)
|
||||
if err != nil {
|
||||
logger.Fatalf("can not get external url: %s ", err)
|
||||
logger.Fatalf("failed to init: %s", err)
|
||||
}
|
||||
notifier.InitTemplateFunc(eu)
|
||||
|
||||
manager := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
storage: datasource.NewVMStorage(*datasourceURL, *basicAuthUsername, *basicAuthPassword, &http.Client{}),
|
||||
notifier: notifier.NewAlertManager(*notifierURL, func(group, alert string) string {
|
||||
return fmt.Sprintf("%s/api/v1/%s/%s/status", eu, group, alert)
|
||||
}, &http.Client{}),
|
||||
}
|
||||
if *remoteWriteURL != "" {
|
||||
c, err := remotewrite.NewClient(ctx, remotewrite.Config{
|
||||
Addr: *remoteWriteURL,
|
||||
MaxQueueSize: *remoteWriteMaxQueueSize,
|
||||
FlushInterval: *evaluationInterval,
|
||||
BasicAuthUser: *remoteWriteUsername,
|
||||
BasicAuthPass: *remoteWritePassword,
|
||||
})
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to init remotewrite client: %s", err)
|
||||
}
|
||||
manager.rw = c
|
||||
}
|
||||
if *remoteReadURL != "" {
|
||||
manager.rr = datasource.NewVMStorage(*remoteReadURL, *remoteReadUsername, *remoteReadPassword, &http.Client{})
|
||||
}
|
||||
|
||||
if err := manager.start(ctx, *rulePath, *validateTemplates); err != nil {
|
||||
if err := manager.start(ctx, *rulePath, *validateTemplates, *validateExpressions); err != nil {
|
||||
logger.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
@@ -108,7 +71,7 @@ func main() {
|
||||
<-sigHup
|
||||
configReloads.Inc()
|
||||
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
|
||||
if err := manager.update(ctx, *rulePath, *validateTemplates, false); err != nil {
|
||||
if err := manager.update(ctx, *rulePath, *validateTemplates, *validateExpressions, false); err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("error while reloading rules: %s", err)
|
||||
@@ -139,6 +102,44 @@ var (
|
||||
configTimestamp = metrics.NewCounter(`vmalert_config_last_reload_success_timestamp_seconds`)
|
||||
)
|
||||
|
||||
func newManager(ctx context.Context) (*manager, error) {
|
||||
q, err := datasource.Init()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init datasource: %w", err)
|
||||
}
|
||||
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init `external.url`: %w", err)
|
||||
}
|
||||
notifier.InitTemplateFunc(eu)
|
||||
aug, err := getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init `external.alert.source`: %w", err)
|
||||
}
|
||||
nts, err := notifier.Init(aug)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init notifier: %w", err)
|
||||
}
|
||||
|
||||
manager := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querier: q,
|
||||
notifiers: nts,
|
||||
}
|
||||
rw, err := remotewrite.Init(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
|
||||
}
|
||||
manager.rw = rw
|
||||
|
||||
rr, err := remoteread.Init()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init remoteRead: %w", err)
|
||||
}
|
||||
manager.rr = rr
|
||||
return manager, nil
|
||||
}
|
||||
|
||||
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
|
||||
if externalURL != "" {
|
||||
return url.Parse(externalURL)
|
||||
@@ -158,13 +159,39 @@ func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL
|
||||
return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
|
||||
}
|
||||
|
||||
func checkFlags() {
|
||||
if *notifierURL == "" {
|
||||
flag.PrintDefaults()
|
||||
logger.Fatalf("notifier.url is empty")
|
||||
func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, validateTemplate bool) (notifier.AlertURLGenerator, error) {
|
||||
if externalAlertSource == "" {
|
||||
return func(alert notifier.Alert) string {
|
||||
return fmt.Sprintf("%s/api/v1/%s/%s/status", externalURL, strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10))
|
||||
}, nil
|
||||
}
|
||||
if *datasourceURL == "" {
|
||||
flag.PrintDefaults()
|
||||
logger.Fatalf("datasource.url is empty")
|
||||
if validateTemplate {
|
||||
if err := notifier.ValidateTemplates(map[string]string{
|
||||
"tpl": externalAlertSource,
|
||||
}); err != nil {
|
||||
return nil, fmt.Errorf("error validating source template %s: %w", externalAlertSource, err)
|
||||
}
|
||||
}
|
||||
m := map[string]string{
|
||||
"tpl": externalAlertSource,
|
||||
}
|
||||
return func(alert notifier.Alert) string {
|
||||
templated, err := alert.ExecTemplate(m)
|
||||
if err != nil {
|
||||
logger.Errorf("can not exec source template %s", err)
|
||||
}
|
||||
return fmt.Sprintf("%s/%s", externalURL, templated["tpl"])
|
||||
}, nil
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmalert processes alerts and recording rules.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
53
app/vmalert/main_test.go
Normal file
53
app/vmalert/main_test.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestGetExternalURL(t *testing.T) {
|
||||
expURL := "https://vicotriametrics.com/path"
|
||||
u, err := getExternalURL(expURL, "", false)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if u.String() != expURL {
|
||||
t.Errorf("unexpected url want %s, got %s", expURL, u.String())
|
||||
}
|
||||
h, _ := os.Hostname()
|
||||
expURL = fmt.Sprintf("https://%s:4242", h)
|
||||
u, err = getExternalURL("", "0.0.0.0:4242", true)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if u.String() != expURL {
|
||||
t.Errorf("unexpected url want %s, got %s", expURL, u.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetAlertURLGenerator(t *testing.T) {
|
||||
testAlert := notifier.Alert{GroupID: 42, ID: 2, Value: 4}
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
fn, err := getAlertURLGenerator(u, "", false)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if exp := "https://victoriametrics.com/path/api/v1/42/2/status"; exp != fn(testAlert) {
|
||||
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
|
||||
}
|
||||
_, err = getAlertURLGenerator(nil, "foo?{{invalid}}", true)
|
||||
if err == nil {
|
||||
t.Errorf("exptected tempalte validation error got nil")
|
||||
}
|
||||
fn, err = getAlertURLGenerator(u, "foo?query={{$value}}", true)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if exp := "https://victoriametrics.com/path/foo?query=4"; exp != fn(testAlert) {
|
||||
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
|
||||
}
|
||||
}
|
||||
@@ -6,15 +6,17 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// manager controls group states
|
||||
type manager struct {
|
||||
storage datasource.Querier
|
||||
notifier notifier.Notifier
|
||||
querier datasource.Querier
|
||||
notifiers []notifier.Notifier
|
||||
|
||||
rw *remotewrite.Client
|
||||
rr datasource.Querier
|
||||
@@ -25,7 +27,7 @@ type manager struct {
|
||||
groups map[uint64]*Group
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
||||
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
@@ -35,15 +37,19 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
return nil, fmt.Errorf("can't find group with id %q", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
if apiAlert := rule.AlertAPI(aID); apiAlert != nil {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
|
||||
return apiAlert, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("can't func alert with id %q in group %q", aID, g.Name)
|
||||
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
|
||||
}
|
||||
|
||||
func (m *manager) start(ctx context.Context, path []string, validate bool) error {
|
||||
return m.update(ctx, path, validate, true)
|
||||
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
|
||||
return m.update(ctx, path, validateTpl, validateExpr, true)
|
||||
}
|
||||
|
||||
func (m *manager) close() {
|
||||
@@ -56,7 +62,7 @@ func (m *manager) close() {
|
||||
m.wg.Wait()
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, group Group, restore bool) {
|
||||
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
|
||||
if restore && m.rr != nil {
|
||||
err := group.Restore(ctx, m.rr, *remoteReadLookBack)
|
||||
if err != nil {
|
||||
@@ -67,21 +73,22 @@ func (m *manager) startGroup(ctx context.Context, group Group, restore bool) {
|
||||
m.wg.Add(1)
|
||||
id := group.ID()
|
||||
go func() {
|
||||
group.start(ctx, *evaluationInterval, m.storage, m.notifier, m.rw)
|
||||
group.start(ctx, m.querier, m.notifiers, m.rw)
|
||||
m.wg.Done()
|
||||
}()
|
||||
m.groups[id] = &group
|
||||
m.groups[id] = group
|
||||
}
|
||||
|
||||
func (m *manager) update(ctx context.Context, path []string, validate, restore bool) error {
|
||||
logger.Infof("reading alert rules configuration file from %q", strings.Join(path, ";"))
|
||||
newGroups, err := Parse(path, validate)
|
||||
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
|
||||
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
|
||||
groupsCfg, err := config.Parse(path, validateTpl, validateExpr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse configuration file: %s", err)
|
||||
return fmt.Errorf("cannot parse configuration file: %w", err)
|
||||
}
|
||||
|
||||
groupsRegistry := make(map[uint64]Group)
|
||||
for _, ng := range newGroups {
|
||||
groupsRegistry := make(map[uint64]*Group)
|
||||
for _, cfg := range groupsCfg {
|
||||
ng := newGroup(cfg, *evaluationInterval)
|
||||
groupsRegistry[ng.ID()] = ng
|
||||
}
|
||||
|
||||
@@ -106,3 +113,23 @@ func (m *manager) update(ctx context.Context, path []string, validate, restore b
|
||||
m.groupsMu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) toAPI() APIGroup {
|
||||
ag := APIGroup{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
Name: g.Name,
|
||||
File: g.File,
|
||||
Interval: g.Interval.String(),
|
||||
Concurrency: g.Concurrency,
|
||||
}
|
||||
for _, r := range g.Rules {
|
||||
switch v := r.(type) {
|
||||
case *AlertingRule:
|
||||
ag.AlertingRules = append(ag.AlertingRules, v.RuleAPI())
|
||||
case *RecordingRule:
|
||||
ag.RecordingRules = append(ag.RecordingRules, v.RuleAPI())
|
||||
}
|
||||
}
|
||||
return ag
|
||||
}
|
||||
|
||||
@@ -3,16 +3,26 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"math/rand"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestManagerUpdateError(t *testing.T) {
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
path := []string{"foo/bar"}
|
||||
err := m.update(context.Background(), path, true, false)
|
||||
err := m.update(context.Background(), path, true, true, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to have err; got nil instead")
|
||||
}
|
||||
@@ -27,17 +37,21 @@ func TestManagerUpdateError(t *testing.T) {
|
||||
// Should be executed with -race flag
|
||||
func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
storage: &fakeQuerier{},
|
||||
notifier: &fakeNotifier{},
|
||||
groups: make(map[uint64]*Group),
|
||||
querier: &fakeQuerier{},
|
||||
notifiers: []notifier.Notifier{&fakeNotifier{}},
|
||||
}
|
||||
paths := []string{
|
||||
"testdata/dir/rules0-good.rules",
|
||||
"testdata/dir/rules1-good.rules",
|
||||
"testdata/rules0-good.rules",
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
"config/testdata/dir/rules0-bad.rules",
|
||||
"config/testdata/dir/rules1-good.rules",
|
||||
"config/testdata/dir/rules1-bad.rules",
|
||||
"config/testdata/rules0-good.rules",
|
||||
"config/testdata/rules1-good.rules",
|
||||
"config/testdata/rules2-good.rules",
|
||||
}
|
||||
*evaluationInterval = time.Millisecond
|
||||
if err := m.start(context.Background(), []string{paths[0]}, true); err != nil {
|
||||
if err := m.start(context.Background(), []string{paths[0]}, true, true); err != nil {
|
||||
t.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
@@ -51,10 +65,7 @@ func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
for i := 0; i < iterations; i++ {
|
||||
rnd := rand.Intn(len(paths))
|
||||
path := []string{paths[rnd]}
|
||||
err := m.update(context.Background(), path, true, false)
|
||||
if err != nil {
|
||||
t.Errorf("update error: %s", err)
|
||||
}
|
||||
_ = m.update(context.Background(), path, true, true, false)
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -64,6 +75,41 @@ func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
// TestManagerUpdate tests sequential configuration
|
||||
// updates.
|
||||
func TestManagerUpdate(t *testing.T) {
|
||||
const defaultEvalInterval = time.Second * 30
|
||||
currentEvalInterval := *evaluationInterval
|
||||
*evaluationInterval = defaultEvalInterval
|
||||
defer func() {
|
||||
*evaluationInterval = currentEvalInterval
|
||||
}()
|
||||
|
||||
var (
|
||||
VMRows = &AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 10 * time.Second,
|
||||
Labels: map[string]string{
|
||||
"label": "bar",
|
||||
"host": "{{ $labels.instance }}",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value|humanize }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}
|
||||
Conns = &AlertingRule{
|
||||
Name: "Conns",
|
||||
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
|
||||
Annotations: map[string]string{
|
||||
"summary": "Too high connection number for {{$labels.instance}}",
|
||||
"description": "It is {{ $value }} connections for {{$labels.instance}}",
|
||||
},
|
||||
}
|
||||
ExampleAlertAlwaysFiring = &AlertingRule{
|
||||
Name: "ExampleAlertAlwaysFiring",
|
||||
Expr: "sum by(job) (up == 1)",
|
||||
}
|
||||
)
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
initPath string
|
||||
@@ -72,49 +118,65 @@ func TestManagerUpdate(t *testing.T) {
|
||||
}{
|
||||
{
|
||||
name: "update good rules",
|
||||
initPath: "testdata/rules0-good.rules",
|
||||
updatePath: "testdata/dir/rules1-good.rules",
|
||||
initPath: "config/testdata/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules1-good.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "testdata/dir/rules1-good.rules",
|
||||
Name: "duplicatedGroupDiffFiles",
|
||||
Rules: []*Rule{newTestRule("VMRows", time.Second*10)},
|
||||
File: "config/testdata/dir/rules1-good.rules",
|
||||
Name: "duplicatedGroupDiffFiles",
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{
|
||||
&AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 5 * time.Minute,
|
||||
Labels: map[string]string{"label": "bar"},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update good rules from 1 to 2 groups",
|
||||
initPath: "testdata/dir/rules1-good.rules",
|
||||
updatePath: "testdata/rules0-good.rules",
|
||||
initPath: "config/testdata/dir/rules1-good.rules",
|
||||
updatePath: "config/testdata/rules0-good.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert", Rules: []*Rule{
|
||||
newTestRule("VMRows", time.Second*10),
|
||||
}},
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Rules: []Rule{VMRows},
|
||||
Interval: defaultEvalInterval,
|
||||
},
|
||||
{
|
||||
File: "testdata/rules0-good.rules",
|
||||
Name: "TestGroup", Rules: []*Rule{
|
||||
newTestRule("Conns", time.Duration(0)),
|
||||
newTestRule("ExampleAlertAlwaysFiring", time.Duration(0)),
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update with one bad rule file",
|
||||
initPath: "testdata/rules0-good.rules",
|
||||
updatePath: "testdata/dir/rules2-bad.rules",
|
||||
initPath: "config/testdata/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules2-bad.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert", Rules: []*Rule{
|
||||
newTestRule("VMRows", time.Second*10),
|
||||
}},
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "testdata/rules0-good.rules",
|
||||
Name: "TestGroup", Rules: []*Rule{
|
||||
newTestRule("Conns", time.Duration(0)),
|
||||
newTestRule("ExampleAlertAlwaysFiring", time.Duration(0)),
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
@@ -122,14 +184,14 @@ func TestManagerUpdate(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.TODO())
|
||||
m := &manager{groups: make(map[uint64]*Group), storage: &fakeQuerier{}}
|
||||
m := &manager{groups: make(map[uint64]*Group), querier: &fakeQuerier{}}
|
||||
path := []string{tc.initPath}
|
||||
if err := m.update(ctx, path, true, false); err != nil {
|
||||
if err := m.update(ctx, path, true, true, false); err != nil {
|
||||
t.Fatalf("failed to complete initial rules update: %s", err)
|
||||
}
|
||||
|
||||
path = []string{tc.updatePath}
|
||||
_ = m.update(ctx, path, true, false)
|
||||
_ = m.update(ctx, path, true, true, false)
|
||||
if len(tc.want) != len(m.groups) {
|
||||
t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups))
|
||||
}
|
||||
@@ -139,7 +201,7 @@ func TestManagerUpdate(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatalf("expected to have group %q", wantG.Name)
|
||||
}
|
||||
compareGroups(t, gotG, wantG)
|
||||
compareGroups(t, wantG, gotG)
|
||||
}
|
||||
|
||||
cancel()
|
||||
@@ -147,17 +209,3 @@ func TestManagerUpdate(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func compareGroups(t *testing.T, a, b *Group) {
|
||||
t.Helper()
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if got.Name != want.Name {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.Name, got.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,8 @@ import (
|
||||
"strings"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
// Alert the triggered alert
|
||||
@@ -77,7 +79,7 @@ func ValidateTemplates(annotations map[string]string) error {
|
||||
func templateAnnotations(annotations map[string]string, header string, data alertTplData) (map[string]string, error) {
|
||||
var builder strings.Builder
|
||||
var buf bytes.Buffer
|
||||
eg := errGroup{}
|
||||
eg := new(utils.ErrGroup)
|
||||
r := make(map[string]string, len(annotations))
|
||||
for key, text := range annotations {
|
||||
r[key] = text
|
||||
@@ -87,21 +89,21 @@ func templateAnnotations(annotations map[string]string, header string, data aler
|
||||
builder.WriteString(header)
|
||||
builder.WriteString(text)
|
||||
if err := templateAnnotation(&buf, builder.String(), data); err != nil {
|
||||
eg.errs = append(eg.errs, fmt.Sprintf("key %s, template %s:%s", key, text, err))
|
||||
eg.Add(fmt.Errorf("key %q, template %q: %w", key, text, err))
|
||||
continue
|
||||
}
|
||||
r[key] = buf.String()
|
||||
}
|
||||
return r, eg.err()
|
||||
return r, eg.Err()
|
||||
}
|
||||
|
||||
func templateAnnotation(dst io.Writer, text string, data alertTplData) error {
|
||||
tpl, err := template.New("").Funcs(tmplFunc).Option("missingkey=zero").Parse(text)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing annotation:%w", err)
|
||||
return fmt.Errorf("error parsing annotation: %w", err)
|
||||
}
|
||||
if err = tpl.Execute(dst, data); err != nil {
|
||||
return fmt.Errorf("error evaluating annotation template:%w", err)
|
||||
return fmt.Errorf("error evaluating annotation template: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestAlert_ExecTemplate(t *testing.T) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
InitTemplateFunc(u)
|
||||
testCases := []struct {
|
||||
name string
|
||||
alert *Alert
|
||||
|
||||
@@ -12,9 +12,11 @@ import (
|
||||
// AlertManager represents integration provider with Prometheus alert manager
|
||||
// https://github.com/prometheus/alertmanager
|
||||
type AlertManager struct {
|
||||
alertURL string
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
alertURL string
|
||||
basicAuthUser string
|
||||
basicAuthPass string
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
// Send an alert or resolve message
|
||||
@@ -28,6 +30,9 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req = req.WithContext(ctx)
|
||||
if am.basicAuthPass != "" {
|
||||
req.SetBasicAuth(am.basicAuthUser, am.basicAuthPass)
|
||||
}
|
||||
resp, err := am.client.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -38,7 +43,7 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read response from %q: %s", am.alertURL, err)
|
||||
return fmt.Errorf("failed to read response from %q: %w", am.alertURL, err)
|
||||
}
|
||||
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.alertURL, string(body))
|
||||
}
|
||||
@@ -46,15 +51,18 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
|
||||
}
|
||||
|
||||
// AlertURLGenerator returns URL to single alert by given name
|
||||
type AlertURLGenerator func(group, alert string) string
|
||||
type AlertURLGenerator func(Alert) string
|
||||
|
||||
const alertManagerPath = "/api/v2/alerts"
|
||||
|
||||
// NewAlertManager is a constructor for AlertManager
|
||||
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, c *http.Client) *AlertManager {
|
||||
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
|
||||
addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
|
||||
return &AlertManager{
|
||||
alertURL: strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath,
|
||||
argFunc: fn,
|
||||
client: c,
|
||||
alertURL: addr,
|
||||
argFunc: fn,
|
||||
client: c,
|
||||
basicAuthUser: user,
|
||||
basicAuthPass: pass,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
{% import (
|
||||
"strconv"
|
||||
"time"
|
||||
) %}
|
||||
{% stripspace %}
|
||||
|
||||
{% func amRequest(alerts []Alert, generatorURL func(string, string) string) %}
|
||||
{% func amRequest(alerts []Alert, generatorURL func(Alert) string) %}
|
||||
[
|
||||
{% for i, alert := range alerts %}
|
||||
{
|
||||
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
|
||||
"generatorURL": {%q= generatorURL(strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10)) %},
|
||||
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
|
||||
"generatorURL": {%q= generatorURL(alert) %},
|
||||
{% if !alert.End.IsZero() %}
|
||||
"endsAt":{%q= alert.End.Format(time.RFC3339Nano) %},
|
||||
{% endif %}
|
||||
|
||||
@@ -6,126 +6,125 @@ package notifier
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:1
|
||||
import (
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:7
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
import (
|
||||
qtio422016 "io"
|
||||
|
||||
qt422016 "github.com/valyala/quicktemplate"
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:7
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
var (
|
||||
_ = qtio422016.Copy
|
||||
_ = qt422016.AcquireByteBuffer
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:7
|
||||
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:7
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
qw422016.N().S(`[`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:9
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:8
|
||||
for i, alert := range alerts {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:9
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:8
|
||||
qw422016.N().S(`{"startsAt":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:10
|
||||
qw422016.N().Q(alert.Start.Format(time.RFC3339Nano))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:10
|
||||
qw422016.N().S(`,"generatorURL":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
qw422016.N().Q(generatorURL(strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10)))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
qw422016.N().Q(generatorURL(alert))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
if !alert.End.IsZero() {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
qw422016.N().S(`"endsAt":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:14
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
qw422016.N().Q(alert.End.Format(time.RFC3339Nano))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:14
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:15
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:15
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:14
|
||||
qw422016.N().S(`"labels": {"alertname":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:17
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:16
|
||||
qw422016.N().Q(alert.Name)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:17
|
||||
for k, v := range alert.Labels {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:17
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().Q(k)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().S(`:`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().Q(v)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:20
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:20
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
qw422016.N().S(`},"annotations": {`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:23
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:22
|
||||
c := len(alert.Annotations)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:24
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:23
|
||||
for k, v := range alert.Annotations {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:24
|
||||
c = c - 1
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().Q(k)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().S(`:`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().Q(v)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
if c > 0 {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:27
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:27
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
qw422016.N().S(`}}`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
if i != len(alerts)-1 {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:31
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:31
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
qw422016.N().S(`]`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
streamamRequest(qw422016, alerts, generatorURL)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qt422016.ReleaseWriter(qw422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
func amRequest(alerts []Alert, generatorURL func(string, string) string) string {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
func amRequest(alerts []Alert, generatorURL func(Alert) string) string {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
writeamRequest(qb422016, alerts, generatorURL)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qs422016 := string(qb422016.B)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qt422016.ReleaseByteBuffer(qb422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
return qs422016
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
|
||||
@@ -5,17 +5,27 @@ import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestAlertManager_Send(t *testing.T) {
|
||||
const baUser, baPass = "foo", "bar"
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
|
||||
t.Errorf("should not be called")
|
||||
})
|
||||
c := -1
|
||||
mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) {
|
||||
user, pass, ok := r.BasicAuth()
|
||||
if !ok {
|
||||
t.Errorf("unauthorized request")
|
||||
}
|
||||
if user != baUser || pass != baPass {
|
||||
t.Errorf("wrong creds %q:%q; expected %q:%q",
|
||||
user, pass, baUser, baPass)
|
||||
}
|
||||
c++
|
||||
if r.Method != http.MethodPost {
|
||||
t.Errorf("expected POST method got %s", r.Method)
|
||||
@@ -57,8 +67,8 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
})
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
am := NewAlertManager(srv.URL, func(group, name string) string {
|
||||
return group + "/" + name
|
||||
am := NewAlertManager(srv.URL, baUser, baPass, func(alert Alert) string {
|
||||
return strconv.FormatUint(alert.GroupID, 10) + "/" + strconv.FormatUint(alert.ID, 10)
|
||||
}, srv.Client())
|
||||
if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {
|
||||
t.Error("expected connection error got nil")
|
||||
|
||||
47
app/vmalert/notifier/init.go
Normal file
47
app/vmalert/notifier/init.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
)
|
||||
|
||||
var (
|
||||
addrs = flagutil.NewArray("notifier.url", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
|
||||
basicAuthUsername = flagutil.NewArray("notifier.basicAuth.username", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flagutil.NewArray("notifier.basicAuth.password", "Optional basic auth password for -datasource.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("notifier.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -notifier.url")
|
||||
tlsCertFile = flagutil.NewArray("notifier.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
|
||||
tlsKeyFile = flagutil.NewArray("notifier.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -notifier.url")
|
||||
tlsCAFile = flagutil.NewArray("notifier.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -notifier.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flagutil.NewArray("notifier.tlsServerName", "Optional TLS server name to use for connections to -notifier.url. "+
|
||||
"By default the server name from -notifier.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Notifier object based on provided flags.
|
||||
func Init(gen AlertURLGenerator) ([]Notifier, error) {
|
||||
if len(*addrs) == 0 {
|
||||
flag.PrintDefaults()
|
||||
return nil, fmt.Errorf("at least one `-notifier.url` must be set")
|
||||
}
|
||||
|
||||
var notifiers []Notifier
|
||||
for i, addr := range *addrs {
|
||||
cert, key := tlsCertFile.GetOptionalArg(i), tlsKeyFile.GetOptionalArg(i)
|
||||
ca, serverName := tlsCAFile.GetOptionalArg(i), tlsServerName.GetOptionalArg(i)
|
||||
tr, err := utils.Transport(addr, cert, key, ca, serverName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
user, pass := basicAuthUsername.GetOptionalArg(i), basicAuthPassword.GetOptionalArg(i)
|
||||
am := NewAlertManager(addr, user, pass, gen, &http.Client{Transport: tr})
|
||||
notifiers = append(notifiers, am)
|
||||
}
|
||||
|
||||
return notifiers, nil
|
||||
}
|
||||
13
app/vmalert/notifier/package_test.go
Normal file
13
app/vmalert/notifier/package_test.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type errGroup struct {
|
||||
errs []string
|
||||
}
|
||||
|
||||
func (eg *errGroup) err() error {
|
||||
if eg == nil || len(eg.errs) == 0 {
|
||||
return nil
|
||||
}
|
||||
return eg
|
||||
}
|
||||
|
||||
func (eg *errGroup) Error() string {
|
||||
return fmt.Sprintf("errors:%s", strings.Join(eg.errs, "\n"))
|
||||
}
|
||||
149
app/vmalert/recording.go
Normal file
149
app/vmalert/recording.go
Normal file
@@ -0,0 +1,149 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// RecordingRule is a Rule that supposed
|
||||
// to evaluate configured Expression and
|
||||
// return TimeSeries as result.
|
||||
type RecordingRule struct {
|
||||
RuleID uint64
|
||||
Name string
|
||||
Expr string
|
||||
Labels map[string]string
|
||||
GroupID uint64
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
func (rr *RecordingRule) String() string {
|
||||
return rr.Name
|
||||
}
|
||||
|
||||
// ID returns unique Rule ID
|
||||
// within the parent Group.
|
||||
func (rr *RecordingRule) ID() uint64 {
|
||||
return rr.RuleID
|
||||
}
|
||||
|
||||
func newRecordingRule(gID uint64, cfg config.Rule) *RecordingRule {
|
||||
return &RecordingRule{
|
||||
RuleID: cfg.ID,
|
||||
Name: cfg.Record,
|
||||
Expr: cfg.Expr,
|
||||
Labels: cfg.Labels,
|
||||
GroupID: gID,
|
||||
}
|
||||
}
|
||||
|
||||
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")
|
||||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
if !series {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
qMetrics, err := q.Query(ctx, rr.Expr)
|
||||
|
||||
rr.mu.Lock()
|
||||
defer rr.mu.Unlock()
|
||||
|
||||
rr.lastExecTime = time.Now()
|
||||
rr.lastExecError = err
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
}
|
||||
|
||||
duplicates := make(map[uint64]prompbmarshal.TimeSeries, len(qMetrics))
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, r := range qMetrics {
|
||||
ts := rr.toTimeSeries(r, rr.lastExecTime)
|
||||
h := hashTimeSeries(ts)
|
||||
if _, ok := duplicates[h]; ok {
|
||||
rr.lastExecError = errDuplicate
|
||||
return nil, errDuplicate
|
||||
}
|
||||
duplicates[h] = ts
|
||||
tss = append(tss, ts)
|
||||
}
|
||||
return tss, nil
|
||||
}
|
||||
|
||||
func hashTimeSeries(ts prompbmarshal.TimeSeries) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := ts.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for _, l := range m.Labels {
|
||||
labels[l.Name] = l.Value
|
||||
}
|
||||
labels["__name__"] = rr.Name
|
||||
// override existing labels with configured ones
|
||||
for k, v := range rr.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
return newTimeSeries(m.Value, labels, timestamp)
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
|
||||
}
|
||||
rr.Expr = nr.Expr
|
||||
rr.Labels = nr.Labels
|
||||
return nil
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIRecordingRule
|
||||
func (rr *RecordingRule) RuleAPI() APIRecordingRule {
|
||||
var lastErr string
|
||||
if rr.lastExecError != nil {
|
||||
lastErr = rr.lastExecError.Error()
|
||||
}
|
||||
return APIRecordingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
Name: rr.Name,
|
||||
Expression: rr.Expr,
|
||||
LastError: lastErr,
|
||||
LastExec: rr.lastExecTime,
|
||||
Labels: rr.Labels,
|
||||
}
|
||||
}
|
||||
121
app/vmalert/recording_test.go
Normal file
121
app/vmalert/recording_test.go
Normal file
@@ -0,0 +1,121 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *RecordingRule
|
||||
metrics []datasource.Metric
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
&RecordingRule{Name: "foo"},
|
||||
[]datasource.Metric{metricWithValueAndLabels(t, 10,
|
||||
"__name__", "bar",
|
||||
)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(10, map[string]string{
|
||||
"__name__": "foo",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "foobarbaz"},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
|
||||
metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "foo",
|
||||
}, timestamp),
|
||||
newTimeSeries(2, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "bar",
|
||||
}, timestamp),
|
||||
newTimeSeries(3, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "baz",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(2, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "foo",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "bar",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
tss, err := tc.rule.Exec(context.TODO(), fq, true)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
|
||||
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"job": "test",
|
||||
}}
|
||||
|
||||
fq := &fakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
|
||||
_, err := rr.Exec(context.TODO(), fq, true)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), expErr) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
|
||||
}
|
||||
|
||||
fq.reset()
|
||||
|
||||
// add metrics which differs only by `job` label
|
||||
// which will be overridden by rule
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.Exec(context.TODO(), fq, true)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), errDuplicate.Error()) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
|
||||
}
|
||||
}
|
||||
39
app/vmalert/remoteread/init.go
Normal file
39
app/vmalert/remoteread/init.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package remoteread
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
|
||||
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
|
||||
basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
|
||||
tlsCertFile = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
|
||||
tlsKeyFile = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
|
||||
tlsCAFile = flag.String("remoteRead.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
|
||||
"By default the server name from -remoteRead.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Querier from provided flag values.
|
||||
// Returns nil if addr flag wasn't set.
|
||||
func Init() (datasource.Querier, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &http.Client{Transport: tr}
|
||||
return datasource.NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, c), nil
|
||||
}
|
||||
54
app/vmalert/remotewrite/init.go
Normal file
54
app/vmalert/remotewrite/init.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
|
||||
" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
|
||||
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
|
||||
|
||||
maxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
|
||||
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines defines max number of timeseries to be flushed at once")
|
||||
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
|
||||
flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
|
||||
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
|
||||
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
|
||||
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
|
||||
"By default the server name from -remoteWrite.url is used")
|
||||
)
|
||||
|
||||
// Init creates Client object from given flags.
|
||||
// Returns nil if addr flag wasn't set.
|
||||
func Init(ctx context.Context) (*Client, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
|
||||
return NewClient(ctx, Config{
|
||||
Addr: *addr,
|
||||
Concurrency: *concurrency,
|
||||
MaxQueueSize: *maxQueueSize,
|
||||
MaxBatchSize: *maxBatchSize,
|
||||
FlushInterval: *flushInterval,
|
||||
BasicAuthUser: *basicAuthUsername,
|
||||
BasicAuthPass: *basicAuthPassword,
|
||||
Transport: t,
|
||||
})
|
||||
}
|
||||
@@ -38,23 +38,30 @@ type Config struct {
|
||||
BasicAuthUser string
|
||||
BasicAuthPass string
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// WriteTimeout defines timeout for HTTP write request
|
||||
// to remote storage
|
||||
WriteTimeout time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
const (
|
||||
defaultConcurrency = 4
|
||||
defaultMaxBatchSize = 1e3
|
||||
defaultMaxQueueSize = 100
|
||||
defaultFlushInterval = 5 * time.Second
|
||||
defaultMaxQueueSize = 1e5
|
||||
defaultFlushInterval = time.Second
|
||||
defaultWriteTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
@@ -80,7 +87,8 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: cfg.WriteTimeout,
|
||||
Timeout: cfg.WriteTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/") + writePath,
|
||||
baUser: cfg.BasicAuthUser,
|
||||
@@ -90,7 +98,13 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
c.run(ctx)
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
@@ -128,7 +142,10 @@ func (c *Client) run(ctx context.Context) {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), time.Second*10)
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
|
||||
@@ -2,339 +2,23 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// Rule is basic alert entity
|
||||
type Rule struct {
|
||||
Name string `yaml:"alert"`
|
||||
Expr string `yaml:"expr"`
|
||||
For time.Duration `yaml:"for"`
|
||||
Labels map[string]string `yaml:"labels"`
|
||||
Annotations map[string]string `yaml:"annotations"`
|
||||
|
||||
group Group
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
func (r *Rule) id() string {
|
||||
return r.Name
|
||||
}
|
||||
|
||||
// Validate validates rule
|
||||
func (r *Rule) Validate() error {
|
||||
if r.Name == "" {
|
||||
return errors.New("rule name can not be empty")
|
||||
}
|
||||
if r.Expr == "" {
|
||||
return fmt.Errorf("expression for rule %q can't be empty", r.Name)
|
||||
}
|
||||
if _, err := metricsql.Parse(r.Expr); err != nil {
|
||||
return fmt.Errorf("invalid expression for rule %q: %w", r.Name, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Exec executes Rule expression via the given Querier.
|
||||
// Based on the Querier results Rule maintains notifier.Alerts
|
||||
func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
|
||||
qMetrics, err := q.Query(ctx, r.Expr)
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
r.lastExecError = err
|
||||
r.lastExecTime = time.Now()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to execute query %q: %s", r.Expr, err)
|
||||
}
|
||||
|
||||
for h, a := range r.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
if a.State == notifier.StateInactive {
|
||||
delete(r.alerts, h)
|
||||
}
|
||||
}
|
||||
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
h := hash(m)
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := r.alerts[h]; ok {
|
||||
if a.Value != m.Value {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Value
|
||||
// and re-exec template since Value can be used
|
||||
// in templates
|
||||
err = r.template(a)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := r.newAlert(m)
|
||||
if err != nil {
|
||||
r.lastExecError = err
|
||||
return fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
r.alerts[h] = a
|
||||
}
|
||||
|
||||
for h, a := range r.alerts {
|
||||
// if alert wasn't updated in this iteration
|
||||
// means it is resolved already
|
||||
if _, ok := updated[h]; !ok {
|
||||
if a.State == notifier.StatePending {
|
||||
// alert was in Pending state - it is not
|
||||
// active anymore
|
||||
delete(r.alerts, h)
|
||||
continue
|
||||
}
|
||||
a.State = notifier.StateInactive
|
||||
continue
|
||||
}
|
||||
if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
|
||||
a.State = notifier.StateFiring
|
||||
alertsFired.Inc()
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(m datasource.Metric) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := m.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
|
||||
a := ¬ifier.Alert{
|
||||
GroupID: r.group.ID(),
|
||||
Name: r.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Value,
|
||||
Start: time.Now(),
|
||||
Expr: r.Expr,
|
||||
// TODO: support End time
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
a.Labels[l.Name] = l.Value
|
||||
}
|
||||
return a, r.template(a)
|
||||
}
|
||||
|
||||
func (r *Rule) template(a *notifier.Alert) error {
|
||||
// 1. template rule labels with data labels
|
||||
rLabels, err := a.ExecTemplate(r.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 2. merge data labels and rule labels
|
||||
// metric labels may be overridden by
|
||||
// rule labels
|
||||
for k, v := range rLabels {
|
||||
a.Labels[k] = v
|
||||
}
|
||||
|
||||
// 3. template merged labels
|
||||
a.Labels, err = a.ExecTemplate(a.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
a.Annotations, err = a.ExecTemplate(r.Annotations)
|
||||
return err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (r *Rule) AlertAPI(id uint64) *APIAlert {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
a, ok := r.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return r.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// AlertsAPI generates list of APIAlert objects from existing alerts
|
||||
func (r *Rule) AlertsAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
r.mu.RLock()
|
||||
for _, a := range r.alerts {
|
||||
alerts = append(alerts, r.newAlertAPI(*a))
|
||||
}
|
||||
r.mu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (r *Rule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
return &APIAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: r.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.Start,
|
||||
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
// AlertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
// AlertForStateMetricName is the metric name for 'for' state of alert.
|
||||
alertForStateMetricName = "ALERTS_FOR_STATE"
|
||||
|
||||
// AlertNameLabel is the label name indicating the name of an alert.
|
||||
alertNameLabel = "alertname"
|
||||
// AlertStateLabel is the label name indicating the state of an alert.
|
||||
alertStateLabel = "alertstate"
|
||||
)
|
||||
|
||||
// AlertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
func (r *Rule) AlertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(r.Name, a, timestamp))
|
||||
if r.For > 0 {
|
||||
tss = append(tss, alertForToTimeSeries(r.Name, a, timestamp))
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertMetricName
|
||||
labels[alertNameLabel] = name
|
||||
labels[alertStateLabel] = a.State.String()
|
||||
return newTimeSeries(1, labels, timestamp)
|
||||
}
|
||||
|
||||
// alertForToTimeSeries returns a timeseries that represents
|
||||
// state of active alerts, where value is time when alert become active
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
labels[alertNameLabel] = name
|
||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
}
|
||||
|
||||
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{}
|
||||
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
||||
Value: value,
|
||||
Timestamp: timestamp.UnixNano() / 1e6,
|
||||
})
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
||||
Name: key,
|
||||
Value: labels[key],
|
||||
})
|
||||
}
|
||||
return ts
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||
// to be updated on next Exec, as well as Value field.
|
||||
func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
if q == nil {
|
||||
return fmt.Errorf("querier is nil")
|
||||
}
|
||||
|
||||
// Get the last datapoint in range via MetricsQL `last_over_time`.
|
||||
// We don't use plain PromQL since Prometheus doesn't support
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
|
||||
alertForStateMetricName, r.Name, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
labels := m.Labels
|
||||
m.Labels = make([]datasource.Label, 0)
|
||||
// drop all extra labels, so hash key will
|
||||
// be identical to timeseries received in Exec
|
||||
for _, l := range labels {
|
||||
if l.Name == alertNameLabel {
|
||||
continue
|
||||
}
|
||||
// drop all overridden labels
|
||||
if _, ok := r.Labels[l.Name]; ok {
|
||||
continue
|
||||
}
|
||||
m.Labels = append(m.Labels, l)
|
||||
}
|
||||
|
||||
a, err := r.newAlert(m)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
a.ID = hash(m)
|
||||
a.State = notifier.StatePending
|
||||
a.Start = time.Unix(int64(m.Value), 0)
|
||||
r.alerts[a.ID] = a
|
||||
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
|
||||
}
|
||||
return nil
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// Returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// Exec executes the rule with given context
|
||||
// and Querier. If returnSeries is true, Exec
|
||||
// may return TimeSeries as result of execution
|
||||
Exec(ctx context.Context, q datasource.Querier, returnSeries bool) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
}
|
||||
|
||||
28
app/vmalert/utils.go
Normal file
28
app/vmalert/utils.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{}
|
||||
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
||||
Value: value,
|
||||
Timestamp: timestamp.UnixNano() / 1e6,
|
||||
})
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
||||
Name: key,
|
||||
Value: labels[key],
|
||||
})
|
||||
}
|
||||
return ts
|
||||
}
|
||||
43
app/vmalert/utils/err_group.go
Normal file
43
app/vmalert/utils/err_group.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ErrGroup accumulates multiple errors
|
||||
// and produces single error message.
|
||||
type ErrGroup struct {
|
||||
errs []error
|
||||
}
|
||||
|
||||
// Add adds a new error to group.
|
||||
// Isn't thread-safe.
|
||||
func (eg *ErrGroup) Add(err error) {
|
||||
eg.errs = append(eg.errs, err)
|
||||
}
|
||||
|
||||
// Err checks if group contains at least
|
||||
// one error.
|
||||
func (eg *ErrGroup) Err() error {
|
||||
if eg == nil || len(eg.errs) == 0 {
|
||||
return nil
|
||||
}
|
||||
return eg
|
||||
}
|
||||
|
||||
// Error satisfies Error interface
|
||||
func (eg *ErrGroup) Error() string {
|
||||
if len(eg.errs) == 0 {
|
||||
return ""
|
||||
}
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "errors(%d): ", len(eg.errs))
|
||||
for i, err := range eg.errs {
|
||||
b.WriteString(err.Error())
|
||||
if i != len(eg.errs)-1 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
38
app/vmalert/utils/err_group_test.go
Normal file
38
app/vmalert/utils/err_group_test.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestErrGroup(t *testing.T) {
|
||||
testCases := []struct {
|
||||
errs []error
|
||||
exp string
|
||||
}{
|
||||
{nil, ""},
|
||||
{[]error{errors.New("timeout")}, "errors(1): timeout"},
|
||||
{
|
||||
[]error{errors.New("timeout"), errors.New("deadline")},
|
||||
"errors(2): timeout\ndeadline",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
eg := new(ErrGroup)
|
||||
for _, err := range tc.errs {
|
||||
eg.Add(err)
|
||||
}
|
||||
if len(tc.errs) == 0 {
|
||||
if eg.Err() != nil {
|
||||
t.Fatalf("expected to get nil error")
|
||||
}
|
||||
continue
|
||||
}
|
||||
if eg.Err() == nil {
|
||||
t.Fatalf("expected to get non-nil error")
|
||||
}
|
||||
if eg.Error() != tc.exp {
|
||||
t.Fatalf("expected to have: \n%q\ngot:\n%q", tc.exp, eg.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
58
app/vmalert/utils/tls.go
Normal file
58
app/vmalert/utils/tls.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Transport creates http.Transport object based on provided URL.
|
||||
// Returns Transport with TLS configuration if URL contains `https` prefix
|
||||
func Transport(URL, certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*http.Transport, error) {
|
||||
t := http.DefaultTransport.(*http.Transport).Clone()
|
||||
if !strings.HasPrefix(URL, "https") {
|
||||
return t, nil
|
||||
}
|
||||
tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
t.TLSClientConfig = tlsCfg
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// TLSConfig creates tls.Config object from provided arguments
|
||||
func TLSConfig(certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*tls.Config, error) {
|
||||
var certs []tls.Certificate
|
||||
if certFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load TLS certificate from `cert_file`=%q, `key_file`=%q: %w", certFile, keyFile, err)
|
||||
}
|
||||
|
||||
certs = []tls.Certificate{cert}
|
||||
}
|
||||
|
||||
var rootCAs *x509.CertPool
|
||||
if CAFile != "" {
|
||||
pem, err := ioutil.ReadFile(CAFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read `ca_file` %q: %w", CAFile, err)
|
||||
}
|
||||
|
||||
rootCAs = x509.NewCertPool()
|
||||
if !rootCAs.AppendCertsFromPEM(pem) {
|
||||
return nil, fmt.Errorf("cannot parse data from `ca_file` %q", CAFile)
|
||||
}
|
||||
}
|
||||
|
||||
return &tls.Config{
|
||||
Certificates: certs,
|
||||
InsecureSkipVerify: insecureSkipVerify,
|
||||
RootCAs: rootCAs,
|
||||
ServerName: serverName,
|
||||
}, nil
|
||||
}
|
||||
52
app/vmalert/utils/tls_test.go
Normal file
52
app/vmalert/utils/tls_test.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package utils
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestTLSConfig(t *testing.T) {
|
||||
var certFile, keyFile, CAFile, serverName string
|
||||
var insecureSkipVerify bool
|
||||
serverName = "test"
|
||||
insecureSkipVerify = true
|
||||
tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if tlsCfg == nil {
|
||||
t.Errorf("expected tlsConfig to be set, got nil")
|
||||
}
|
||||
if tlsCfg.ServerName != serverName {
|
||||
t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)
|
||||
}
|
||||
if tlsCfg.InsecureSkipVerify != insecureSkipVerify {
|
||||
t.Errorf("unexpected InsecureSkipVerify, want %v, got %v", insecureSkipVerify, tlsCfg.InsecureSkipVerify)
|
||||
}
|
||||
certFile = "/path/to/nonexisting/cert/file"
|
||||
_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err == nil {
|
||||
t.Errorf("expected keypair error, got nil")
|
||||
}
|
||||
certFile = ""
|
||||
CAFile = "/path/to/nonexisting/cert/file"
|
||||
_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err == nil {
|
||||
t.Errorf("expected read error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransport(t *testing.T) {
|
||||
var certFile, keyFile, CAFile, serverName string
|
||||
var insecureSkipVerify bool
|
||||
URL := "http://victoriametrics.com"
|
||||
_, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
URL = "https://victoriametrics.com"
|
||||
tr, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if tr.TLSClientConfig == nil {
|
||||
t.Errorf("expected TLSClientConfig to be set, got nil")
|
||||
}
|
||||
}
|
||||
@@ -7,32 +7,18 @@ import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
)
|
||||
|
||||
// APIAlert represents an notifier.Alert state
|
||||
// for WEB view
|
||||
type APIAlert struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
State string `json:"state"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
ActiveAt time.Time `json:"activeAt"`
|
||||
}
|
||||
|
||||
type requestHandler struct {
|
||||
m *manager
|
||||
}
|
||||
|
||||
var pathList = [][]string{
|
||||
{"/api/v1/groups", "list all loaded groups and rules"},
|
||||
{"/api/v1/alerts", "list all active alerts"},
|
||||
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
|
||||
// /metrics is served by httpserver by default
|
||||
@@ -49,8 +35,11 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
|
||||
}
|
||||
return true
|
||||
case "/api/v1/groups":
|
||||
resph.handle(rh.listGroups())
|
||||
return true
|
||||
case "/api/v1/alerts":
|
||||
resph.handle(rh.list())
|
||||
resph.handle(rh.listAlerts())
|
||||
return true
|
||||
case "/-/reload":
|
||||
logger.Infof("api config reload was called, sending sighup")
|
||||
@@ -67,6 +56,37 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
}
|
||||
|
||||
type listGroupsResponse struct {
|
||||
Data struct {
|
||||
Groups []APIGroup `json:"groups"`
|
||||
} `json:"data"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) listGroups() ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listGroupsResponse{Status: "success"}
|
||||
for _, g := range rh.m.groups {
|
||||
lr.Data.Groups = append(lr.Data.Groups, g.toAPI())
|
||||
}
|
||||
|
||||
// sort list of alerts for deterministic output
|
||||
sort.Slice(lr.Data.Groups, func(i, j int) bool {
|
||||
return lr.Data.Groups[i].Name < lr.Data.Groups[j].Name
|
||||
})
|
||||
|
||||
b, err := json.Marshal(lr)
|
||||
if err != nil {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
|
||||
StatusCode: http.StatusInternalServerError,
|
||||
}
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
type listAlertsResponse struct {
|
||||
Data struct {
|
||||
Alerts []*APIAlert `json:"alerts"`
|
||||
@@ -74,13 +94,18 @@ type listAlertsResponse struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) list() ([]byte, error) {
|
||||
func (rh *requestHandler) listAlerts() ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listAlertsResponse{Status: "success"}
|
||||
for _, g := range rh.m.groups {
|
||||
for _, r := range g.Rules {
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, r.AlertsAPI()...)
|
||||
a, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsAPI()...)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,7 +117,7 @@ func (rh *requestHandler) list() ([]byte, error) {
|
||||
b, err := json.Marshal(lr)
|
||||
if err != nil {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %s`, err),
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
|
||||
StatusCode: http.StatusInternalServerError,
|
||||
}
|
||||
}
|
||||
@@ -113,11 +138,11 @@ func (rh *requestHandler) alert(path string) ([]byte, error) {
|
||||
|
||||
groupID, err := uint64FromPath(parts[0])
|
||||
if err != nil {
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %s`, err))
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %w`, err))
|
||||
}
|
||||
alertID, err := uint64FromPath(parts[1])
|
||||
if err != nil {
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %s`, err))
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %w`, err))
|
||||
}
|
||||
resp, err := rh.m.AlertAPI(groupID, alertID)
|
||||
if err != nil {
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
)
|
||||
|
||||
func TestHandler(t *testing.T) {
|
||||
rule := &Rule{
|
||||
ar := &AlertingRule{
|
||||
Name: "alert",
|
||||
alerts: map[uint64]*notifier.Alert{
|
||||
0: {},
|
||||
@@ -19,7 +19,7 @@ func TestHandler(t *testing.T) {
|
||||
}
|
||||
g := &Group{
|
||||
Name: "group",
|
||||
Rules: []*Rule{rule},
|
||||
Rules: []Rule{ar},
|
||||
}
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m.groups[0] = g
|
||||
@@ -54,10 +54,17 @@ func TestHandler(t *testing.T) {
|
||||
t.Errorf("expected 1 alert got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/groups", func(t *testing.T) {
|
||||
lr := listGroupsResponse{}
|
||||
getResp(ts.URL+"/api/v1/groups", &lr, 200)
|
||||
if length := len(lr.Data.Groups); length != 1 {
|
||||
t.Errorf("expected 1 group got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/0/0/status", func(t *testing.T) {
|
||||
alert := &APIAlert{}
|
||||
getResp(ts.URL+"/api/v1/0/0/status", alert, 200)
|
||||
expAlert := rule.newAlertAPI(*rule.alerts[0])
|
||||
expAlert := ar.newAlertAPI(*ar.alerts[0])
|
||||
if !reflect.DeepEqual(alert, expAlert) {
|
||||
t.Errorf("expected %v is equal to %v", alert, expAlert)
|
||||
}
|
||||
|
||||
54
app/vmalert/web_types.go
Normal file
54
app/vmalert/web_types.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// APIAlert represents an notifier.AlertingRule state
|
||||
// for WEB view
|
||||
type APIAlert struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
State string `json:"state"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
ActiveAt time.Time `json:"activeAt"`
|
||||
}
|
||||
|
||||
// APIGroup represents Group for WEB view
|
||||
type APIGroup struct {
|
||||
Name string `json:"name"`
|
||||
ID string `json:"id"`
|
||||
File string `json:"file"`
|
||||
Interval string `json:"interval"`
|
||||
Concurrency int `json:"concurrency"`
|
||||
AlertingRules []APIAlertingRule `json:"alerting_rules"`
|
||||
RecordingRules []APIRecordingRule `json:"recording_rules"`
|
||||
}
|
||||
|
||||
// APIAlertingRule represents AlertingRule for WEB view
|
||||
type APIAlertingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
For string `json:"for"`
|
||||
LastError string `json:"last_error"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
}
|
||||
|
||||
// APIRecordingRule represents RecordingRule for WEB view
|
||||
type APIRecordingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
LastError string `json:"last_error"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
}
|
||||
@@ -23,7 +23,7 @@ Docker images for `vmauth` are available [here](https://hub.docker.com/r/victori
|
||||
|
||||
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
|
||||
|
||||
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, accounting, limits, etc.
|
||||
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML, accounting, limits, etc.
|
||||
|
||||
|
||||
### Auth config
|
||||
@@ -110,11 +110,11 @@ Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker i
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmauth
|
||||
ROOT_IMAGE=scratch make package-vmauth
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -36,11 +36,11 @@ type UserInfo struct {
|
||||
|
||||
func initAuthConfig() {
|
||||
if len(*authConfigPath) == 0 {
|
||||
logger.Panicf("FATAL: missing required `-auth.config` command-line flag")
|
||||
logger.Fatalf("missing required `-auth.config` command-line flag")
|
||||
}
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
|
||||
logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
|
||||
}
|
||||
authConfig.Store(m)
|
||||
stopCh = make(chan struct{})
|
||||
@@ -63,12 +63,14 @@ func authConfigReloader() {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-sighupCh:
|
||||
logger.Infof("SIGHUP received; loading -auth.config=%q", *authConfigPath)
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Errorf("failed to load auth config; using the last successfully loaded config; error: %s", err)
|
||||
logger.Errorf("failed to load -auth.config=%q; using the last successfully loaded config; error: %s", *authConfigPath, err)
|
||||
continue
|
||||
}
|
||||
authConfig.Store(m)
|
||||
logger.Infof("Successfully reloaded -auth.config=%q", *authConfigPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -80,11 +82,11 @@ var stopCh chan struct{}
|
||||
func readAuthConfig(path string) (map[string]*UserInfo, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read %q: %s", path, err)
|
||||
return nil, fmt.Errorf("cannot read %q: %w", path, err)
|
||||
}
|
||||
m, err := parseAuthConfig(data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse %q: %s", path, err)
|
||||
return nil, fmt.Errorf("cannot parse %q: %w", path, err)
|
||||
}
|
||||
logger.Infof("Loaded information about %d users from %q", len(m), path)
|
||||
return m, nil
|
||||
@@ -93,7 +95,7 @@ func readAuthConfig(path string) (map[string]*UserInfo, error) {
|
||||
func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
|
||||
var ac AuthConfig
|
||||
if err := yaml.UnmarshalStrict(data, &ac); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %s", err)
|
||||
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %w", err)
|
||||
}
|
||||
uis := ac.Users
|
||||
if len(uis) == 0 {
|
||||
@@ -113,7 +115,7 @@ func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
|
||||
// Validate urlPrefix
|
||||
target, err := url.Parse(urlPrefix)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid `url_prefix: %q`: %s", urlPrefix, err)
|
||||
return nil, fmt.Errorf("invalid `url_prefix: %q`: %w", urlPrefix, err)
|
||||
}
|
||||
if target.Scheme != "http" && target.Scheme != "https" {
|
||||
return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, target.Scheme)
|
||||
|
||||
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"net/url"
|
||||
@@ -22,6 +23,7 @@ var (
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
@@ -77,6 +79,26 @@ var reverseProxy = &httputil.ReverseProxy{
|
||||
}
|
||||
r.URL = target
|
||||
},
|
||||
Transport: func() *http.Transport {
|
||||
tr := http.DefaultTransport.(*http.Transport).Clone()
|
||||
// Automatic compression must be disabled in order to fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
|
||||
tr.DisableCompression = true
|
||||
// Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this).
|
||||
tr.ForceAttemptHTTP2 = false
|
||||
return tr
|
||||
}(),
|
||||
FlushInterval: time.Second,
|
||||
ErrorLog: logger.StdErrorLogger(),
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
@@ -89,6 +89,8 @@ or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when
|
||||
|
||||
Do not forget removing old snapshots and backups when they are no longer needed for saving storage costs.
|
||||
|
||||
See also [vmbackuper tool](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for automating smart backups.
|
||||
|
||||
|
||||
### How does it work?
|
||||
|
||||
@@ -121,6 +123,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
|
||||
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
|
||||
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
|
||||
* Backups created from [single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md) cannot be restored
|
||||
at [cluster VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md) and vice versa.
|
||||
|
||||
|
||||
### Advanced usage
|
||||
@@ -197,9 +201,9 @@ Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` dock
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmbackup
|
||||
ROOT_IMAGE=scratch make package-vmbackup
|
||||
```
|
||||
|
||||
@@ -110,12 +110,12 @@ func newSrcFS() (*fslocal.FS, error) {
|
||||
// Verify the snapshot exists.
|
||||
f, err := os.Open(snapshotPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open snapshot at %q: %s", snapshotPath, err)
|
||||
return nil, fmt.Errorf("cannot open snapshot at %q: %w", snapshotPath, err)
|
||||
}
|
||||
fi, err := f.Stat()
|
||||
_ = f.Close()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot stat %q: %s", snapshotPath, err)
|
||||
return nil, fmt.Errorf("cannot stat %q: %w", snapshotPath, err)
|
||||
}
|
||||
if !fi.IsDir() {
|
||||
return nil, fmt.Errorf("snapshot %q must be a directory", snapshotPath)
|
||||
@@ -126,7 +126,7 @@ func newSrcFS() (*fslocal.FS, error) {
|
||||
MaxBytesPerSecond: *maxBytesPerSecond,
|
||||
}
|
||||
if err := fs.Init(); err != nil {
|
||||
return nil, fmt.Errorf("cannot initialize fs: %s", err)
|
||||
return nil, fmt.Errorf("cannot initialize fs: %w", err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -134,7 +134,7 @@ func newSrcFS() (*fslocal.FS, error) {
|
||||
func newDstFS() (common.RemoteFS, error) {
|
||||
fs, err := actions.NewRemoteFS(*dst)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-dst`=%q: %s", *dst, err)
|
||||
return nil, fmt.Errorf("cannot parse `-dst`=%q: %w", *dst, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -145,7 +145,7 @@ func newOriginFS() (common.RemoteFS, error) {
|
||||
}
|
||||
fs, err := actions.NewRemoteFS(*origin)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-origin`=%q: %s", *origin, err)
|
||||
return nil, fmt.Errorf("cannot parse `-origin`=%q: %w", *origin, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
|
||||
@@ -122,7 +122,7 @@ func (ctx *InsertCtx) AddLabel(name, value string) {
|
||||
func (ctx *InsertCtx) FlushBufs() error {
|
||||
if err := vmstorage.AddRows(ctx.mrs); err != nil {
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf("cannot store metrics: %s", err),
|
||||
Err: fmt.Errorf("cannot store metrics: %w", err),
|
||||
StatusCode: http.StatusServiceUnavailable,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/csvimport"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite"
|
||||
@@ -160,4 +161,14 @@ var (
|
||||
promscrapeTargetsRequests = metrics.NewCounter(`vm_http_requests_total{path="/targets"}`)
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vm_http_requests_total{path="/-/reload"}`)
|
||||
|
||||
_ = metrics.NewGauge(`vm_metrics_with_dropped_labels_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.MetricsWithDroppedLabels))
|
||||
})
|
||||
_ = metrics.NewGauge(`vm_too_long_label_names_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.TooLongLabelNames))
|
||||
})
|
||||
_ = metrics.NewGauge(`vm_too_long_label_values_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.TooLongLabelValues))
|
||||
})
|
||||
)
|
||||
|
||||
@@ -98,9 +98,9 @@ Run `make package-vmrestore`. It builds `victoriametrics/vmrestore:<PKG_TAG>` do
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmrestore`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmrestore
|
||||
ROOT_IMAGE=scratch make package-vmrestore
|
||||
```
|
||||
|
||||
@@ -71,7 +71,7 @@ func newDstFS() (*fslocal.FS, error) {
|
||||
MaxBytesPerSecond: *maxBytesPerSecond,
|
||||
}
|
||||
if err := fs.Init(); err != nil {
|
||||
return nil, fmt.Errorf("cannot initialize local fs: %s", err)
|
||||
return nil, fmt.Errorf("cannot initialize local fs: %w", err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -79,7 +79,7 @@ func newDstFS() (*fslocal.FS, error) {
|
||||
func newSrcFS() (common.RemoteFS, error) {
|
||||
fs, err := actions.NewRemoteFS(*src)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-src`=%q: %s", *src, err)
|
||||
return nil, fmt.Errorf("cannot parse `-src`=%q: %w", *src, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package vmselect
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
@@ -240,7 +241,8 @@ func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) {
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
statusCode := http.StatusUnprocessableEntity
|
||||
if esc, ok := err.(*httpserver.ErrorWithStatusCode); ok {
|
||||
var esc *httpserver.ErrorWithStatusCode
|
||||
if errors.As(err, &esc) {
|
||||
statusCode = esc.StatusCode
|
||||
}
|
||||
w.WriteHeader(statusCode)
|
||||
|
||||
@@ -7,13 +7,12 @@ import (
|
||||
"runtime"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
@@ -72,6 +71,50 @@ func (rss *Results) mustClose() {
|
||||
rss.sr = nil
|
||||
}
|
||||
|
||||
var timeseriesWorkCh = make(chan *timeseriesWork, gomaxprocs)
|
||||
|
||||
type timeseriesWork struct {
|
||||
rss *Results
|
||||
pts *packedTimeseries
|
||||
f func(rs *Result, workerID uint)
|
||||
doneCh chan error
|
||||
|
||||
rowsProcessed int
|
||||
}
|
||||
|
||||
func init() {
|
||||
for i := 0; i < gomaxprocs; i++ {
|
||||
go timeseriesWorker(uint(i))
|
||||
}
|
||||
}
|
||||
|
||||
func timeseriesWorker(workerID uint) {
|
||||
var rs Result
|
||||
var rsLastResetTime uint64
|
||||
for tsw := range timeseriesWorkCh {
|
||||
rss := tsw.rss
|
||||
if time.Until(rss.deadline.Deadline) < 0 {
|
||||
tsw.doneCh <- fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
|
||||
continue
|
||||
}
|
||||
if err := tsw.pts.Unpack(&rs, rss.tr, rss.fetchData); err != nil {
|
||||
tsw.doneCh <- fmt.Errorf("error during time series unpacking: %w", err)
|
||||
continue
|
||||
}
|
||||
if len(rs.Timestamps) > 0 || !rss.fetchData {
|
||||
tsw.f(&rs, workerID)
|
||||
}
|
||||
tsw.rowsProcessed = len(rs.Values)
|
||||
tsw.doneCh <- nil
|
||||
currentTime := fasttime.UnixTimestamp()
|
||||
if cap(rs.Values) > 1024*1024 && 4*len(rs.Values) < cap(rs.Values) && currentTime-rsLastResetTime > 10 {
|
||||
// Reset rs in order to preseve memory usage after processing big time series with millions of rows.
|
||||
rs = Result{}
|
||||
rsLastResetTime = currentTime
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RunParallel runs in parallel f for all the results from rss.
|
||||
//
|
||||
// f shouldn't hold references to rs after returning.
|
||||
@@ -81,72 +124,36 @@ func (rss *Results) mustClose() {
|
||||
func (rss *Results) RunParallel(f func(rs *Result, workerID uint)) error {
|
||||
defer rss.mustClose()
|
||||
|
||||
workersCount := 1 + len(rss.packedTimeseries)/32
|
||||
if workersCount > gomaxprocs {
|
||||
workersCount = gomaxprocs
|
||||
}
|
||||
if workersCount == 0 {
|
||||
logger.Panicf("BUG: workersCount cannot be zero")
|
||||
}
|
||||
workCh := make(chan *packedTimeseries, workersCount)
|
||||
doneCh := make(chan error)
|
||||
|
||||
// Start workers.
|
||||
rowsProcessedTotal := uint64(0)
|
||||
for i := 0; i < workersCount; i++ {
|
||||
go func(workerID uint) {
|
||||
rs := getResult()
|
||||
defer putResult(rs)
|
||||
maxWorkersCount := gomaxprocs / workersCount
|
||||
|
||||
var err error
|
||||
rowsProcessed := 0
|
||||
for pts := range workCh {
|
||||
if time.Until(rss.deadline.Deadline) < 0 {
|
||||
err = fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
|
||||
break
|
||||
}
|
||||
if err = pts.Unpack(rs, rss.tr, rss.fetchData, maxWorkersCount); err != nil {
|
||||
break
|
||||
}
|
||||
if len(rs.Timestamps) == 0 && rss.fetchData {
|
||||
// Skip empty blocks.
|
||||
continue
|
||||
}
|
||||
rowsProcessed += len(rs.Values)
|
||||
f(rs, workerID)
|
||||
}
|
||||
atomic.AddUint64(&rowsProcessedTotal, uint64(rowsProcessed))
|
||||
// Drain the remaining work
|
||||
for range workCh {
|
||||
}
|
||||
doneCh <- err
|
||||
}(uint(i))
|
||||
}
|
||||
|
||||
// Feed workers with work.
|
||||
tsws := make([]*timeseriesWork, len(rss.packedTimeseries))
|
||||
for i := range rss.packedTimeseries {
|
||||
workCh <- &rss.packedTimeseries[i]
|
||||
tsw := ×eriesWork{
|
||||
rss: rss,
|
||||
pts: &rss.packedTimeseries[i],
|
||||
f: f,
|
||||
doneCh: make(chan error, 1),
|
||||
}
|
||||
timeseriesWorkCh <- tsw
|
||||
tsws[i] = tsw
|
||||
}
|
||||
seriesProcessedTotal := len(rss.packedTimeseries)
|
||||
rss.packedTimeseries = rss.packedTimeseries[:0]
|
||||
close(workCh)
|
||||
|
||||
// Wait until workers finish.
|
||||
var errors []error
|
||||
for i := 0; i < workersCount; i++ {
|
||||
if err := <-doneCh; err != nil {
|
||||
errors = append(errors, err)
|
||||
// Wait until work is complete.
|
||||
var firstErr error
|
||||
rowsProcessedTotal := 0
|
||||
for _, tsw := range tsws {
|
||||
if err := <-tsw.doneCh; err != nil && firstErr == nil {
|
||||
// Return just the first error, since other errors
|
||||
// are likely duplicate the first error.
|
||||
firstErr = err
|
||||
}
|
||||
rowsProcessedTotal += tsw.rowsProcessed
|
||||
}
|
||||
|
||||
perQueryRowsProcessed.Update(float64(rowsProcessedTotal))
|
||||
perQuerySeriesProcessed.Update(float64(seriesProcessedTotal))
|
||||
if len(errors) > 0 {
|
||||
// Return just the first error, since other errors
|
||||
// is likely duplicate the first error.
|
||||
return errors[0]
|
||||
}
|
||||
return nil
|
||||
return firstErr
|
||||
}
|
||||
|
||||
var perQueryRowsProcessed = metrics.NewHistogram(`vm_per_query_rows_processed_count`)
|
||||
@@ -159,70 +166,74 @@ type packedTimeseries struct {
|
||||
brs []storage.BlockRef
|
||||
}
|
||||
|
||||
var unpackWorkCh = make(chan *unpackWork, gomaxprocs)
|
||||
|
||||
type unpackWork struct {
|
||||
br storage.BlockRef
|
||||
tr storage.TimeRange
|
||||
fetchData bool
|
||||
doneCh chan error
|
||||
sb *sortBlock
|
||||
}
|
||||
|
||||
func init() {
|
||||
for i := 0; i < gomaxprocs; i++ {
|
||||
go unpackWorker()
|
||||
}
|
||||
}
|
||||
|
||||
func unpackWorker() {
|
||||
for upw := range unpackWorkCh {
|
||||
sb := getSortBlock()
|
||||
if err := sb.unpackFrom(upw.br, upw.tr, upw.fetchData); err != nil {
|
||||
putSortBlock(sb)
|
||||
upw.doneCh <- fmt.Errorf("cannot unpack block: %w", err)
|
||||
continue
|
||||
}
|
||||
upw.sb = sb
|
||||
upw.doneCh <- nil
|
||||
}
|
||||
}
|
||||
|
||||
// Unpack unpacks pts to dst.
|
||||
func (pts *packedTimeseries) Unpack(dst *Result, tr storage.TimeRange, fetchData bool, maxWorkersCount int) error {
|
||||
func (pts *packedTimeseries) Unpack(dst *Result, tr storage.TimeRange, fetchData bool) error {
|
||||
dst.reset()
|
||||
|
||||
if err := dst.MetricName.Unmarshal(bytesutil.ToUnsafeBytes(pts.metricName)); err != nil {
|
||||
return fmt.Errorf("cannot unmarshal metricName %q: %s", pts.metricName, err)
|
||||
}
|
||||
|
||||
workersCount := 1 + len(pts.brs)/32
|
||||
if workersCount > maxWorkersCount {
|
||||
workersCount = maxWorkersCount
|
||||
}
|
||||
if workersCount == 0 {
|
||||
logger.Panicf("BUG: workersCount cannot be zero")
|
||||
}
|
||||
|
||||
sbs := make([]*sortBlock, 0, len(pts.brs))
|
||||
var sbsLock sync.Mutex
|
||||
|
||||
workCh := make(chan storage.BlockRef, workersCount)
|
||||
doneCh := make(chan error)
|
||||
|
||||
// Start workers
|
||||
for i := 0; i < workersCount; i++ {
|
||||
go func() {
|
||||
var err error
|
||||
for br := range workCh {
|
||||
sb := getSortBlock()
|
||||
if err = sb.unpackFrom(br, tr, fetchData); err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
sbsLock.Lock()
|
||||
sbs = append(sbs, sb)
|
||||
sbsLock.Unlock()
|
||||
}
|
||||
|
||||
// Drain the remaining work
|
||||
for range workCh {
|
||||
}
|
||||
doneCh <- err
|
||||
}()
|
||||
return fmt.Errorf("cannot unmarshal metricName %q: %w", pts.metricName, err)
|
||||
}
|
||||
|
||||
// Feed workers with work
|
||||
for _, br := range pts.brs {
|
||||
workCh <- br
|
||||
upws := make([]*unpackWork, len(pts.brs))
|
||||
for i, br := range pts.brs {
|
||||
upw := &unpackWork{
|
||||
br: br,
|
||||
tr: tr,
|
||||
fetchData: fetchData,
|
||||
doneCh: make(chan error, 1),
|
||||
}
|
||||
unpackWorkCh <- upw
|
||||
upws[i] = upw
|
||||
}
|
||||
pts.brs = pts.brs[:0]
|
||||
close(workCh)
|
||||
|
||||
// Wait until workers finish
|
||||
var errors []error
|
||||
for i := 0; i < workersCount; i++ {
|
||||
if err := <-doneCh; err != nil {
|
||||
errors = append(errors, err)
|
||||
// Wait until work is complete
|
||||
sbs := make([]*sortBlock, 0, len(pts.brs))
|
||||
var firstErr error
|
||||
for _, upw := range upws {
|
||||
if err := <-upw.doneCh; err != nil && firstErr == nil {
|
||||
// Return the first error only, since other errors are likely the same.
|
||||
firstErr = err
|
||||
}
|
||||
if firstErr == nil {
|
||||
sbs = append(sbs, upw.sb)
|
||||
} else {
|
||||
putSortBlock(upw.sb)
|
||||
}
|
||||
}
|
||||
if len(errors) > 0 {
|
||||
// Return the first error only, since other errors are likely the same.
|
||||
return errors[0]
|
||||
if firstErr != nil {
|
||||
return firstErr
|
||||
}
|
||||
|
||||
// Merge blocks
|
||||
mergeSortBlocks(dst, sbs)
|
||||
return nil
|
||||
}
|
||||
@@ -318,7 +329,7 @@ func (sb *sortBlock) unpackFrom(br storage.BlockRef, tr storage.TimeRange, fetch
|
||||
br.MustReadBlock(&sb.b, fetchData)
|
||||
if fetchData {
|
||||
if err := sb.b.UnmarshalData(); err != nil {
|
||||
return fmt.Errorf("cannot unmarshal block: %s", err)
|
||||
return fmt.Errorf("cannot unmarshal block: %w", err)
|
||||
}
|
||||
}
|
||||
timestamps := sb.b.Timestamps()
|
||||
@@ -387,7 +398,7 @@ func DeleteSeries(sq *storage.SearchQuery) (int, error) {
|
||||
func GetLabels(deadline Deadline) ([]string, error) {
|
||||
labels, err := vmstorage.SearchTagKeys(*maxTagKeysPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during labels search: %s", err)
|
||||
return nil, fmt.Errorf("error during labels search: %w", err)
|
||||
}
|
||||
|
||||
// Substitute "" with "__name__"
|
||||
@@ -413,7 +424,7 @@ func GetLabelValues(labelName string, deadline Deadline) ([]string, error) {
|
||||
// Search for tag values
|
||||
labelValues, err := vmstorage.SearchTagValues([]byte(labelName), *maxTagValuesPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during label values search for labelName=%q: %s", labelName, err)
|
||||
return nil, fmt.Errorf("error during label values search for labelName=%q: %w", labelName, err)
|
||||
}
|
||||
|
||||
// Sort labelValues like Prometheus does
|
||||
@@ -426,7 +437,7 @@ func GetLabelValues(labelName string, deadline Deadline) ([]string, error) {
|
||||
func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
|
||||
labelEntries, err := vmstorage.SearchTagEntries(*maxTagKeysPerSearch, *maxTagValuesPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during label entries request: %s", err)
|
||||
return nil, fmt.Errorf("error during label entries request: %w", err)
|
||||
}
|
||||
|
||||
// Substitute "" with "__name__"
|
||||
@@ -453,7 +464,7 @@ func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
|
||||
func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TSDBStatus, error) {
|
||||
status, err := vmstorage.GetTSDBStatusForDate(date, topN)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during tsdb status request: %s", err)
|
||||
return nil, fmt.Errorf("error during tsdb status request: %w", err)
|
||||
}
|
||||
return status, nil
|
||||
}
|
||||
@@ -462,7 +473,7 @@ func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TS
|
||||
func GetSeriesCount(deadline Deadline) (uint64, error) {
|
||||
n, err := vmstorage.GetSeriesCount()
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("error during series count request: %s", err)
|
||||
return 0, fmt.Errorf("error during series count request: %w", err)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
@@ -495,6 +506,9 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
|
||||
MinTimestamp: sq.MinTimestamp,
|
||||
MaxTimestamp: sq.MaxTimestamp,
|
||||
}
|
||||
if err := vmstorage.CheckTimeRange(tr); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
vmstorage.WG.Add(1)
|
||||
defer vmstorage.WG.Done()
|
||||
@@ -518,7 +532,7 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
|
||||
m[string(metricName)] = append(brs, *sr.MetricBlockRef.BlockRef)
|
||||
}
|
||||
if err := sr.Error(); err != nil {
|
||||
return nil, fmt.Errorf("search error after reading %d data blocks: %s", blocksRead, err)
|
||||
return nil, fmt.Errorf("search error after reading %d data blocks: %w", blocksRead, err)
|
||||
}
|
||||
|
||||
var rss Results
|
||||
@@ -537,25 +551,6 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
|
||||
return &rss, nil
|
||||
}
|
||||
|
||||
func getResult() *Result {
|
||||
v := rsPool.Get()
|
||||
if v == nil {
|
||||
return &Result{}
|
||||
}
|
||||
return v.(*Result)
|
||||
}
|
||||
|
||||
func putResult(rs *Result) {
|
||||
if len(rs.Values) > 8192 {
|
||||
// Do not pool big results, since they may occupy too much memory.
|
||||
return
|
||||
}
|
||||
rs.reset()
|
||||
rsPool.Put(rs)
|
||||
}
|
||||
|
||||
var rsPool sync.Pool
|
||||
|
||||
func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error) {
|
||||
tfss := make([]*storage.TagFilters, 0, len(tagFilterss))
|
||||
for _, tagFilters := range tagFilterss {
|
||||
@@ -563,7 +558,7 @@ func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error)
|
||||
for i := range tagFilters {
|
||||
tf := &tagFilters[i]
|
||||
if err := tfs.Add(tf.Key, tf.Value, tf.IsNegative, tf.IsRegexp); err != nil {
|
||||
return nil, fmt.Errorf("cannot parse tag filter %s: %s", tf, err)
|
||||
return nil, fmt.Errorf("cannot parse tag filter %s: %w", tf, err)
|
||||
}
|
||||
}
|
||||
tfss = append(tfss, tfs)
|
||||
|
||||
@@ -46,7 +46,7 @@ const defaultStep = 5 * 60 * 1000
|
||||
func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
|
||||
ct := currentTime()
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse request form values: %s", err)
|
||||
return fmt.Errorf("cannot parse request form values: %w", err)
|
||||
}
|
||||
matches := r.Form["match[]"]
|
||||
if len(matches) == 0 {
|
||||
@@ -82,7 +82,7 @@ func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, true, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
resultsCh := make(chan *quicktemplate.ByteBuffer)
|
||||
@@ -105,7 +105,7 @@ func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request
|
||||
|
||||
err = <-doneCh
|
||||
if err != nil {
|
||||
return fmt.Errorf("error during data fetching: %s", err)
|
||||
return fmt.Errorf("error during data fetching: %w", err)
|
||||
}
|
||||
federateDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -117,7 +117,7 @@ var federateDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/fe
|
||||
func ExportHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
|
||||
ct := currentTime()
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse request form values: %s", err)
|
||||
return fmt.Errorf("cannot parse request form values: %w", err)
|
||||
}
|
||||
matches := r.Form["match[]"]
|
||||
if len(matches) == 0 {
|
||||
@@ -143,7 +143,7 @@ func ExportHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
end = start + defaultStep
|
||||
}
|
||||
if err := exportHandler(w, matches, start, end, format, maxRowsPerLine, deadline); err != nil {
|
||||
return fmt.Errorf("error when exporting data for queries=%q on the time range (start=%d, end=%d): %s", matches, start, end, err)
|
||||
return fmt.Errorf("error when exporting data for queries=%q on the time range (start=%d, end=%d): %w", matches, start, end, err)
|
||||
}
|
||||
exportDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -202,7 +202,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, true, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
resultsCh := make(chan *quicktemplate.ByteBuffer, runtime.GOMAXPROCS(-1))
|
||||
@@ -227,7 +227,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
|
||||
}
|
||||
err = <-doneCh
|
||||
if err != nil {
|
||||
return fmt.Errorf("error during data fetching: %s", err)
|
||||
return fmt.Errorf("error during data fetching: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -237,7 +237,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
|
||||
// See https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series
|
||||
func DeleteHandler(startTime time.Time, r *http.Request) error {
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse request form values: %s", err)
|
||||
return fmt.Errorf("cannot parse request form values: %w", err)
|
||||
}
|
||||
if r.FormValue("start") != "" || r.FormValue("end") != "" {
|
||||
return fmt.Errorf("start and end aren't supported. Remove these args from the query in order to delete all the matching metrics")
|
||||
@@ -255,7 +255,7 @@ func DeleteHandler(startTime time.Time, r *http.Request) error {
|
||||
}
|
||||
deletedCount, err := netstorage.DeleteSeries(sq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot delete time series matching %q: %s", matches, err)
|
||||
return fmt.Errorf("cannot delete time series matching %q: %w", matches, err)
|
||||
}
|
||||
if deletedCount > 0 {
|
||||
promql.ResetRollupResultCache()
|
||||
@@ -273,14 +273,14 @@ func LabelValuesHandler(startTime time.Time, labelName string, w http.ResponseWr
|
||||
deadline := getDeadlineForQuery(r)
|
||||
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
return fmt.Errorf("cannot parse form values: %w", err)
|
||||
}
|
||||
var labelValues []string
|
||||
if len(r.Form["match[]"]) == 0 && len(r.Form["start"]) == 0 && len(r.Form["end"]) == 0 {
|
||||
var err error
|
||||
labelValues, err = netstorage.GetLabelValues(labelName, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf(`cannot obtain label values for %q: %s`, labelName, err)
|
||||
return fmt.Errorf(`cannot obtain label values for %q: %w`, labelName, err)
|
||||
}
|
||||
} else {
|
||||
// Extended functionality that allows filtering by label filters and time range
|
||||
@@ -302,7 +302,7 @@ func LabelValuesHandler(startTime time.Time, labelName string, w http.ResponseWr
|
||||
}
|
||||
labelValues, err = labelValuesWithMatches(labelName, matches, start, end, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot obtain label values for %q, match[]=%q, start=%d, end=%d: %s", labelName, matches, start, end, err)
|
||||
return fmt.Errorf("cannot obtain label values for %q, match[]=%q, start=%d, end=%d: %w", labelName, matches, start, end, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -343,7 +343,7 @@ func labelValuesWithMatches(labelName string, matches []string, start, end int64
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return nil, fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
m := make(map[string]struct{})
|
||||
@@ -358,7 +358,7 @@ func labelValuesWithMatches(labelName string, matches []string, start, end int64
|
||||
mLock.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error when data fetching: %s", err)
|
||||
return nil, fmt.Errorf("error when data fetching: %w", err)
|
||||
}
|
||||
|
||||
labelValues := make([]string, 0, len(m))
|
||||
@@ -376,7 +376,7 @@ func LabelsCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ
|
||||
deadline := getDeadlineForQuery(r)
|
||||
labelEntries, err := netstorage.GetLabelEntries(deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf(`cannot obtain label entries: %s`, err)
|
||||
return fmt.Errorf(`cannot obtain label entries: %w`, err)
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
WriteLabelsCountResponse(w, labelEntries)
|
||||
@@ -394,14 +394,14 @@ const secsPerDay = 3600 * 24
|
||||
func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
|
||||
deadline := getDeadlineForQuery(r)
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
return fmt.Errorf("cannot parse form values: %w", err)
|
||||
}
|
||||
date := fasttime.UnixDate()
|
||||
dateStr := r.FormValue("date")
|
||||
if len(dateStr) > 0 {
|
||||
t, err := time.Parse("2006-01-02", dateStr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse `date` arg %q: %s", dateStr, err)
|
||||
return fmt.Errorf("cannot parse `date` arg %q: %w", dateStr, err)
|
||||
}
|
||||
date = uint64(t.Unix()) / secsPerDay
|
||||
}
|
||||
@@ -410,7 +410,7 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
|
||||
if len(topNStr) > 0 {
|
||||
n, err := strconv.Atoi(topNStr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse `topN` arg %q: %s", topNStr, err)
|
||||
return fmt.Errorf("cannot parse `topN` arg %q: %w", topNStr, err)
|
||||
}
|
||||
if n <= 0 {
|
||||
n = 1
|
||||
@@ -422,7 +422,7 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
|
||||
}
|
||||
status, err := netstorage.GetTSDBStatusForDate(deadline, date, topN)
|
||||
if err != nil {
|
||||
return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %s`, date, topN, err)
|
||||
return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %w`, date, topN, err)
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
WriteTSDBStatusResponse(w, status)
|
||||
@@ -439,14 +439,14 @@ func LabelsHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
deadline := getDeadlineForQuery(r)
|
||||
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
return fmt.Errorf("cannot parse form values: %w", err)
|
||||
}
|
||||
var labels []string
|
||||
if len(r.Form["match[]"]) == 0 && len(r.Form["start"]) == 0 && len(r.Form["end"]) == 0 {
|
||||
var err error
|
||||
labels, err = netstorage.GetLabels(deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot obtain labels: %s", err)
|
||||
return fmt.Errorf("cannot obtain labels: %w", err)
|
||||
}
|
||||
} else {
|
||||
// Extended functionality that allows filtering by label filters and time range
|
||||
@@ -466,7 +466,7 @@ func LabelsHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
labels, err = labelsWithMatches(matches, start, end, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot obtain labels for match[]=%q, start=%d, end=%d: %s", matches, start, end, err)
|
||||
return fmt.Errorf("cannot obtain labels for match[]=%q, start=%d, end=%d: %w", matches, start, end, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -494,7 +494,7 @@ func labelsWithMatches(matches []string, start, end int64, deadline netstorage.D
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return nil, fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
m := make(map[string]struct{})
|
||||
@@ -510,7 +510,7 @@ func labelsWithMatches(matches []string, start, end int64, deadline netstorage.D
|
||||
mLock.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error when data fetching: %s", err)
|
||||
return nil, fmt.Errorf("error when data fetching: %w", err)
|
||||
}
|
||||
|
||||
labels := make([]string, 0, len(m))
|
||||
@@ -528,7 +528,7 @@ func SeriesCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ
|
||||
deadline := getDeadlineForQuery(r)
|
||||
n, err := netstorage.GetSeriesCount(deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot obtain series count: %s", err)
|
||||
return fmt.Errorf("cannot obtain series count: %w", err)
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
WriteSeriesCountResponse(w, n)
|
||||
@@ -545,7 +545,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
ct := currentTime()
|
||||
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
return fmt.Errorf("cannot parse form values: %w", err)
|
||||
}
|
||||
matches := r.Form["match[]"]
|
||||
if len(matches) == 0 {
|
||||
@@ -580,7 +580,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
resultsCh := make(chan *quicktemplate.ByteBuffer)
|
||||
@@ -605,7 +605,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
err = <-doneCh
|
||||
if err != nil {
|
||||
return fmt.Errorf("error during data fetching: %s", err)
|
||||
return fmt.Errorf("error during data fetching: %w", err)
|
||||
}
|
||||
seriesDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -652,17 +652,17 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
|
||||
if childQuery, windowStr, offsetStr := promql.IsMetricSelectorWithRollup(query); childQuery != "" {
|
||||
window, err := parsePositiveDuration(windowStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse window: %s", err)
|
||||
return fmt.Errorf("cannot parse window: %w", err)
|
||||
}
|
||||
offset, err := parseDuration(offsetStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse offset: %s", err)
|
||||
return fmt.Errorf("cannot parse offset: %w", err)
|
||||
}
|
||||
start -= offset
|
||||
end := start
|
||||
start = end - window
|
||||
if err := exportHandler(w, []string{childQuery}, start, end, "promapi", 0, deadline); err != nil {
|
||||
return fmt.Errorf("error when exporting data for query=%q on the time range (start=%d, end=%d): %s", childQuery, start, end, err)
|
||||
return fmt.Errorf("error when exporting data for query=%q on the time range (start=%d, end=%d): %w", childQuery, start, end, err)
|
||||
}
|
||||
queryDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -670,24 +670,24 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
|
||||
if childQuery, windowStr, stepStr, offsetStr := promql.IsRollup(query); childQuery != "" {
|
||||
newStep, err := parsePositiveDuration(stepStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse step: %s", err)
|
||||
return fmt.Errorf("cannot parse step: %w", err)
|
||||
}
|
||||
if newStep > 0 {
|
||||
step = newStep
|
||||
}
|
||||
window, err := parsePositiveDuration(windowStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse window: %s", err)
|
||||
return fmt.Errorf("cannot parse window: %w", err)
|
||||
}
|
||||
offset, err := parseDuration(offsetStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse offset: %s", err)
|
||||
return fmt.Errorf("cannot parse offset: %w", err)
|
||||
}
|
||||
start -= offset
|
||||
end := start
|
||||
start = end - window
|
||||
if err := queryRangeHandler(w, childQuery, start, end, step, r, ct); err != nil {
|
||||
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %s", childQuery, start, end, step, err)
|
||||
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %w", childQuery, start, end, step, err)
|
||||
}
|
||||
queryDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -702,7 +702,7 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
|
||||
}
|
||||
result, err := promql.Exec(&ec, query, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error when executing query=%q for (time=%d, step=%d): %s", query, start, step, err)
|
||||
return fmt.Errorf("error when executing query=%q for (time=%d, step=%d): %w", query, start, step, err)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
@@ -750,7 +750,7 @@ func QueryRangeHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
|
||||
return err
|
||||
}
|
||||
if err := queryRangeHandler(w, query, start, end, step, r, ct); err != nil {
|
||||
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %s", query, start, end, step, err)
|
||||
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %w", query, start, end, step, err)
|
||||
}
|
||||
queryRangeDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -788,7 +788,7 @@ func queryRangeHandler(w http.ResponseWriter, query string, start, end, step int
|
||||
}
|
||||
result, err := promql.Exec(&ec, query, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot execute query: %s", err)
|
||||
return fmt.Errorf("cannot execute query: %w", err)
|
||||
}
|
||||
queryOffset := getLatencyOffsetMilliseconds()
|
||||
if ct-end < queryOffset {
|
||||
@@ -897,7 +897,7 @@ func getTime(r *http.Request, argKey string, defaultValue int64) (int64, error)
|
||||
// Try parsing duration relative to the current time
|
||||
d, err1 := time.ParseDuration(argValue)
|
||||
if err1 != nil {
|
||||
return 0, fmt.Errorf("cannot parse %q=%q: %s", argKey, argValue, err)
|
||||
return 0, fmt.Errorf("cannot parse %q=%q: %w", argKey, argValue, err)
|
||||
}
|
||||
if d > 0 {
|
||||
d = -d
|
||||
@@ -939,7 +939,7 @@ func getDuration(r *http.Request, argKey string, defaultValue int64) (int64, err
|
||||
// Try parsing string format
|
||||
d, err := time.ParseDuration(argValue)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("cannot parse %q=%q: %s", argKey, argValue, err)
|
||||
return 0, fmt.Errorf("cannot parse %q=%q: %w", argKey, argValue, err)
|
||||
}
|
||||
secs = d.Seconds()
|
||||
}
|
||||
@@ -1001,7 +1001,7 @@ func getTagFilterssFromMatches(matches []string) ([][]storage.TagFilter, error)
|
||||
for _, match := range matches {
|
||||
tagFilters, err := promql.ParseMetricSelector(match)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse %q: %s", match, err)
|
||||
return nil, fmt.Errorf("cannot parse %q: %w", match, err)
|
||||
}
|
||||
tagFilterss = append(tagFilterss, tagFilters)
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ func testIncrementalParallelAggr(iafc *incrementalAggrFuncContext, tssSrc, tssEx
|
||||
wg.Wait()
|
||||
tssActual := iafc.finalizeTimeseries()
|
||||
if err := expectTimeseriesEqual(tssActual, tssExpected); err != nil {
|
||||
return fmt.Errorf("%s; tssActual=%v, tssExpected=%v", err, tssActual, tssExpected)
|
||||
return fmt.Errorf("%w; tssActual=%v, tssExpected=%v", err, tssActual, tssExpected)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -164,7 +164,7 @@ func expectTsEqual(actual, expected *timeseries) error {
|
||||
return fmt.Errorf("unexpected timestamps; got %v; want %v", actual.Timestamps, expected.Timestamps)
|
||||
}
|
||||
if err := compareValues(actual.Values, expected.Values); err != nil {
|
||||
return fmt.Errorf("%s; actual %v; expected %v", err, actual.Values, expected.Values)
|
||||
return fmt.Errorf("%w; actual %v; expected %v", err, actual.Values, expected.Values)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -206,7 +206,7 @@ func groupJoin(singleTimeseriesSide string, be *metricsql.BinaryOpExpr, rvsLeft,
|
||||
resetMetricGroupIfRequired(be, tsLeft)
|
||||
if len(tssRight) == 1 {
|
||||
// Easy case - right part contains only a single matching time series.
|
||||
tsLeft.MetricName.AddMissingTags(joinTags, &tssRight[0].MetricName)
|
||||
tsLeft.MetricName.SetTags(joinTags, &tssRight[0].MetricName)
|
||||
rvsLeft = append(rvsLeft, tsLeft)
|
||||
rvsRight = append(rvsRight, tssRight[0])
|
||||
continue
|
||||
@@ -225,7 +225,7 @@ func groupJoin(singleTimeseriesSide string, be *metricsql.BinaryOpExpr, rvsLeft,
|
||||
for _, tsRight := range tssRight {
|
||||
var tsCopy timeseries
|
||||
tsCopy.CopyFromShallowTimestamps(tsLeft)
|
||||
tsCopy.MetricName.AddMissingTags(joinTags, &tsRight.MetricName)
|
||||
tsCopy.MetricName.SetTags(joinTags, &tsRight.MetricName)
|
||||
bb.B = marshalMetricTagsSorted(bb.B[:0], &tsCopy.MetricName)
|
||||
if tsExisting := m[string(bb.B)]; tsExisting != nil {
|
||||
// Try merging tsExisting with tsRight if they don't overlap.
|
||||
@@ -310,9 +310,22 @@ func binaryOpOr(bfa *binaryOpFuncArg) ([]*timeseries, error) {
|
||||
for _, tss := range mLeft {
|
||||
rvs = append(rvs, tss...)
|
||||
}
|
||||
for k, tss := range mRight {
|
||||
if mLeft[k] == nil {
|
||||
rvs = append(rvs, tss...)
|
||||
for k, tssRight := range mRight {
|
||||
tssLeft := mLeft[k]
|
||||
if tssLeft == nil {
|
||||
rvs = append(rvs, tssRight...)
|
||||
continue
|
||||
}
|
||||
// Fill gaps in tssLeft with values from tssRight as Prometheus does.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/552
|
||||
valuesRight := tssRight[0].Values
|
||||
for _, tsLeft := range tssLeft {
|
||||
valuesLeft := tsLeft.Values
|
||||
for i, v := range valuesLeft {
|
||||
if math.IsNaN(v) {
|
||||
valuesLeft[i] = valuesRight[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return rvs, nil
|
||||
|
||||
@@ -160,14 +160,14 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, e, re, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, me.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, me.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
if re, ok := e.(*metricsql.RollupExpr); ok {
|
||||
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, e, re, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, re.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, re.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -189,7 +189,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := tf(tfa)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, fe.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, fe.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -203,7 +203,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := evalRollupFunc(ec, fe.Name, rf, e, re, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, fe.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, fe.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -240,7 +240,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := af(afa)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, ae.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, ae.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -264,7 +264,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := bf(bfa)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, be.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, be.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -375,7 +375,7 @@ func evalRollupFuncArgs(ec *EvalConfig, fe *metricsql.FuncExpr) ([]interface{},
|
||||
}
|
||||
ts, err := evalExpr(ec, arg)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("cannot evaluate arg #%d for %q: %s", i+1, fe.AppendString(nil), err)
|
||||
return nil, nil, fmt.Errorf("cannot evaluate arg #%d for %q: %w", i+1, fe.AppendString(nil), err)
|
||||
}
|
||||
args[i] = ts
|
||||
}
|
||||
|
||||
@@ -1480,7 +1480,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`label_replace(mismatch)`, func(t *testing.T) {
|
||||
t.Run(`label_replace(nonexisting_src)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `label_replace(time(), "__name__", "x${1}y", "foo", ".+")`
|
||||
r := netstorage.Result{
|
||||
@@ -1491,6 +1491,21 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`label_replace(mismatch)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `label_replace(label_set(time(), "foo", "foobar"), "__name__", "x${1}y", "foo", "bar(.+)")`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("foobar"),
|
||||
}}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`label_replace(match)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `label_replace(time(), "__name__", "x${1}y", "foo", ".*")`
|
||||
@@ -1849,6 +1864,17 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`scalar or scalar`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `time() > 1400 or 123`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{123, 123, 123, 1600, 1800, 2000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`timseries-with-tags unless 2`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `label_set(time(), "foo", "bar") unless 2`
|
||||
@@ -1988,25 +2014,37 @@ func TestExecSuccess(t *testing.T) {
|
||||
})
|
||||
t.Run(`scalar * ignoring(foo) group_right vector`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sort_desc(2 * ignoring(foo) group_right(a,foo) (label_set(time(), "foo", "bar") or label_set(10, "foo", "qwert")))`
|
||||
q := `sort_desc(label_set(2, "a", "2") * ignoring(foo,a) group_right(a) (label_set(time(), "foo", "bar", "a", "1"), label_set(10, "foo", "qwert")))`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{2000, 2400, 2800, 3200, 3600, 4000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r1.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("bar"),
|
||||
}}
|
||||
r1.MetricName.Tags = []storage.Tag{
|
||||
{
|
||||
Key: []byte("a"),
|
||||
Value: []byte("2"),
|
||||
},
|
||||
{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("bar"),
|
||||
},
|
||||
}
|
||||
r2 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{20, 20, 20, 20, 20, 20},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r2.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("qwert"),
|
||||
}}
|
||||
r2.MetricName.Tags = []storage.Tag{
|
||||
{
|
||||
Key: []byte("a"),
|
||||
Value: []byte("2"),
|
||||
},
|
||||
{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("qwert"),
|
||||
},
|
||||
}
|
||||
resultExpected := []netstorage.Result{r1, r2}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
@@ -2321,9 +2359,9 @@ func TestExecSuccess(t *testing.T) {
|
||||
t.Run(`vector + vector on group_left matching`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sort_desc(
|
||||
(label_set(time(), "t1", "v123", "t2", "v3") or label_set(10, "t2", "v3", "xxx", "yy"))
|
||||
(label_set(time(), "t1", "v123", "t2", "v3"), label_set(10, "t2", "v3", "xxx", "yy"))
|
||||
+ on (foo, t2) group_left (t1, noxxx)
|
||||
(label_set(100, "t1", "v1") or label_set(time(), "t2", "v3", "noxxx", "aa"))
|
||||
(label_set(100, "t1", "v1"), label_set(time(), "t2", "v3", "noxxx", "aa"))
|
||||
)`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
@@ -2335,10 +2373,6 @@ func TestExecSuccess(t *testing.T) {
|
||||
Key: []byte("noxxx"),
|
||||
Value: []byte("aa"),
|
||||
},
|
||||
{
|
||||
Key: []byte("t1"),
|
||||
Value: []byte("v123"),
|
||||
},
|
||||
{
|
||||
Key: []byte("t2"),
|
||||
Value: []byte("v3"),
|
||||
|
||||
@@ -285,7 +285,7 @@ func getRollupConfigs(name string, rf rollupFunc, expr metricsql.Expr, start, en
|
||||
case "aggr_over_time":
|
||||
aggrFuncNames, err := getRollupAggrFuncNames(expr)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("invalid args to %s: %s", expr.AppendString(nil), err)
|
||||
return nil, nil, fmt.Errorf("invalid args to %s: %w", expr.AppendString(nil), err)
|
||||
}
|
||||
for _, aggrFuncName := range aggrFuncNames {
|
||||
if rollupFuncsRemoveCounterResets[aggrFuncName] {
|
||||
|
||||
@@ -286,7 +286,7 @@ var (
|
||||
var buf [8]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
|
||||
// do not use logger.Panicf, since it isn't initialized yet.
|
||||
panic(fmt.Errorf("FATAL: cannot read random data for rollupResultCacheKeyPrefix: %s", err))
|
||||
panic(fmt.Errorf("FATAL: cannot read random data for rollupResultCacheKeyPrefix: %w", err))
|
||||
}
|
||||
return encoding.UnmarshalUint64(buf[:])
|
||||
}()
|
||||
@@ -414,7 +414,7 @@ func (mi *rollupResultCacheMetainfo) Unmarshal(src []byte) error {
|
||||
for i := 0; i < entriesLen; i++ {
|
||||
tail, err := mi.entries[i].Unmarshal(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal entry #%d: %s", i, err)
|
||||
return fmt.Errorf("cannot unmarshal entry #%d: %w", i, err)
|
||||
}
|
||||
src = tail
|
||||
}
|
||||
|
||||
@@ -217,7 +217,7 @@ func (ts *timeseries) unmarshalFastNoTimestamps(src []byte) ([]byte, error) {
|
||||
|
||||
tail, err := unmarshalMetricNameFast(&ts.MetricName, src)
|
||||
if err != nil {
|
||||
return tail, fmt.Errorf("cannot unmarshal MetricName: %s", err)
|
||||
return tail, fmt.Errorf("cannot unmarshal MetricName: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
@@ -275,7 +275,7 @@ func unmarshalMetricNameFast(mn *storage.MetricName, src []byte) ([]byte, error)
|
||||
|
||||
tail, metricGroup, err := unmarshalBytesFast(src)
|
||||
if err != nil {
|
||||
return tail, fmt.Errorf("cannot unmarshal MetricGroup: %s", err)
|
||||
return tail, fmt.Errorf("cannot unmarshal MetricGroup: %w", err)
|
||||
}
|
||||
src = tail
|
||||
mn.MetricGroup = metricGroup[:len(metricGroup):len(metricGroup)]
|
||||
@@ -292,13 +292,13 @@ func unmarshalMetricNameFast(mn *storage.MetricName, src []byte) ([]byte, error)
|
||||
for i := range mn.Tags {
|
||||
tail, key, err := unmarshalBytesFast(src)
|
||||
if err != nil {
|
||||
return tail, fmt.Errorf("cannot unmarshal key for tag[%d]: %s", i, err)
|
||||
return tail, fmt.Errorf("cannot unmarshal key for tag[%d]: %w", i, err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
tail, value, err := unmarshalBytesFast(src)
|
||||
if err != nil {
|
||||
return tail, fmt.Errorf("cannot unmarshal value for tag[%d]: %s", i, err)
|
||||
return tail, fmt.Errorf("cannot unmarshal value for tag[%d]: %w", i, err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
|
||||
@@ -414,7 +414,7 @@ func transformHistogramShare(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
les, err := getScalar(args[0], 0)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse le: %s", err)
|
||||
return nil, fmt.Errorf("cannot parse le: %w", err)
|
||||
}
|
||||
|
||||
// Convert buckets with `vmrange` labels to buckets with `le` labels.
|
||||
@@ -425,7 +425,7 @@ func transformHistogramShare(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
if len(args) > 2 {
|
||||
s, err := getString(args[2], 2)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %s", err)
|
||||
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %w", err)
|
||||
}
|
||||
boundsLabel = s
|
||||
}
|
||||
@@ -513,7 +513,7 @@ func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
phis, err := getScalar(args[0], 0)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse phi: %s", err)
|
||||
return nil, fmt.Errorf("cannot parse phi: %w", err)
|
||||
}
|
||||
|
||||
// Convert buckets with `vmrange` labels to buckets with `le` labels.
|
||||
@@ -524,7 +524,7 @@ func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
if len(args) > 2 {
|
||||
s, err := getString(args[2], 2)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %s", err)
|
||||
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %w", err)
|
||||
}
|
||||
boundsLabel = s
|
||||
}
|
||||
@@ -1034,7 +1034,7 @@ func transformLabelMap(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
label, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read label name: %s", err)
|
||||
return nil, fmt.Errorf("cannot read label name: %w", err)
|
||||
}
|
||||
srcValues, dstValues, err := getStringPairs(args[2:])
|
||||
if err != nil {
|
||||
@@ -1179,7 +1179,7 @@ func transformLabelTransform(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
|
||||
r, err := metricsql.CompileRegexp(regex)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot compile regex %q: %s`, regex, err)
|
||||
return nil, fmt.Errorf(`cannot compile regex %q: %w`, regex, err)
|
||||
}
|
||||
return labelReplace(args[0], label, r, label, replacement)
|
||||
}
|
||||
@@ -1208,7 +1208,7 @@ func transformLabelReplace(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
|
||||
r, err := metricsql.CompileRegexpAnchored(regex)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot compile regex %q: %s`, regex, err)
|
||||
return nil, fmt.Errorf(`cannot compile regex %q: %w`, regex, err)
|
||||
}
|
||||
return labelReplace(args[0], srcLabel, r, dstLabel, replacement)
|
||||
}
|
||||
@@ -1219,6 +1219,9 @@ func labelReplace(tss []*timeseries, srcLabel string, r *regexp.Regexp, dstLabel
|
||||
mn := &ts.MetricName
|
||||
dstValue := getDstValue(mn, dstLabel)
|
||||
srcValue := mn.GetTagValue(srcLabel)
|
||||
if !r.Match(srcValue) {
|
||||
continue
|
||||
}
|
||||
b := r.ReplaceAll(srcValue, replacementBytes)
|
||||
*dstValue = append((*dstValue)[:0], b...)
|
||||
if len(b) == 0 {
|
||||
@@ -1235,7 +1238,7 @@ func transformLabelValue(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
labelName, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get label name: %s", err)
|
||||
return nil, fmt.Errorf("cannot get label name: %w", err)
|
||||
}
|
||||
rvs := args[0]
|
||||
for _, ts := range rvs {
|
||||
@@ -1262,15 +1265,15 @@ func transformLabelMatch(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
labelName, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get label name: %s", err)
|
||||
return nil, fmt.Errorf("cannot get label name: %w", err)
|
||||
}
|
||||
labelRe, err := getString(args[2], 2)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get regexp: %s", err)
|
||||
return nil, fmt.Errorf("cannot get regexp: %w", err)
|
||||
}
|
||||
r, err := metricsql.CompileRegexpAnchored(labelRe)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot compile regexp %q: %s`, labelRe, err)
|
||||
return nil, fmt.Errorf(`cannot compile regexp %q: %w`, labelRe, err)
|
||||
}
|
||||
tss := args[0]
|
||||
rvs := tss[:0]
|
||||
@@ -1290,15 +1293,15 @@ func transformLabelMismatch(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
labelName, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get label name: %s", err)
|
||||
return nil, fmt.Errorf("cannot get label name: %w", err)
|
||||
}
|
||||
labelRe, err := getString(args[2], 2)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get regexp: %s", err)
|
||||
return nil, fmt.Errorf("cannot get regexp: %w", err)
|
||||
}
|
||||
r, err := metricsql.CompileRegexpAnchored(labelRe)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot compile regexp %q: %s`, labelRe, err)
|
||||
return nil, fmt.Errorf(`cannot compile regexp %q: %w`, labelRe, err)
|
||||
}
|
||||
tss := args[0]
|
||||
rvs := tss[:0]
|
||||
@@ -1398,7 +1401,7 @@ func newTransformFuncSortByLabel(isDesc bool) transformFunc {
|
||||
}
|
||||
label, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse label name for sorting: %s", err)
|
||||
return nil, fmt.Errorf("cannot parse label name for sorting: %w", err)
|
||||
}
|
||||
rvs := args[0]
|
||||
sort.SliceStable(rvs, func(i, j int) bool {
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
@@ -28,8 +29,27 @@ var (
|
||||
|
||||
bigMergeConcurrency = flag.Int("bigMergeConcurrency", 0, "The maximum number of CPU cores to use for big merges. Default value is used if set to 0")
|
||||
smallMergeConcurrency = flag.Int("smallMergeConcurrency", 0, "The maximum number of CPU cores to use for small merges. Default value is used if set to 0")
|
||||
|
||||
denyQueriesOutsideRetention = flag.Bool("denyQueriesOutsideRetention", false, "Whether to deny queries outside of the configured -retentionPeriod. "+
|
||||
"When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. "+
|
||||
"This may be useful when multiple data sources with distinct retentions are hidden behind query-tee")
|
||||
)
|
||||
|
||||
// CheckTimeRange returns true if the given tr is denied for querying.
|
||||
func CheckTimeRange(tr storage.TimeRange) error {
|
||||
if !*denyQueriesOutsideRetention {
|
||||
return nil
|
||||
}
|
||||
minAllowedTimestamp := (int64(fasttime.UnixTimestamp()) - int64(*retentionPeriod)*3600*24*30) * 1000
|
||||
if tr.MinTimestamp > minAllowedTimestamp {
|
||||
return nil
|
||||
}
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf("the given time range %s is outside the allowed retention of %d months according to -denyQueriesOutsideRetention", &tr, *retentionPeriod),
|
||||
StatusCode: http.StatusServiceUnavailable,
|
||||
}
|
||||
}
|
||||
|
||||
// Init initializes vmstorage.
|
||||
func Init() {
|
||||
InitWithoutMetrics()
|
||||
@@ -171,7 +191,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
snapshotPath, err := Storage.CreateSnapshot()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("cannot create snapshot: %s", err)
|
||||
err = fmt.Errorf("cannot create snapshot: %w", err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
@@ -185,7 +205,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
snapshots, err := Storage.ListSnapshots()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("cannot list snapshots: %s", err)
|
||||
err = fmt.Errorf("cannot list snapshots: %w", err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
@@ -202,7 +222,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
snapshotName := r.FormValue("snapshot")
|
||||
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
|
||||
err = fmt.Errorf("cannot delete snapshot %q: %s", snapshotName, err)
|
||||
err = fmt.Errorf("cannot delete snapshot %q: %w", snapshotName, err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
@@ -212,13 +232,13 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
snapshots, err := Storage.ListSnapshots()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("cannot list snapshots: %s", err)
|
||||
err = fmt.Errorf("cannot list snapshots: %w", err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
for _, snapshotName := range snapshots {
|
||||
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
|
||||
err = fmt.Errorf("cannot delete snapshot %q: %s", snapshotName, err)
|
||||
err = fmt.Errorf("cannot delete snapshot %q: %w", snapshotName, err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -1,20 +1,11 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__inputs": [],
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "6.7.2"
|
||||
"version": "7.0.3"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
@@ -36,8 +27,8 @@
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "table",
|
||||
"name": "Table",
|
||||
"id": "table-old",
|
||||
"name": "Table (old)",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
@@ -65,7 +56,7 @@
|
||||
"gnetId": 10229,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"iteration": 1589923637424,
|
||||
"iteration": 1593345560631,
|
||||
"links": [
|
||||
{
|
||||
"icon": "doc",
|
||||
@@ -96,7 +87,7 @@
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
@@ -110,8 +101,14 @@
|
||||
},
|
||||
{
|
||||
"content": "<div style=\"text-align: center; font-size: 2em\">$version</div>",
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 8,
|
||||
@@ -134,8 +131,14 @@
|
||||
},
|
||||
{
|
||||
"columns": [],
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Run VM with `-help` flag to see all the available flags with description and default values",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fontSize": "100%",
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
@@ -216,7 +219,7 @@
|
||||
"timeShift": null,
|
||||
"title": "Flags",
|
||||
"transform": "table",
|
||||
"type": "table"
|
||||
"type": "table-old"
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
@@ -227,8 +230,14 @@
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "How many datapoints are in storage",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"format": "short",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
@@ -311,8 +320,14 @@
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "How many entries inverted index contains. This value is proportional to the number of unique timeseries in storage(cardinality).",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"format": "short",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
@@ -395,7 +410,13 @@
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"format": "s",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
@@ -444,7 +465,7 @@
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"tableColumn": "vm_app_uptime_seconds{instance=\"victoriametrics:8428\", job=\"victoriametrics\"}",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "vm_app_uptime_seconds{job=\"$job\"}",
|
||||
@@ -470,7 +491,7 @@
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
@@ -487,8 +508,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -581,8 +608,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "The less time it takes is better.\n* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -675,8 +708,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. \n\nSee following link for details:",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -772,8 +811,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with -`memory.allowedPercent` flag. Line `max allowed` shows max allowed memory size for cache.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -877,8 +922,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows how many ongoing insertions are taking place.\n* `max` - equal to number of CPU * 2\n* `current` - current number of goroutines busy with inserting rows into storage\n\nWhen `current` hits `max` constantly, it means storage is overloaded and require more CPU.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -987,8 +1038,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1078,7 +1135,7 @@
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
@@ -1092,8 +1149,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "How many datapoints are inserted into storage per second",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1192,8 +1255,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*<ingestion_rate>`, since VictoriaMetrics pushes pending data to persistent storage every second.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1298,8 +1367,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1349,7 +1424,7 @@
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(vm_rows{job=\"$job\", type != \"indexdb\"}) / sum(vm_data_size_bytes{job=\"$job\", type!=\"indexdb\"})",
|
||||
"expr": "sum(vm_data_size_bytes{job=\"$job\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", type != \"indexdb\"})",
|
||||
"format": "time_series",
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
@@ -1404,8 +1479,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1495,8 +1576,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1597,8 +1684,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows amount of on-disk space occupied by inverted index.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1687,8 +1780,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1776,8 +1875,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "The number of rows merged per second by storage nodes.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1865,8 +1970,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -1958,8 +2069,14 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -2052,7 +2169,7 @@
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
@@ -2066,15 +2183,21 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows how many of new time-series are created every second. High churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
"y": 4
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 66,
|
||||
@@ -2154,15 +2277,21 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "Slow queries rate according to `search.logSlowQueryDuration` flag, which is `5s` by default.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13
|
||||
"y": 4
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 60,
|
||||
@@ -2247,15 +2376,21 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "The percentage of slow inserts comparing to total insertion rate during the last 5 minutes. \n\nThe less value is better. If percentage remains high (>50%) during extended periods of time, then it is likely more RAM is needed for optimal handling of the current number of active time series. \n\nIn general, VictoriaMetrics requires ~1KB or RAM per active time series, so it should be easy calculating the required amounts of RAM for the current workload according to capacity planning docs. But the resulting number may be far from the real number because the required amounts of memory depends on may other factors such as the number of labels per time series and the length of label values.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
"y": 12
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 68,
|
||||
@@ -2342,7 +2477,7 @@
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
@@ -2356,15 +2491,21 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 44
|
||||
"y": 5
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 44,
|
||||
@@ -2464,14 +2605,20 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 44
|
||||
"y": 5
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 57,
|
||||
@@ -2555,14 +2702,20 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 52
|
||||
"y": 13
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 47,
|
||||
@@ -2646,14 +2799,20 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 52
|
||||
"y": 13
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 42,
|
||||
@@ -2736,14 +2895,20 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 60
|
||||
"y": 21
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 48,
|
||||
@@ -2827,15 +2992,21 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 60
|
||||
"y": 21
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 37,
|
||||
@@ -2920,15 +3091,21 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"description": "",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 68
|
||||
"y": 29
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 49,
|
||||
@@ -3014,24 +3191,42 @@
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 22,
|
||||
"schemaVersion": 25,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"definition": "label_values(vm_app_version, job)",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "VictoriaMetrics",
|
||||
"value": "VictoriaMetrics"
|
||||
},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"index": -1,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"options": [],
|
||||
"query": "label_values(vm_app_version, job)",
|
||||
"query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
@@ -3045,11 +3240,10 @@
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(vm_app_version{job=\"$job\"}, version)",
|
||||
"hide": 2,
|
||||
"includeAll": false,
|
||||
"index": -1,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "version",
|
||||
@@ -3073,7 +3267,6 @@
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
@@ -3099,8 +3292,5 @@
|
||||
"timezone": "",
|
||||
"title": "VictoriaMetrics",
|
||||
"uid": "wNf0q_kZk",
|
||||
"variables": {
|
||||
"list": []
|
||||
},
|
||||
"version": 1
|
||||
}
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
DOCKER_NAMESPACE := victoriametrics
|
||||
|
||||
ROOT_IMAGE ?= scratch
|
||||
CERTS_IMAGE := alpine:3.11
|
||||
GO_BUILDER_IMAGE := golang:1.14.3
|
||||
ROOT_IMAGE ?= alpine:3.12
|
||||
CERTS_IMAGE := alpine:3.12
|
||||
GO_BUILDER_IMAGE := golang:1.14.4
|
||||
BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr : _)
|
||||
BASE_IMAGE := local/base:1.1.1-$(shell echo $(ROOT_IMAGE) | tr : _)-$(shell echo $(CERTS_IMAGE) | tr : _)
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
#### Docker compose
|
||||
|
||||
To spin-up setup of VictoriaMetrics, Prometheus and Grafana run following command:
|
||||
To spin-up setup of VictoriaMetrics, vmagent and Grafana run following command:
|
||||
|
||||
`docker-compose up`
|
||||
|
||||
@@ -13,11 +13,11 @@ VictoriaMetrics opens following ports:
|
||||
* `--opentsdbListenAddr=:4242`
|
||||
* `--httpListenAddr=:8428`
|
||||
|
||||
##### Prometheus
|
||||
##### vmagent
|
||||
|
||||
To access service open following [link](http://localhost:9090).
|
||||
|
||||
Prometheus is already configured to use VictoriaMetrics as remote storage.
|
||||
vmagent is used for scraping and pushing timeseries to
|
||||
VictoriaMetrics instance. It accepts Prometheus-compatible
|
||||
configuration `prometheus.yml` with listed targets for scraping.
|
||||
|
||||
##### Grafana
|
||||
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
version: '3.5'
|
||||
services:
|
||||
prometheus:
|
||||
container_name: prometheus
|
||||
image: prom/prometheus:v2.17.2
|
||||
vmagent:
|
||||
container_name: vmagent
|
||||
image: victoriametrics/vmagent
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
- 9090:9090
|
||||
- 8429:8429
|
||||
volumes:
|
||||
- promdata:/prometheus
|
||||
- vmagentdata:/vmagentdata
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--promscrape.config=/etc/prometheus/prometheus.yml'
|
||||
- '--remoteWrite.url=http://victoriametrics:8428/api/v1/write'
|
||||
networks:
|
||||
- vm_net
|
||||
restart: always
|
||||
@@ -35,13 +35,7 @@ services:
|
||||
restart: always
|
||||
grafana:
|
||||
container_name: grafana
|
||||
image: grafana/grafana:6.7.2
|
||||
entrypoint: >
|
||||
/bin/sh -c "
|
||||
cd /var/lib/grafana &&
|
||||
mkdir -p dashboards &&
|
||||
sed 's/$${DS_PROMETHEUS}/Prometheus/g' vm.json > dashboards/vm.json &&
|
||||
/run.sh"
|
||||
image: grafana/grafana:7.0.3
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
@@ -49,12 +43,12 @@ services:
|
||||
volumes:
|
||||
- grafanadata:/var/lib/grafana
|
||||
- ./provisioning/:/etc/grafana/provisioning/
|
||||
- ./../../dashboards/victoriametrics.json:/var/lib/grafana/vm.json
|
||||
- ./../../dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json
|
||||
networks:
|
||||
- vm_net
|
||||
restart: always
|
||||
volumes:
|
||||
promdata: {}
|
||||
vmagentdata: {}
|
||||
vmdata: {}
|
||||
grafanadata: {}
|
||||
networks:
|
||||
|
||||
@@ -1,16 +1,10 @@
|
||||
global:
|
||||
scrape_interval: 10s
|
||||
evaluation_interval: 10s
|
||||
|
||||
remote_write:
|
||||
- url: "http://victoriametrics:8428/api/v1/write"
|
||||
queue_config:
|
||||
max_samples_per_send: 10000
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
- job_name: 'vmagent'
|
||||
static_configs:
|
||||
- targets: ['prometheus:9090']
|
||||
- targets: ['vmagent:8429']
|
||||
- job_name: 'victoriametrics'
|
||||
static_configs:
|
||||
- targets: ['victoriametrics:8428']
|
||||
|
||||
@@ -1,14 +1,8 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
|
||||
- name: VictoriaMetrics
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://victoriametrics:8428
|
||||
isDefault: false
|
||||
isDefault: true
|
||||
|
||||
@@ -37,3 +37,4 @@
|
||||
* [Benchmarking time series workloads on Apache Kudu using TSBS](https://blog.cloudera.com/benchmarking-time-series-workloads-on-apache-kudu-using-tsbs/)
|
||||
* [What are Open Source Time Series Databases?](https://www.iunera.com/kraken/fabric/time-series-database/)
|
||||
* [Evaluating performance and correctness](https://www.robustperception.io/evaluating-performance-and-correctness)
|
||||
* [Running VictoriaMetrics on Raspberry PI](https://stas.starikevich.com/posts/raspberry-pi-4-prometheus/)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
<img alt="Victoria Metrics" src="logo.png">
|
||||
|
||||
# Cluster version
|
||||
|
||||
<img alt="Victoria Metrics" src="logo.png">
|
||||
|
||||
VictoriaMetrics is fast, cost-effective and scalable time series database. It can be used as a long-term remote storage for Prometheus.
|
||||
|
||||
It is recommended using [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics) instead of cluster version
|
||||
@@ -112,11 +112,12 @@ Run `make package`. It will build the following docker images locally:
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package`.
|
||||
|
||||
By default images are built on top of `scratch` image. It is possible to build on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds images on top of `alpine:3.11` image:
|
||||
By default images are built on top of [alpine](https://hub.docker.com/_/scratch) image in order to improve debuggability.
|
||||
It is possible to build an image on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
|
||||
For example, the following command builds images on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package
|
||||
ROOT_IMAGE=scratch make package
|
||||
```
|
||||
|
||||
## Operation
|
||||
@@ -225,15 +226,6 @@ Steps to add `vmstorage` node:
|
||||
3. Gradually restart all the `vminsert` nodes with new `-storageNode` arg containing `<new_vmstorage_host>:8400`.
|
||||
|
||||
|
||||
### Cluster availability
|
||||
|
||||
* HTTP load balancer must stop routing requests to unavailable `vminsert` and `vmselect` nodes.
|
||||
* The cluster remains available if at least a single `vmstorage` node exists:
|
||||
|
||||
- `vminsert` re-routes incoming data from unavailable `vmstorage` nodes to healthy `vmstorage` nodes
|
||||
- `vmselect` continues serving partial responses if at least a single `vmstorage` node is available.
|
||||
|
||||
|
||||
### Updating / reconfiguring cluster nodes
|
||||
|
||||
All the node types - `vminsert`, `vmselect` and `vmstorage` - may be updated via graceful shutdown.
|
||||
@@ -244,6 +236,17 @@ Cluster should remain in working state if at least a single node of each type re
|
||||
the update process. See [cluster availability](#cluster-availability) section for details.
|
||||
|
||||
|
||||
### Cluster availability
|
||||
|
||||
* HTTP load balancer must stop routing requests to unavailable `vminsert` and `vmselect` nodes.
|
||||
* The cluster remains available if at least a single `vmstorage` node exists:
|
||||
|
||||
- `vminsert` re-routes incoming data from unavailable `vmstorage` nodes to healthy `vmstorage` nodes
|
||||
- `vmselect` continues serving partial responses if at least a single `vmstorage` node is available.
|
||||
|
||||
Data replication can be used for increasing storage durability. See [these docs](#replication-and-data-safety) for details.
|
||||
|
||||
|
||||
### Capacity planning
|
||||
|
||||
Each instance type - `vminsert`, `vmselect` and `vmstorage` - can run on the most suitable hardware.
|
||||
@@ -276,6 +279,18 @@ In general it is recommended increasing the number of vCPU cores and RAM per `vm
|
||||
while adding new `vmselect` nodes only when old nodes are overloaded with incoming query stream.
|
||||
|
||||
|
||||
### High availability
|
||||
|
||||
It is recommended to run all the components for a single cluster in the same subnetwork with high bandwidth, low latency and low error rates.
|
||||
This improves cluster performance and availability.
|
||||
It isn't recommended spreading components for a single cluster across multiple availability zones, since cross-AZ network usually has lower bandwidth, higher latency
|
||||
and higher error rates comparing the network inside AZ.
|
||||
|
||||
If you need multi-AZ setup, then it is recommended running independed clusters in each AZ and setting up
|
||||
[vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) in front of these clusters, so it could replicate incoming data
|
||||
into all the cluster. Then [promxy](https://github.com/jacksontj/promxy) could be used for querying the data from multiple clusters.
|
||||
|
||||
|
||||
### Helm
|
||||
|
||||
Helm chart simplifies managing cluster version of VictoriaMetrics in Kubernetes.
|
||||
@@ -286,6 +301,17 @@ Upgrade follows `Cluster resizing procedure` under the hood.
|
||||
|
||||
### Replication and data safety
|
||||
|
||||
In order to enable application-level replication, `-replicationFactor=N` command-line flag must be passed to `vminsert`.
|
||||
This guarantees that all the data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable.
|
||||
For example, when `-replicationFactor=3` is passed to `vminsert`, then it replicates all the ingested data to 3 distinct `vmstorage` nodes.
|
||||
|
||||
When the replication is enabled, `-dedup.minScrapeInterval=1ms` command-line flag must be passed to `vmselect`
|
||||
in order to de-duplicate replicated data during queries. It is OK if `-dedup.minScrapeInterval` exceeds 1ms
|
||||
when [deduplication](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#deduplication) is used additionally to replication.
|
||||
|
||||
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
|
||||
so it is recommended performing regular backups. See [these docs](#backups) for details.
|
||||
|
||||
By default VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
|
||||
It is recommended storing data on [Google Compute Engine persistent disks](https://cloud.google.com/compute/docs/disks/#pdspecs),
|
||||
since they are protected from data loss and data corruption. They also provide consistently high performance
|
||||
@@ -294,14 +320,6 @@ HDD-based persistent disks should be enough for the majority of use cases.
|
||||
|
||||
It is recommended using durable replicated persistent volumes in Kubernetes.
|
||||
|
||||
If `-replicationFactor=N` command-line flag is passed to `vminsert`, then `vminsert` puts `N` copies of the ingested data to distinct `vmstorage` nodes.
|
||||
This guarantees that all the data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable. Note that `-dedup.minScrapeInterval=1ms` command-line
|
||||
flag must be passed to `vmselect` if `-replicationFactor` exceeds 1 in order to de-duplicate replicated data during queries.
|
||||
It is OK if `-dedup.minScrapeInterval` exceeds 1ms.
|
||||
|
||||
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
|
||||
so it is recommended performing regular backups. See [these docs](#backups) for details.
|
||||
|
||||
|
||||
### Backups
|
||||
|
||||
@@ -341,8 +359,7 @@ Due to `KISS` cluster version of VictoriaMetrics has no the following "features"
|
||||
|
||||
- Fragile gossip protocols. See [failed attempt in Thanos](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md).
|
||||
- Hard-to-understand-and-implement-properly [Paxos protocols](https://www.quora.com/In-distributed-systems-what-is-a-simple-explanation-of-the-Paxos-algorithm).
|
||||
- Complex replication schemes, which may go nuts in unforesseen edge cases. The replication is offloaded to the underlying durable replicated storage
|
||||
such as [persistent disks in Google Compute Engine](https://cloud.google.com/compute/docs/disks/#pdspecs).
|
||||
- Complex replication schemes, which may go nuts in unforesseen edge cases. See [replication docs](#replication-and-data-safety) for details.
|
||||
- Automatic data reshuffling between storage nodes, which may hurt cluster performance and availability.
|
||||
- Automatic cluster resizing, which may cost you a lot of money if improperly configured.
|
||||
- Automatic discovering and addition of new nodes in the cluster, which may mix data between dev and prod clusters :)
|
||||
|
||||
131
docs/FAQ.md
131
docs/FAQ.md
@@ -2,64 +2,64 @@
|
||||
|
||||
### What is the main purpose of VictoriaMetrics?
|
||||
|
||||
To provide the best long-term [remote storage](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) solution for [Prometheus](https://prometheus.io/).
|
||||
To provide the best monitoring solution.
|
||||
|
||||
|
||||
### Who uses VictoriaMetrics?
|
||||
|
||||
See [case studies](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies).
|
||||
|
||||
|
||||
### Which features does VictoriaMetrics have?
|
||||
|
||||
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
|
||||
Additionally, VictoriaMetrics extends PromQL with opt-in [useful features](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL).
|
||||
* High performance and good scalability for both [inserts](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b)
|
||||
and [selects](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4).
|
||||
[Outperforms InfluxDB and TimescaleDB by up to 20x](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
|
||||
* [Uses 10x less RAM than InfluxDB](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) when working with millions of unique time series (aka high cardinality).
|
||||
* High data compression, so [up to 70x more data points](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4)
|
||||
may be crammed into a limited storage comparing to TimescaleDB.
|
||||
* Optimized for storage with high-latency IO and low iops (HDD and network storage in AWS, Google Cloud, Microsoft Azure, etc). See [graphs from these benchmarks](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b).
|
||||
* A single-node VictoriaMetrics may substitute moderately sized clusters built with competing solutions such as Thanos, M3DB, Cortex, InfluxDB or TimescaleDB.
|
||||
See [vertical scalability benchmarks](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae)
|
||||
and [comparing Thanos to VictoriaMetrics](https://medium.com/@valyala/comparing-thanos-to-victoriametrics-cluster-b193bea1683).
|
||||
* Easy operation:
|
||||
* VictoriaMetrics consists of a single executable without external dependencies.
|
||||
* All the configuration is done via explicit command-line flags with reasonable defaults.
|
||||
* All the data is stored in a single directory pointed by `-storageDataPath` flag.
|
||||
* Easy backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
* Storage is protected from corruption on unclean shutdown (i.e. hardware reset or `kill -9`) thanks to [the storage architecture](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
* Supports metrics' ingestion and backfilling via the following protocols:
|
||||
* [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
|
||||
* [InfluxDB line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/)
|
||||
* [Graphite plaintext protocol](https://graphite.readthedocs.io/en/latest/feeding-carbon.html) with [tags](https://graphite.readthedocs.io/en/latest/tags.html#carbon)
|
||||
if `-graphiteListenAddr` is set.
|
||||
* [OpenTSDB put message](http://opentsdb.net/docs/build/html/api_telnet/put.html) if `-opentsdbListenAddr` is set.
|
||||
* Ideally works with big amounts of time series data from IoT sensors, connected car sensors and industrial sensors.
|
||||
* Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
|
||||
### Which clients do you target?
|
||||
|
||||
The following Prometheus users may be interested in VictoriaMetrics:
|
||||
- Users who don't want to bother with Prometheus' local storage operational burden - backups, replication, capacity planning, scalability, etc.
|
||||
- Users with multiple Prometheus instances who want performing arbitrary queries over all the metrics collected by their Prometheus instances (aka `global querying view`).
|
||||
- Users who want reducing costs for storing huge amounts of time series data.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#prominent-features).
|
||||
|
||||
|
||||
### How to start using VictoriaMetrics?
|
||||
|
||||
Start with [single-node version](Single-server-VictoriaMetrics). It is easy to configure and operate. It should fit the majority of use cases.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Quick-Start).
|
||||
|
||||
|
||||
### Is it safe to enable [remote write storage](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) in Prometheus?
|
||||
### What is the difference between vmagent and Prometheus?
|
||||
|
||||
Yes. Prometheus continues writing data to local storage after enabling remote storage write, so all the existing local storage data
|
||||
While both [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Prometheus may scrape Prometheus targets (aka `/metrics` pages)
|
||||
according to the provided Prometheus-compatible [scrape configs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config)
|
||||
and send data to multiple remote storage systems, vmagent has the following additional features:
|
||||
|
||||
- vmagent usually requires lower amounts of CPU, RAM and disk IO comparing to Prometheus when scraping big number of targets (more than 1000)
|
||||
or targets with big number of exposed metrics.
|
||||
- vmagent provides independent disk-backed buffers per each configured remote storage (aka `-remoteWrite.url`). This means that slow or temporarily unavailable storage
|
||||
doesn't prevent from sending data to healthy storage in parallel. Prometheus uses a single shared buffer for all the configured remote storage systems (aka `remote_write->url`)
|
||||
with the hardcoded retention of 2 hours.
|
||||
- vmagent may accept, relabel and filter data obtained via multiple data ingestion protocols additionally to data scraped from Prometheus targets.
|
||||
I.e. it supports both `pull` and `push` protocols for data ingestion.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#features) for details.
|
||||
- vmagent may be used in different use cases:
|
||||
- [IoT and edge monitoring](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#iot-and-edge-monitoring)
|
||||
- [Drop-in replacement for Prometheus](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#drop-in-replacement-for-prometheus)
|
||||
- [Replication and High Availability](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#replication-and-high-availability)
|
||||
- [Relabeling and Filtering](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#relabeling-and-filtering)
|
||||
- [Splitting data streams among multiple systems](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#splitting-data-streams-among-multiple-systems)
|
||||
- [Prometheus remote_write proxy](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#prometheus-remote_write-proxy)
|
||||
|
||||
|
||||
### Is it safe to enable [remote write](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) in Prometheus?
|
||||
|
||||
Yes. Prometheus continues writing data to local storage after enabling remote write, so all the existing local storage data
|
||||
and new data is available for querying via Prometheus as usual.
|
||||
|
||||
It is recommended using [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) for scraping Prometheus targets
|
||||
and writing data to VictoriaMetrics.
|
||||
|
||||
|
||||
### How does VictoriaMetrics compare to other remote storage solutions for Prometheus such as [M3 from Uber](https://eng.uber.com/m3/), [Thanos](https://github.com/thanos-io/thanos), [Cortex](https://github.com/cortexproject/cortex), etc.?
|
||||
|
||||
VictoriaMetrics is simpler, faster, more cost-effective and it provides [MetricsQL with useful extensions for PromQL](MetricsQL). The simplicity is twofold:
|
||||
- It is simpler to configure and operate. There is no need in configuring third-party [sidecars](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md)
|
||||
or fighting with [gossip protocol](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md).
|
||||
- VictoriaMetrics has simpler architecture, which means less bugs and more useful features in the long run comparing to competing TSDBs.
|
||||
VictoriaMetrics is simpler, faster, more cost-effective and it provides [MetricsQL query language](MetricsQL) based on PromQL. The simplicity is twofold:
|
||||
- It is simpler to configure and operate. There is no need in configuring [sidecars](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md),
|
||||
fighting [gossip protocol](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md)
|
||||
or setting up third-party systems such as [Consul](https://github.com/cortexproject/cortex/issues/157), [Cassandra](https://cortexmetrics.io/docs/production/cassandra/),
|
||||
[DynamoDB](https://cortexmetrics.io/docs/production/aws/) or [Memcached](https://cortexmetrics.io/docs/production/caching/).
|
||||
- VictoriaMetrics has simpler architecture. This means less bugs and more useful features in the long run comparing to competing TSDBs.
|
||||
|
||||
See [comparing Thanos to VictoriaMetrics cluster](https://medium.com/@valyala/comparing-thanos-to-victoriametrics-cluster-b193bea1683)
|
||||
and [Remote Write Storage Wars](https://promcon.io/2019-munich/talks/remote-write-storage-wars/) talk from [PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
|
||||
@@ -70,55 +70,68 @@ VictoriaMetrics also [uses less RAM than Thanos components](https://github.com/t
|
||||
### What is the difference between VictoriaMetrics and [Cortex](https://github.com/cortexproject/cortex)?
|
||||
|
||||
VictoriaMetrics is similar to Cortex in the following aspects:
|
||||
- Both systems accept data from Prometheus via standard [remote_write API](https://prometheus.io/docs/practices/remote_write/),
|
||||
i.e. there is no need in running sidecars unlike in [Thanos](https://github.com/thanos-io/thanos) case.
|
||||
- Both systems support multi-tenancy out of the box. See [the corresponding docs for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format).
|
||||
- Both systems accept data from [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) or Prometheus
|
||||
via standard [remote_write API](https://prometheus.io/docs/practices/remote_write/), i.e. there is no need in running sidecars
|
||||
unlike in [Thanos](https://github.com/thanos-io/thanos) case.
|
||||
- Both systems support multi-tenancy out of the box. See [the corresponding docs for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#multitenancy).
|
||||
- Both systems support data replication. See [replication in Cortex](https://github.com/cortexproject/cortex/blob/fe56f1420099aa1bf1ce09316c186e05bddee879/docs/architecture.md#hashing) and [replication in VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety).
|
||||
- Both systems scale horizontally to multiple nodes. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#cluster-resizing-and-scalability) for details.
|
||||
- Both systems support alerting and recording rules via the corresponding tools such as [vmalert](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md).
|
||||
|
||||
|
||||
The main differences between Cortex and VictoriaMetrics:
|
||||
- Cortex re-uses Prometheus source code, while VictoriaMetrics is written from scratch.
|
||||
- Cortex provides [Ruler](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#ruler) and [Alertmanager](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#alertmanager) components,
|
||||
which are currently missing in VictoriaMetrics. However, these components can be substituted by [Promxy](https://github.com/jacksontj/promxy#how-do-i-use-alertingrecording-rules-in-promxy).
|
||||
- Cortex heavily relies on third-party services such as Consul, Memcache, DynamoDB, BigTable, Cassandra, etc.
|
||||
This may increase operational complexity and reduce system reliability comparing to VictoriaMetrics' case,
|
||||
which doesn't use any external services. Compare [Cortex Architecture](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md)
|
||||
to [VictoriaMetrics architecture](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#architecture-overview).
|
||||
- VictoriaMetrics provides [production-ready single-node solution](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md),
|
||||
which is much easier to setup and operate than Cortex cluster.
|
||||
- Cortex may lose up to 12 hours of recent data on Ingestor failure - see [the corresponding docs](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#ingesters-failure-and-data-loss).
|
||||
- Cortex may lose up to 12 hours of recent data on Ingestor failure - see [the corresponding docs](https://github.com/cortexproject/cortex/blob/fe56f1420099aa1bf1ce09316c186e05bddee879/docs/architecture.md#ingesters-failure-and-data-loss).
|
||||
VictoriaMetrics may lose only a few seconds of recent data, which isn't synced to persistent storage yet.
|
||||
See [this article for details](https://medium.com/@valyala/wal-usage-looks-broken-in-modern-time-series-databases-b62a627ab704).
|
||||
- Cortex is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
|
||||
- Cortex is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/) and [other case studies](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies).
|
||||
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to Prometheus remote_write protocol - InfluxDB, OpenTSDB, Graphite, CSV.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
|
||||
|
||||
|
||||
### What is the difference between VictoriaMetrics and [Thanos](https://github.com/thanos-io/thanos)?
|
||||
|
||||
- Thanos re-uses Prometheus source code, while VictoriaMetrics is written from scratch.
|
||||
- Thanos provides [Ruler component](https://github.com/thanos-io/thanos/blob/master/docs/components/rule.md),
|
||||
while VictoriaMetrics relies on [Promxy for alerting and recording rules](https://github.com/jacksontj/promxy#how-do-i-use-alertingrecording-rules-in-promxy).
|
||||
- VictoriaMetrics accepts data via [standard remote_write API for Prometheus](https://prometheus.io/docs/practices/remote_write/),
|
||||
while Thanos uses non-standard [Sidecar](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md), which must run alongside each Prometheus instance.
|
||||
- Thanos Sidecar requires disabling data compaction in Prometheus, which may hurt Prometheus performance and increase RAM usage.
|
||||
- Thanos stores data on object storage (Amazon S3 or Google GCS), while VictoriaMetrics stores data on block storage (GCP persistent disks, Amazon EBS or bare metal HDD).
|
||||
- Thanos Sidecar requires disabling data compaction in Prometheus, which may hurt Prometheus performance and increase RAM usage. See [these docs](https://thanos.io/components/sidecar.md/) for more details.
|
||||
- Thanos stores data in object storage (Amazon S3 or Google GCS), while VictoriaMetrics stores data in block storage
|
||||
([GCP persistent disks](https://cloud.google.com/compute/docs/disks#pdspecs), Amazon EBS or bare metal HDD).
|
||||
While object storage is usually less expensive, block storage provides much lower latencies and higher throughput.
|
||||
VictoriaMetrics works perfectly with HDD-based block storage - there is no need in using more expensive SSD or NVMe disks in most cases.
|
||||
- Thanos may lose up to 2 hours of recent data, which wasn't uploaded yet to object storage. VictoriaMetrics may lose only a few seconds of recent data,
|
||||
which isn't synced to persistent storage yet. See [this article for details](https://medium.com/@valyala/wal-usage-looks-broken-in-modern-time-series-databases-b62a627ab704).
|
||||
- VictoriaMetrics provides [production-ready single-node solution](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md),
|
||||
which is much easier to setup and operate than Thanos components.
|
||||
- Thanos may be harder to setup and operate comparing to VictoriaMetrics, since it has more moving parts, which can be connected with less reliable networks.
|
||||
See [this article for details](https://medium.com/faun/comparing-thanos-to-victoriametrics-cluster-b193bea1683).
|
||||
- Thanos is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
|
||||
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to Prometheus remote_write protocol - InfluxDB, OpenTSDB, Graphite, CSV.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
|
||||
|
||||
|
||||
### How does VictoriaMetrics compare to [InfluxDB](https://www.influxdata.com/time-series-platform/influxdb/)?
|
||||
|
||||
VictoriaMetrics requires [10x less RAM](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) and it [works faster](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
|
||||
It is easier to configure and operate. It provides [better query language](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085) than InfluxQL or Flux.
|
||||
- VictoriaMetrics requires [10x less RAM](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) and it [works faster](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
|
||||
- VictoriaMetrics provides [better query language](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085) than InfluxQL or Flux.
|
||||
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to InfluxDB - Prometheus remote_write, OpenTSDB, Graphite, CSV.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
|
||||
|
||||
|
||||
### How does VictoriaMetrics compare to [TimescaleDB](https://www.timescale.com/)?
|
||||
|
||||
TimescaleDB insists on using SQL as a query language. While SQL is more powerful than PromQL, this power is rarely required during typical TSDB usage. Real-world queries usually [look clearer and simpler when written in PromQL than in SQL](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085).
|
||||
Additionally, VictoriaMetrics requires [up to 70x less storage space comparing to TimescaleDB](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4) for storing the same amount of time series data.
|
||||
- TimescaleDB insists on using SQL as a query language. While SQL is more powerful than PromQL, this power is rarely required during typical TSDB usage. Real-world queries usually [look clearer and simpler when written in PromQL than in SQL](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085).
|
||||
- VictoriaMetrics requires [up to 70x less storage space comparing to TimescaleDB](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4) for storing the same amount of time series data. The gap in storage space usage can be lowered from 70x to 3x if [compression in TimescaleDB is properly configured](https://docs.timescale.com/latest/using-timescaledb/compression) (it isn't an easy task in general case :)).
|
||||
- VictoriaMetrics accepts data in multiple popular data ingestion protocols - InfluxDB, OpenTSDB, Graphite, CSV, while TimescaleDB supports only SQL inserts.
|
||||
|
||||
|
||||
### Does VictoriaMetrics use Prometheus technologies like other clustered TSDBs built on top of Prometheus such as [Thanos](https://github.com/thanos-io/thanos), [Cortex](https://github.com/cortexproject/cortex)?
|
||||
### Does VictoriaMetrics use Prometheus technologies like other clustered TSDBs built on top of Prometheus such as [Thanos](https://github.com/thanos-io/thanos) or [Cortex](https://github.com/cortexproject/cortex)?
|
||||
|
||||
No. VictoriaMetrics core is written in Go from scratch by [fasthttp](https://github.com/valyala/fasthttp) [author](https://github.com/valyala).
|
||||
The architecture is [optimized for storing and querying large amounts of time series data with high cardinality](https://medium.com/devopslinks/victoriametrics-creating-the-best-remote-storage-for-prometheus-5d92d66787ac). VictoriaMetrics storage uses [certain ideas from ClickHouse](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282). Special thanks to [Alexey Milovidov](https://github.com/alexey-milovidov).
|
||||
@@ -151,7 +164,7 @@ The following commercial versions of VictoriaMetrics are planned:
|
||||
* Managed cluster in the Cloud.
|
||||
* SaaS version.
|
||||
|
||||
[Contact us](mailto:info@victoriametrics.com) for more information and for the pricing.
|
||||
[Contact us](mailto:info@victoriametrics.com) for more information on our plans.
|
||||
|
||||
|
||||
### Why VictoriaMetrics doesn't support [Prometheus remote read API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#%3Cremote_read%3E)?
|
||||
|
||||
74
docs/SampleSizeCalculations.md
Normal file
74
docs/SampleSizeCalculations.md
Normal file
@@ -0,0 +1,74 @@
|
||||
# Sample size calculations
|
||||
|
||||
These calculations are for the “Lowest sample size” graph at https://victoriametrics.com/ .
|
||||
|
||||
How many metrics can be stored in 2tb disk for 2 years?
|
||||
|
||||
Seconds in 2 years:
|
||||
2 years * 365 days * 24 hours * 60 minutes * 60 seconds = 63072000 seconds
|
||||
|
||||
Resolution = 1 point per 10 second
|
||||
|
||||
That means each metric will contain 6307200 points.
|
||||
|
||||
2tb disk contains
|
||||
2 (tb) * 1024 (gb) * 1024 (mb) * 1024 (kb) * 1024 (b) = 2199023255552 bytes
|
||||
|
||||
# VictoriaMetrics
|
||||
Based on production data from our customers, sample size is 0.4 byte
|
||||
That means one metric with 10 seconds resolution will need
|
||||
6307200 points * 0.4 bytes/point = 2522880 bytes or 2.4 megabytes.
|
||||
Calculation for number of metrics can be stored in 2 tb disk:
|
||||
2199023255552 (disk size) / 2522880 (one metric for 2 year) = 871632 metrics
|
||||
So in 2tb we can store 871 632 metrics
|
||||
|
||||
# Graphite
|
||||
Based on https://m30m.github.io/whisper-calculator/ sample size of graphite metrics is 12b + 28b for each metric
|
||||
That means, one metric with 10 second resolution will need 75686428 bytes or 72.18 megabytes
|
||||
Calculation for number of metrics can be stored in 2 tb disk:
|
||||
2199023255552 / 75686428 = 29 054 metrics
|
||||
|
||||
# OpenTSDB
|
||||
Let's check official openTSDB site
|
||||
http://opentsdb.net/faq.html
|
||||
16 bytes of HBase overhead, 3 bytes for the metric, 4 bytes for the timestamp, 6 bytes per tag, 2 bytes of OpenTSDB overhead, up to 8 bytes for the value. Integers are stored with variable length encoding and can consume 1, 2, 4 or 8 bytes.
|
||||
That means, one metric with 10 second resolution will need
|
||||
6307200 * (1 + 4) + 3 + 16 + 2 = 31536021 bytes or 30 megabytes in the best scenario and
|
||||
6307200 * (8 + 4) + 3 + 16 + 2 = 75686421 bytes or 72 megabytes in the worst scenario.
|
||||
|
||||
Calculation for number of metrics can be stored in 2 tb disk:
|
||||
|
||||
2199023255552 / 31536021 = 69 730 metrics for best scenario
|
||||
2199023255552 / 75686421 = 29 054 metrics for worst scenario
|
||||
|
||||
Also, openTSDB allows to use compression
|
||||
" LZO is able to achieve a compression factor of 4.2x "
|
||||
So, let's multiply numbers on 4.2
|
||||
69 730 * 4,2 = 292 866 metrics for best scenario
|
||||
29 054 * 4,2 = 122 026 metrics for worst scenario
|
||||
# m3db
|
||||
Let's look at official m3db site https://m3db.github.io/m3/m3db/architecture/engine/
|
||||
They can achieve a sample size of 1.45 bytes/datapoint
|
||||
That means, one metric with 10 second resolution will need 9145440 bytes or 8,72177124 megabytes
|
||||
Calculation for number of metrics can be stored in 2 tb disk:
|
||||
2199023255552 / 9145440 = 240 450 metrics
|
||||
|
||||
# InfluxDB
|
||||
Based on official influxDB site https://docs.influxdata.com/influxdb/v1.8/guides/hardware_sizing/#bytes-and-compression
|
||||
"Non-string values require approximately three bytes". That means, one metric with 10 second resolution will need
|
||||
6307200 * 3 = 18921600 bytes or 18 megabytes
|
||||
Calculation for number of metrics can be stored in 2 tb disk:
|
||||
|
||||
2199023255552 / 18921600 = 116 217 metrics
|
||||
|
||||
# Prometheus
|
||||
Let's check official site: https://prometheus.io/docs/prometheus/latest/storage/
|
||||
"On average, Prometheus uses only around 1-2 bytes per sample."
|
||||
That means, one metric with 10 second resolution will need
|
||||
6307200 * 1 = 6307200 bytes in best scenario
|
||||
6307200 * 2 = 12614400 bytes in worst scenario.
|
||||
|
||||
Calculation for number of metrics can be stored in 2 tb disk:
|
||||
|
||||
2199023255552 / 6307200 = 348 652 metrics for the best case
|
||||
2199023255552 / 12614400 = 174 326 metrics for the worst cases
|
||||
@@ -21,6 +21,7 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
See our [Wiki](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki) for additional documentation.
|
||||
|
||||
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
|
||||
See [features available for enterprise customers](https://github.com/VictoriaMetrics/VictoriaMetrics/issues?q=is%3Aissue+label%3Aenterprise).
|
||||
|
||||
|
||||
## Case studies and talks
|
||||
@@ -149,9 +150,9 @@ The following command-line flags are used the most:
|
||||
|
||||
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in current working directory.
|
||||
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. Default period is 1 month.
|
||||
* `-httpListenAddr` - TCP address to listen to for http requests. By default, it listens port `8428` on all the network interfaces.
|
||||
|
||||
Other flags have good enough default values, so set them only if you really need this.
|
||||
VictoriaMetrics accepts [Prometheus querying API requests](#prometheus-querying-api-usage) on port `8428` by default.
|
||||
|
||||
Pass `-help` to see all the available flags with description and default values.
|
||||
|
||||
@@ -582,17 +583,18 @@ Run `make package-victoria-metrics`. It builds `victoriametrics/victoria-metrics
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-victoria-metrics`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image for improved debuggability.
|
||||
It is possible to build the package on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
|
||||
For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-victoria-metrics
|
||||
ROOT_IMAGE=scratch make package-victoria-metrics
|
||||
```
|
||||
|
||||
### Start with docker-compose
|
||||
|
||||
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
|
||||
helps to spin up VictoriaMetrics, Prometheus and Grafana with one command.
|
||||
helps to spin up VictoriaMetrics, [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Grafana with one command.
|
||||
More details may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#folder-contains-basic-images-and-tools-for-building-and-running-victoria-metrics-in-docker).
|
||||
|
||||
### Setting up service
|
||||
@@ -774,7 +776,13 @@ The required resources for query path:
|
||||
### High availability
|
||||
|
||||
1) Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
|
||||
2) Add addresses of these instances to `remote_write` section in Prometheus config:
|
||||
2) Pass addresses of these instances to [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) via `-remoteWrite.url` command-line flag:
|
||||
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
|
||||
```
|
||||
|
||||
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
|
||||
|
||||
```yml
|
||||
remote_write:
|
||||
@@ -834,6 +842,11 @@ Just start multiple VictoriaMetrics instances with distinct values for the follo
|
||||
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
|
||||
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
|
||||
|
||||
Then set up [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md) in front of VictoriaMetrics instances,
|
||||
so it could route requests from particular user to VictoriaMetrics with the desired retention.
|
||||
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
|
||||
|
||||
|
||||
### Downsampling
|
||||
|
||||
There is no downsampling support at the moment, but:
|
||||
@@ -981,6 +994,10 @@ The most interesting metrics are:
|
||||
VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` args on this page. By default `date` equals to the current date,
|
||||
while `topN` equals to 10.
|
||||
|
||||
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
|
||||
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
|
||||
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
|
||||
|
||||
|
||||
### Backfilling
|
||||
|
||||
@@ -1000,15 +1017,12 @@ for data with timestamps close to the current time.
|
||||
|
||||
### Replication
|
||||
|
||||
Single-node VictoriaMetrics relies on replicated durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs)
|
||||
or [Amazon EBS](https://aws.amazon.com/ebs/). It is also recommended making periodic backups,
|
||||
since [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
|
||||
See [backup docs](#backups) for details.
|
||||
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
|
||||
|
||||
Cluster version of VictoriaMetrics supports replication. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety)
|
||||
for details.
|
||||
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
|
||||
|
||||
See also [high availability docs](#high-availability) and [docs about cluster version of VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
|
||||
See also [high availability docs](#high-availability) and [backup docs](#backups).
|
||||
|
||||
|
||||
### Backups
|
||||
|
||||
@@ -170,6 +170,8 @@ Additionally it provides the following extra actions:
|
||||
|
||||
* `replace_all`: replaces all the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
|
||||
* `labelmap_all`: replaces all the occurences of `regex` in all the label names with the `replacement`.
|
||||
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
|
||||
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
|
||||
|
||||
The relabeling can be defined in the following places:
|
||||
|
||||
@@ -210,6 +212,14 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
|
||||
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
|
||||
|
||||
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports.
|
||||
Just add the following relabeling rule to `relabel_configs` section in order to filter out targets with unneeded ports:
|
||||
|
||||
```yml
|
||||
- action: keep_if_equal
|
||||
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
|
||||
```
|
||||
|
||||
|
||||
### How to build from sources
|
||||
|
||||
@@ -234,11 +244,11 @@ Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmagent
|
||||
ROOT_IMAGE=scratch make package-vmagent
|
||||
```
|
||||
|
||||
|
||||
|
||||
192
docs/vmalert.md
192
docs/vmalert.md
@@ -1,19 +1,27 @@
|
||||
## VM Alert
|
||||
## vmalert
|
||||
|
||||
`vmalert` executes a list of given MetricsQL expressions (rules) and
|
||||
sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
|
||||
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
rules against configured address.
|
||||
|
||||
### Features:
|
||||
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
|
||||
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
expressions validation;
|
||||
support and expressions validation;
|
||||
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
|
||||
support;
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
|
||||
* Lightweight without extra dependencies.
|
||||
|
||||
### TODO:
|
||||
* Support recording rules.
|
||||
### Limitations:
|
||||
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
|
||||
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
|
||||
may fail;
|
||||
* by default, rules execution is sequential within one group, but persisting of execution results to remote
|
||||
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
|
||||
recording rule is reused in next one;
|
||||
* there is no `query` function support in templates yet;
|
||||
* `vmalert` has no UI, just an API for getting groups and rules statuses.
|
||||
|
||||
### QuickStart
|
||||
|
||||
@@ -26,10 +34,12 @@ make vmalert
|
||||
The build binary will be placed to `VictoriaMetrics/bin` folder.
|
||||
|
||||
To start using `vmalert` you will need the following things:
|
||||
* list of alert rules - PromQL/MetricsQL expressions to execute;
|
||||
* list of rules - PromQL/MetricsQL expressions to execute;
|
||||
* datasource address - reachable VictoriaMetrics instance for rules execution;
|
||||
* notifier address - reachable Alertmanager instance for processing,
|
||||
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
aggregating alerts and sending notifications.
|
||||
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
|
||||
compatible storage address for storing recording rules results and alerts state in for of timeseries. This is optional.
|
||||
|
||||
Then configure `vmalert` accordingly:
|
||||
```
|
||||
@@ -38,22 +48,106 @@ Then configure `vmalert` accordingly:
|
||||
-notifier.url=http://localhost:9093
|
||||
```
|
||||
|
||||
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata/rules0-good.rules)
|
||||
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
|
||||
similar to Prometheus rules and configured using YAML. Configuration examples may be found
|
||||
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
|
||||
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
|
||||
```yaml
|
||||
groups:
|
||||
[ - <rule_group> ]
|
||||
```
|
||||
|
||||
`vmalert` runs evaluation for every group in a separate goroutine.
|
||||
Rules in group evaluated one-by-one sequentially.
|
||||
#### Groups
|
||||
|
||||
`vmalert` also runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
Each group has following attributes:
|
||||
```yaml
|
||||
# The name of the group. Must be unique within a file.
|
||||
name: <string>
|
||||
|
||||
# How often rules in the group are evaluated.
|
||||
[ interval: <duration> | default = global.evaluation_interval ]
|
||||
|
||||
# How many rules execute at once. Increasing concurrency may speed
|
||||
# up round execution speed.
|
||||
[ concurrency: <integer> | default = 1 ]
|
||||
|
||||
rules:
|
||||
[ - <rule> ... ]
|
||||
```
|
||||
|
||||
#### Rules
|
||||
|
||||
There are two types of Rules:
|
||||
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
|
||||
Alerting rules allows to define alert conditions via [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
|
||||
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
|
||||
Recording rules allow you to precompute frequently needed or computationally expensive expressions
|
||||
and save their result as a new set of time series.
|
||||
|
||||
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
|
||||
within one group.
|
||||
|
||||
##### Alerting rules
|
||||
|
||||
The syntax for alerting rule is following:
|
||||
```yaml
|
||||
# The name of the alert. Must be a valid metric name.
|
||||
alert: <string>
|
||||
|
||||
# The MetricsQL expression to evaluate.
|
||||
expr: <string>
|
||||
|
||||
# Alerts are considered firing once they have been returned for this long.
|
||||
# Alerts which have not yet fired for long enough are considered pending.
|
||||
[ for: <duration> | default = 0s ]
|
||||
|
||||
# Labels to add or overwrite for each alert.
|
||||
labels:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
|
||||
# Annotations to add to each alert.
|
||||
annotations:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
```
|
||||
|
||||
`vmalert` has no local storage and alerts state is stored in process memory. Hence, after reloading of `vmalert` process
|
||||
alerts state will be lost. To avoid this situation, `vmalert` may be configured via following flags:
|
||||
* `-remoteWrite.url` - URL to Victoria Metrics or VMInsert. `vmalert` will persist alerts state into the configured
|
||||
address in form of timeseries with name `ALERTS` via remote-write protocol.
|
||||
* `-remoteRead.url` - URL to Victoria Metrics or VMSelect. `vmalert` will try to restore alerts state from configured
|
||||
address by querying `ALERTS` timeseries.
|
||||
|
||||
|
||||
##### Recording rules
|
||||
|
||||
The syntax for recording rules is following:
|
||||
```yaml
|
||||
# The name of the time series to output to. Must be a valid metric name.
|
||||
record: <string>
|
||||
|
||||
# The MetricsQL expression to evaluate.
|
||||
expr: <string>
|
||||
|
||||
# Labels to add or overwrite before storing the result.
|
||||
labels:
|
||||
[ <labelname>: <labelvalue> ]
|
||||
```
|
||||
|
||||
For recording rules to work `-remoteWrite.url` must specified.
|
||||
|
||||
|
||||
#### WEB
|
||||
|
||||
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
|
||||
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
|
||||
Used as alert source in AlertManager.
|
||||
* `http://<vmalert-addr>/metrics` - application metrics.
|
||||
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
|
||||
|
||||
`vmalert` may be configured with `-remoteWrite` flag to write alerts state in form of timeseries
|
||||
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
|
||||
may be used to recover alerts state on `vmalert` restarts if `-remoteRead` is configured.
|
||||
|
||||
|
||||
### Configuration
|
||||
|
||||
@@ -64,22 +158,36 @@ Usage of vmalert:
|
||||
Optional basic auth password for -datasource.url
|
||||
-datasource.basicAuth.username string
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used.
|
||||
-datasource.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url.
|
||||
-datasource.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
-datasource.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url.
|
||||
-datasource.tlsServerName value
|
||||
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used.
|
||||
-datasource.url string
|
||||
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules. Default 1m (default 1m0s)
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
Address to listen for http connections (default ":8880")
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-notifier.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used.
|
||||
-notifier.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -notifier.url.
|
||||
-notifier.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -notifier.url
|
||||
-notifier.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -notifier.url.
|
||||
-notifier.tlsServerName value
|
||||
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used.
|
||||
-notifier.url string
|
||||
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
|
||||
-remoteRead.basicAuth.password string
|
||||
@@ -88,14 +196,40 @@ Usage of vmalert:
|
||||
Optional basic auth username for -remoteRead.url
|
||||
-remoteRead.lookback duration
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
-remoteRead.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used.
|
||||
-remoteRead.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url.
|
||||
-remoteRead.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteRead.url
|
||||
-remoteRead.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url.
|
||||
-remoteRead.tlsServerName value
|
||||
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used.
|
||||
-remoteRead.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remoteWrite.basicAuth.password string
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.username string
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
-remoteWrite.maxQueueSize
|
||||
Defines the max number of pending datapoints to remote write endpoint
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of readers that concurrently write into remote storage (default 1)
|
||||
-remoteWrite.flushInterval duration
|
||||
Defines interval of flushes to remote write endpoint (default 5s)
|
||||
-remoteWrite.maxBatchSize int
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
-remoteWrite.maxQueueSize int
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
-remoteWrite.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used.
|
||||
-remoteWrite.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url.
|
||||
-remoteWrite.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url.
|
||||
-remoteWrite.tlsServerName value
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used.
|
||||
-remoteWrite.url string
|
||||
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
|
||||
-rule value
|
||||
@@ -105,8 +239,10 @@ Usage of vmalert:
|
||||
-rule /path/to/file. Path to a single file with alerting rules
|
||||
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
-rule.validateExpressions
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
-rule.validateTemplates
|
||||
Indicates to validate annotation and label templates (default true)
|
||||
Whether to validate annotation and label templates (default true)
|
||||
```
|
||||
|
||||
Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
|
||||
@@ -23,7 +23,7 @@ Docker images for `vmauth` are available [here](https://hub.docker.com/r/victori
|
||||
|
||||
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
|
||||
|
||||
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, accounting, limits, etc.
|
||||
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML, accounting, limits, etc.
|
||||
|
||||
|
||||
### Auth config
|
||||
@@ -110,11 +110,11 @@ Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker i
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmauth
|
||||
ROOT_IMAGE=scratch make package-vmauth
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -89,6 +89,8 @@ or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when
|
||||
|
||||
Do not forget removing old snapshots and backups when they are no longer needed for saving storage costs.
|
||||
|
||||
See also [vmbackuper tool](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for automating smart backups.
|
||||
|
||||
|
||||
### How does it work?
|
||||
|
||||
@@ -121,6 +123,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
|
||||
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
|
||||
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
|
||||
* Backups created from [single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md) cannot be restored
|
||||
at [cluster VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md) and vice versa.
|
||||
|
||||
|
||||
### Advanced usage
|
||||
@@ -197,9 +201,9 @@ Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` dock
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmbackup
|
||||
ROOT_IMAGE=scratch make package-vmbackup
|
||||
```
|
||||
|
||||
@@ -98,9 +98,9 @@ Run `make package-vmrestore`. It builds `victoriametrics/vmrestore:<PKG_TAG>` do
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmrestore`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmrestore
|
||||
ROOT_IMAGE=scratch make package-vmrestore
|
||||
```
|
||||
|
||||
24
go.mod
24
go.mod
@@ -1,7 +1,8 @@
|
||||
module github.com/VictoriaMetrics/VictoriaMetrics
|
||||
|
||||
require (
|
||||
cloud.google.com/go/storage v1.8.0
|
||||
cloud.google.com/go v0.60.0 // indirect
|
||||
cloud.google.com/go/storage v1.10.0
|
||||
github.com/VictoriaMetrics/fastcache v1.5.7
|
||||
|
||||
// Do not use the original github.com/valyala/fasthttp because of issues
|
||||
@@ -9,25 +10,24 @@ require (
|
||||
github.com/VictoriaMetrics/fasthttp v1.0.1
|
||||
github.com/VictoriaMetrics/metrics v1.11.3
|
||||
github.com/VictoriaMetrics/metricsql v0.2.3
|
||||
github.com/aws/aws-sdk-go v1.31.5
|
||||
github.com/aws/aws-sdk-go v1.32.13
|
||||
github.com/cespare/xxhash/v2 v2.1.1
|
||||
github.com/golang/protobuf v1.4.2 // indirect
|
||||
github.com/golang/snappy v0.0.1
|
||||
github.com/klauspost/compress v1.10.6
|
||||
github.com/valyala/fastjson v1.5.1
|
||||
github.com/klauspost/compress v1.10.10
|
||||
github.com/valyala/fastjson v1.5.3
|
||||
github.com/valyala/fastrand v1.0.0
|
||||
github.com/valyala/gozstd v1.7.0
|
||||
github.com/valyala/histogram v1.0.1
|
||||
github.com/valyala/quicktemplate v1.5.0
|
||||
golang.org/x/mod v0.3.0 // indirect
|
||||
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2 // indirect
|
||||
go.opencensus.io v0.22.4 // indirect
|
||||
golang.org/x/net v0.0.0-20200625001655-4c5254603344 // indirect
|
||||
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
|
||||
golang.org/x/sys v0.0.0-20200523222454-059865788121
|
||||
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5 // indirect
|
||||
google.golang.org/api v0.25.0
|
||||
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece // indirect
|
||||
golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae
|
||||
golang.org/x/text v0.3.3 // indirect
|
||||
golang.org/x/tools v0.0.0-20200630154851-b2d8b0336632 // indirect
|
||||
google.golang.org/api v0.28.0
|
||||
google.golang.org/grpc v1.30.0 // indirect
|
||||
gopkg.in/yaml.v2 v2.3.0
|
||||
honnef.co/go/tools v0.0.1-2020.1.4 // indirect
|
||||
)
|
||||
|
||||
go 1.13
|
||||
|
||||
55
go.sum
55
go.sum
@@ -13,6 +13,8 @@ cloud.google.com/go v0.56.0 h1:WRz29PgAsVEyPSDHyk+0fpEkwEFyfhHn+JbksT6gIL4=
|
||||
cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk=
|
||||
cloud.google.com/go v0.57.0 h1:EpMNVUorLiZIELdMZbCYX/ByTFCdoYopYAGxaGVz9ms=
|
||||
cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs=
|
||||
cloud.google.com/go v0.60.0 h1:R+tDlceO7Ss+zyvtsdhTxacDyZ1k99xwskQ4FT7ruoM=
|
||||
cloud.google.com/go v0.60.0/go.mod h1:yw2G51M9IfRboUH61Us8GqCeF1PzPblB823Mn2q2eAU=
|
||||
cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o=
|
||||
cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE=
|
||||
cloud.google.com/go/bigquery v1.4.0 h1:xE3CPsOgttP4ACBePh79zTKALtXwn/Edhcr16R5hMWU=
|
||||
@@ -20,6 +22,8 @@ cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvf
|
||||
cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg=
|
||||
cloud.google.com/go/bigquery v1.7.0 h1:a/O/bK/vWrYGOTFtH8di4rBxMZnmkjy+Y5LxpDwo+dA=
|
||||
cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc=
|
||||
cloud.google.com/go/bigquery v1.8.0 h1:PQcPefKFdaIzjQFbiyOgAqyx8q5djaE7x9Sqe712DPA=
|
||||
cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ=
|
||||
cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
|
||||
cloud.google.com/go/datastore v1.1.0 h1:/May9ojXjRkPBNVrq+oWLqmWCkr4OU5uRY29bu0mRyQ=
|
||||
cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk=
|
||||
@@ -35,6 +39,8 @@ cloud.google.com/go/storage v1.6.0 h1:UDpwYIwla4jHGzZJaEJYx1tOejbgSoNqsAfHAUYe2r
|
||||
cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk=
|
||||
cloud.google.com/go/storage v1.8.0 h1:86K1Gel7BQ9/WmNWn7dTKMvTLFzwtBe5FNqYbi9X35g=
|
||||
cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs=
|
||||
cloud.google.com/go/storage v1.10.0 h1:STgFzyU5/8miMl0//zKh2aQeTyeaUH3WN9bSUiJ09bA=
|
||||
cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0=
|
||||
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
|
||||
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
|
||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
||||
@@ -51,8 +57,8 @@ github.com/VictoriaMetrics/metricsql v0.2.3 h1:xGscDmLoeIV7+8qX/mdHnOY0vu4m+wHIV
|
||||
github.com/VictoriaMetrics/metricsql v0.2.3/go.mod h1:UIjd9S0W1UnTWlJdM0wLS+2pfuPqjwqKoK8yTos+WyE=
|
||||
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8=
|
||||
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM=
|
||||
github.com/aws/aws-sdk-go v1.31.5 h1:DFA7BzTydO4etqsTja+x7UfkOKQUv1xzEluLvNk81L0=
|
||||
github.com/aws/aws-sdk-go v1.31.5/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0=
|
||||
github.com/aws/aws-sdk-go v1.32.13 h1:zzyXF7SUxJcJa3hTcYCl1/Ey+kh2N8TjK5tWnL0wieo=
|
||||
github.com/aws/aws-sdk-go v1.32.13/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0=
|
||||
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
|
||||
github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY=
|
||||
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
@@ -108,6 +114,10 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw
|
||||
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
|
||||
github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
|
||||
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.4.1 h1:/exdXoGamhu5ONeUJH0deniYLWYvQwW66yvlfiiKTu0=
|
||||
github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w=
|
||||
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPgecwXBIDzw5no=
|
||||
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
|
||||
github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
|
||||
@@ -116,6 +126,7 @@ github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hf
|
||||
github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
|
||||
github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
|
||||
github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
|
||||
github.com/google/pprof v0.0.0-20200507031123-427632fa3b1c/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
|
||||
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
|
||||
github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
|
||||
github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM=
|
||||
@@ -132,8 +143,8 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o
|
||||
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
github.com/klauspost/compress v1.10.5 h1:7q6vHIqubShURwQz8cQK6yIe/xC3IF0Vm7TGfqjewrc=
|
||||
github.com/klauspost/compress v1.10.5/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
github.com/klauspost/compress v1.10.6 h1:SP6zavvTG3YjOosWePXFDlExpKIWMTO4SE/Y8MZB2vI=
|
||||
github.com/klauspost/compress v1.10.6/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
github.com/klauspost/compress v1.10.10 h1:a/y8CglcM7gLGYmlbP/stPE5sR3hbhFRUjCBfd/0B3I=
|
||||
github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
|
||||
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
@@ -152,8 +163,8 @@ github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5
|
||||
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
||||
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
|
||||
github.com/valyala/fasthttp v1.12.0/go.mod h1:229t1eWu9UXTPmoUkbpN/fctKPBY4IJoFXQnxHGXy6E=
|
||||
github.com/valyala/fastjson v1.5.1 h1:SXaQZVSwLjZOVhDEhjiCcDtnX0Feu7Z7A1+C5atpoHM=
|
||||
github.com/valyala/fastjson v1.5.1/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
|
||||
github.com/valyala/fastjson v1.5.3 h1:z4Z1Bll4WaXo+FXJoiCdW8ss7sKY2d/jYfE2ZzoT284=
|
||||
github.com/valyala/fastjson v1.5.3/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
|
||||
github.com/valyala/fastrand v1.0.0 h1:LUKT9aKer2dVQNUi3waewTbKV+7H17kvWFNKs2ObdkI=
|
||||
github.com/valyala/fastrand v1.0.0/go.mod h1:HWqCzkrkg6QXT8V2EXWvXCoow7vLwOFN002oeRzjapQ=
|
||||
github.com/valyala/gozstd v1.7.0 h1:Ljh5c9zboqLhwTI33al32R72iCZfn0mCbVGcFWbGwRQ=
|
||||
@@ -170,10 +181,13 @@ go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
|
||||
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
|
||||
go.opencensus.io v0.22.3 h1:8sGtKOrtQqkN1bp2AtX+misvLIlOmsEsNd+9NIcPEm8=
|
||||
go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
|
||||
go.opencensus.io v0.22.4 h1:LYy1Hy3MJdrCdMwwzxA/dRok4ejH+RwNGbuoD9fCjto=
|
||||
go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
|
||||
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
|
||||
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
|
||||
@@ -229,8 +243,11 @@ golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5 h1:WQ8q63x+f/zpC8Ac1s9wLElVo
|
||||
golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f h1:QBjCr1Fz5kw158VqdE9JfI9cJnl/ymnJWAdMuinqL7Y=
|
||||
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2 h1:eDrdRpKgkcCqKZQwyZRyeFZgfqt37SL7Kv3tok06cKE=
|
||||
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200625001655-4c5254603344 h1:vGXIOMxbNfDTk/aXCmfdLgkrSV+Z2tcbze+pEc3v5W4=
|
||||
golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
|
||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
@@ -268,13 +285,18 @@ golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7w
|
||||
golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25 h1:OKbAoGs4fGM5cPLlVQLZGYkFC8OnOfgo6tt0Smf9XhM=
|
||||
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200523222454-059865788121 h1:rITEj+UZHYC927n8GT97eC3zrpzXdb/voyeOuVKS46o=
|
||||
golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae h1:Ih9Yo4hSPImZOpfGuA4bR/ORKTAbhZo2AbWNRCnevdo=
|
||||
golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
|
||||
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
|
||||
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
|
||||
@@ -313,8 +335,11 @@ golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjs
|
||||
golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8=
|
||||
golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5 h1:3KBjmg2slvQXATWW9cQJ6tsRc8hj1gsnwWyi1IzYk3o=
|
||||
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200626171337-aa94e735be7f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200630154851-b2d8b0336632 h1:fxWeLV4ol1icb47+btwfzuf8gOkJE/PPo9bLjSvZzA8=
|
||||
golang.org/x/tools v0.0.0-20200630154851-b2d8b0336632/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
|
||||
@@ -333,8 +358,8 @@ google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/
|
||||
google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
|
||||
google.golang.org/api v0.24.0 h1:cG03eaksBzhfSIk7JRGctfp3lanklcOM/mTGvow7BbQ=
|
||||
google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
|
||||
google.golang.org/api v0.25.0 h1:LodzhlzZEUfhXzNUMIfVlf9Gr6Ua5MMtoFWh7+f47qA=
|
||||
google.golang.org/api v0.25.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
|
||||
google.golang.org/api v0.28.0 h1:jMF5hhVfMkTZwHW1SDpKq5CkgWLXOb31Foaca9Zr3oM=
|
||||
google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
|
||||
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
|
||||
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
|
||||
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
|
||||
@@ -367,9 +392,11 @@ google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84 h1:pSLkPbrjnPyLDYU
|
||||
google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
|
||||
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380 h1:xriR1EgvKfkKxIoU2uUvrMVl+H26359loFFUleSMXFo=
|
||||
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
|
||||
google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U=
|
||||
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
|
||||
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece h1:1YM0uhfumvoDu9sx8+RyWwTI63zoCQvI23IYFRlvte0=
|
||||
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA=
|
||||
google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA=
|
||||
google.golang.org/genproto v0.0.0-20200626011028-ee7919e894b5 h1:a/Sqq5B3dGnmxhuJZIHFsIxhEkqElErr5TaU6IqBAj0=
|
||||
google.golang.org/genproto v0.0.0-20200626011028-ee7919e894b5/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
|
||||
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
|
||||
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
|
||||
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
|
||||
@@ -381,6 +408,8 @@ google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8
|
||||
google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60=
|
||||
google.golang.org/grpc v1.29.1 h1:EC2SB8S04d2r73uptxphDSUG+kTKVgjRPF+N3xpxRB4=
|
||||
google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk=
|
||||
google.golang.org/grpc v1.30.0 h1:M5a8xTlYTxwMn5ZFkwhRabsygDY5G8TYLyQDBxJNAxE=
|
||||
google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak=
|
||||
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
|
||||
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
|
||||
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
|
||||
@@ -394,6 +423,8 @@ google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2
|
||||
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
|
||||
google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA=
|
||||
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
|
||||
google.golang.org/protobuf v1.25.0 h1:Ejskq+SyPohKW+1uil0JJMtmHCgJPJ/qWTxr8qp+R4c=
|
||||
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
|
||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
|
||||
@@ -55,13 +55,13 @@ func (b *Backup) Run() error {
|
||||
}
|
||||
|
||||
if err := dst.DeleteFile(fscommon.BackupCompleteFilename); err != nil {
|
||||
return fmt.Errorf("cannot delete `backup complete` file at %s: %s", dst, err)
|
||||
return fmt.Errorf("cannot delete `backup complete` file at %s: %w", dst, err)
|
||||
}
|
||||
if err := runBackup(src, dst, origin, concurrency); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := dst.CreateFile(fscommon.BackupCompleteFilename, []byte("ok")); err != nil {
|
||||
return fmt.Errorf("cannot create `backup complete` file at %s: %s", dst, err)
|
||||
return fmt.Errorf("cannot create `backup complete` file at %s: %w", dst, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -74,17 +74,17 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
|
||||
logger.Infof("obtaining list of parts at %s", src)
|
||||
srcParts, err := src.ListParts()
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot list src parts: %s", err)
|
||||
return fmt.Errorf("cannot list src parts: %w", err)
|
||||
}
|
||||
logger.Infof("obtaining list of parts at %s", dst)
|
||||
dstParts, err := dst.ListParts()
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot list dst parts: %s", err)
|
||||
return fmt.Errorf("cannot list dst parts: %w", err)
|
||||
}
|
||||
logger.Infof("obtaining list of parts at %s", origin)
|
||||
originParts, err := origin.ListParts()
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot list origin parts: %s", err)
|
||||
return fmt.Errorf("cannot list origin parts: %w", err)
|
||||
}
|
||||
|
||||
backupSize := getPartsSize(srcParts)
|
||||
@@ -97,7 +97,7 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
|
||||
err = runParallel(concurrency, partsToDelete, func(p common.Part) error {
|
||||
logger.Infof("deleting %s from %s", &p, dst)
|
||||
if err := dst.DeletePart(p); err != nil {
|
||||
return fmt.Errorf("cannot delete %s from %s: %s", &p, dst, err)
|
||||
return fmt.Errorf("cannot delete %s from %s: %w", &p, dst, err)
|
||||
}
|
||||
atomic.AddUint64(&deletedParts, 1)
|
||||
return nil
|
||||
@@ -109,7 +109,7 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
|
||||
return err
|
||||
}
|
||||
if err := dst.RemoveEmptyDirs(); err != nil {
|
||||
return fmt.Errorf("cannot remove empty directories at %s: %s", dst, err)
|
||||
return fmt.Errorf("cannot remove empty directories at %s: %w", dst, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,7 +122,7 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
|
||||
err = runParallel(concurrency, originCopyParts, func(p common.Part) error {
|
||||
logger.Infof("server-side copying %s from %s to %s", &p, origin, dst)
|
||||
if err := dst.CopyPart(origin, p); err != nil {
|
||||
return fmt.Errorf("cannot copy %s from %s to %s: %s", &p, origin, dst, err)
|
||||
return fmt.Errorf("cannot copy %s from %s to %s: %w", &p, origin, dst, err)
|
||||
}
|
||||
atomic.AddUint64(&copiedParts, 1)
|
||||
return nil
|
||||
@@ -144,17 +144,17 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con
|
||||
logger.Infof("uploading %s from %s to %s", &p, src, dst)
|
||||
rc, err := src.NewReadCloser(p)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot create reader for %s from %s: %s", &p, src, err)
|
||||
return fmt.Errorf("cannot create reader for %s from %s: %w", &p, src, err)
|
||||
}
|
||||
sr := &statReader{
|
||||
r: rc,
|
||||
bytesRead: &bytesUploaded,
|
||||
}
|
||||
if err := dst.UploadPart(p, sr); err != nil {
|
||||
return fmt.Errorf("cannot upload %s to %s: %s", &p, dst, err)
|
||||
return fmt.Errorf("cannot upload %s to %s: %w", &p, dst, err)
|
||||
}
|
||||
if err = rc.Close(); err != nil {
|
||||
return fmt.Errorf("cannot close reader for %s from %s: %s", &p, src, err)
|
||||
return fmt.Errorf("cannot close reader for %s from %s: %w", &p, src, err)
|
||||
}
|
||||
return nil
|
||||
}, func(elapsed time.Duration) {
|
||||
|
||||
@@ -43,11 +43,11 @@ func (r *Restore) Run() error {
|
||||
|
||||
// Make sure VictoriaMetrics doesn't run during the restore process.
|
||||
if err := fs.MkdirAllIfNotExist(r.Dst.Dir); err != nil {
|
||||
return fmt.Errorf("cannot create dir %q: %s", r.Dst.Dir, err)
|
||||
return fmt.Errorf("cannot create dir %q: %w", r.Dst.Dir, err)
|
||||
}
|
||||
flockF, err := fs.CreateFlockFile(r.Dst.Dir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot create lock file in %q; make sure VictoriaMetrics doesn't use the dir; error: %s", r.Dst.Dir, err)
|
||||
return fmt.Errorf("cannot create lock file in %q; make sure VictoriaMetrics doesn't use the dir; error: %w", r.Dst.Dir, err)
|
||||
}
|
||||
defer fs.MustClose(flockF)
|
||||
|
||||
@@ -71,12 +71,12 @@ func (r *Restore) Run() error {
|
||||
logger.Infof("obtaining list of parts at %s", src)
|
||||
srcParts, err := src.ListParts()
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot list src parts: %s", err)
|
||||
return fmt.Errorf("cannot list src parts: %w", err)
|
||||
}
|
||||
logger.Infof("obtaining list of parts at %s", dst)
|
||||
dstParts, err := dst.ListParts()
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot list dst parts: %s", err)
|
||||
return fmt.Errorf("cannot list dst parts: %w", err)
|
||||
}
|
||||
|
||||
backupSize := getPartsSize(srcParts)
|
||||
@@ -129,7 +129,7 @@ func (r *Restore) Run() error {
|
||||
logger.Infof("deleting %s from %s", path, dst)
|
||||
size, err := dst.DeletePath(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot delete %s from %s: %s", path, dst, err)
|
||||
return fmt.Errorf("cannot delete %s from %s: %w", path, dst, err)
|
||||
}
|
||||
deleteSize += size
|
||||
}
|
||||
@@ -137,14 +137,14 @@ func (r *Restore) Run() error {
|
||||
return err
|
||||
}
|
||||
if err := dst.RemoveEmptyDirs(); err != nil {
|
||||
return fmt.Errorf("cannot remove empty directories at %s: %s", dst, err)
|
||||
return fmt.Errorf("cannot remove empty directories at %s: %w", dst, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Re-read dstParts, since additional parts may be removed on the previous step.
|
||||
dstParts, err = dst.ListParts()
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot list dst parts after the deletion: %s", err)
|
||||
return fmt.Errorf("cannot list dst parts after the deletion: %w", err)
|
||||
}
|
||||
|
||||
partsToCopy := common.PartsDifference(srcParts, dstParts)
|
||||
@@ -166,17 +166,17 @@ func (r *Restore) Run() error {
|
||||
logger.Infof("downloading %s from %s to %s", &p, src, dst)
|
||||
wc, err := dst.NewWriteCloser(p)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot create writer for %q to %s: %s", &p, dst, err)
|
||||
return fmt.Errorf("cannot create writer for %q to %s: %w", &p, dst, err)
|
||||
}
|
||||
sw := &statWriter{
|
||||
w: wc,
|
||||
bytesWritten: &bytesDownloaded,
|
||||
}
|
||||
if err := src.DownloadPart(p, sw); err != nil {
|
||||
return fmt.Errorf("cannot download %s to %s: %s", &p, dst, err)
|
||||
return fmt.Errorf("cannot download %s to %s: %w", &p, dst, err)
|
||||
}
|
||||
if err := wc.Close(); err != nil {
|
||||
return fmt.Errorf("cannot close reader from %s from %s: %s", &p, src, err)
|
||||
return fmt.Errorf("cannot close reader from %s from %s: %w", &p, src, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user