mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-08 19:33:35 +03:00
Compare commits
221 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8da3f773ae | ||
|
|
9d5f5b6878 | ||
|
|
9a2ba5b6d1 | ||
|
|
b277ba8121 | ||
|
|
84a37098ed | ||
|
|
56ccfa5218 | ||
|
|
7c2c8b2981 | ||
|
|
d5dddb0953 | ||
|
|
586c5be404 | ||
|
|
1cd01b5359 | ||
|
|
88538df267 | ||
|
|
63e5ee0d29 | ||
|
|
eba4e92994 | ||
|
|
82ecfa3b32 | ||
|
|
dc4e3f0e0b | ||
|
|
8f2e88234f | ||
|
|
423825695f | ||
|
|
5dc0bf6d3d | ||
|
|
7eb171182b | ||
|
|
05d754d7bb | ||
|
|
8dec17470d | ||
|
|
5e35b87c3d | ||
|
|
c85d926569 | ||
|
|
f0cef4761b | ||
|
|
774f7ca1c1 | ||
|
|
a560b4788e | ||
|
|
8141541e61 | ||
|
|
e65b4cb6b1 | ||
|
|
7209d58fbd | ||
|
|
72c90bfd8b | ||
|
|
2a39ba639d | ||
|
|
8f0bcec6cc | ||
|
|
a13cd60c6f | ||
|
|
c970cb912c | ||
|
|
b5206ce33f | ||
|
|
4c7f216dfe | ||
|
|
530f7a21e8 | ||
|
|
7532dbcdf5 | ||
|
|
7ec6711f06 | ||
|
|
e149019c00 | ||
|
|
7bf2cbad32 | ||
|
|
6ff821c70d | ||
|
|
a43be95e83 | ||
|
|
f689164711 | ||
|
|
7976ec8bb1 | ||
|
|
9f3e3a4d7a | ||
|
|
57acbf5491 | ||
|
|
5820c0ffb7 | ||
|
|
ac3700ed1e | ||
|
|
b542e50680 | ||
|
|
818abca8f1 | ||
|
|
50c0d8c17d | ||
|
|
88e1b7d144 | ||
|
|
08495360b0 | ||
|
|
a12364ad37 | ||
|
|
e91d758831 | ||
|
|
3d63a79b91 | ||
|
|
e53ee763f9 | ||
|
|
ae1cc0fc4b | ||
|
|
e426434770 | ||
|
|
3e277020a5 | ||
|
|
ffa75c423d | ||
|
|
0bba630f55 | ||
|
|
2382053d32 | ||
|
|
69a647b0d2 | ||
|
|
f5dd2a71a6 | ||
|
|
4b98e436ef | ||
|
|
4e8d6b80e0 | ||
|
|
d120197676 | ||
|
|
4cb3af1a36 | ||
|
|
0d92abfbf6 | ||
|
|
ff1a725a56 | ||
|
|
05ae1472e3 | ||
|
|
4fd3f6f991 | ||
|
|
6281549f31 | ||
|
|
9f4e86ac2f | ||
|
|
af49c5bdf6 | ||
|
|
a47a05dfd2 | ||
|
|
3d4008263f | ||
|
|
72ff05255f | ||
|
|
a99d606220 | ||
|
|
f8692a1d43 | ||
|
|
78b28a03b6 | ||
|
|
854f40acf2 | ||
|
|
6d059b28bf | ||
|
|
f26b94cfb6 | ||
|
|
937338abdf | ||
|
|
f2b04f2efe | ||
|
|
ff624c9125 | ||
|
|
13b1358c07 | ||
|
|
f7ff809c1e | ||
|
|
ab6e994bab | ||
|
|
d2f30e8d79 | ||
|
|
270552fde4 | ||
|
|
32652485e3 | ||
|
|
d988f89415 | ||
|
|
3d5f48ec74 | ||
|
|
0ec43cb8b0 | ||
|
|
213c3903c9 | ||
|
|
a7797dae09 | ||
|
|
d186472081 | ||
|
|
7e2669f733 | ||
|
|
ff6d093e1b | ||
|
|
8311193293 | ||
|
|
80609fdf35 | ||
|
|
d7291487be | ||
|
|
f5a4731412 | ||
|
|
947009f459 | ||
|
|
4cf7238b73 | ||
|
|
c602284a99 | ||
|
|
2f35cf13c6 | ||
|
|
b4103e055a | ||
|
|
dde29c3c18 | ||
|
|
b3fcd726e3 | ||
|
|
5b6a9675d8 | ||
|
|
aa647637bf | ||
|
|
84860167d0 | ||
|
|
0101a2d7ca | ||
|
|
d2cab369b2 | ||
|
|
8905bc2a40 | ||
|
|
f9847352b4 | ||
|
|
d1a9d8aa1c | ||
|
|
b93b01bc6d | ||
|
|
dbd8beccfa | ||
|
|
6b23df2bec | ||
|
|
619d4959c7 | ||
|
|
70ea4e28a7 | ||
|
|
74a2943030 | ||
|
|
b3ec0fb5e2 | ||
|
|
9a3afea123 | ||
|
|
7705c19720 | ||
|
|
80c18b7275 | ||
|
|
cf87b810b7 | ||
|
|
538fdfe133 | ||
|
|
f52769f6ee | ||
|
|
e6a782498a | ||
|
|
a441cdd1d9 | ||
|
|
d0f08b4a58 | ||
|
|
aae8064285 | ||
|
|
7e173655ba | ||
|
|
92212f04da | ||
|
|
de60ad0cd6 | ||
|
|
7a8ef517ae | ||
|
|
d61bac9fd9 | ||
|
|
eac3da478e | ||
|
|
0890c780c2 | ||
|
|
fd1a6ce9ae | ||
|
|
9b90c841c6 | ||
|
|
93c87d28f6 | ||
|
|
23c55181ef | ||
|
|
b19ca3eb5f | ||
|
|
4e850cd6a7 | ||
|
|
8cb35974af | ||
|
|
a0380a0a91 | ||
|
|
3a68c47de0 | ||
|
|
697b6af10f | ||
|
|
93267f143f | ||
|
|
3412d5d138 | ||
|
|
27f7cca7ff | ||
|
|
cf38c3c62f | ||
|
|
c56b66210f | ||
|
|
82ffbcb9a6 | ||
|
|
82ccdfaa91 | ||
|
|
ab8f5545bc | ||
|
|
0eacea1de1 | ||
|
|
737d641920 | ||
|
|
4fc33163c4 | ||
|
|
f9f3afb6af | ||
|
|
8b32e7c3a0 | ||
|
|
1573ececb2 | ||
|
|
a249cd9d22 | ||
|
|
ef0e37cb9e | ||
|
|
0afd48d2ee | ||
|
|
42866fa754 | ||
|
|
827a3a7866 | ||
|
|
606585f7be | ||
|
|
21598ac417 | ||
|
|
894e5d2b9b | ||
|
|
415b1ddfb5 | ||
|
|
db7dd96346 | ||
|
|
ba48438b06 | ||
|
|
7882a0dbbf | ||
|
|
4fe67504f9 | ||
|
|
96e001d254 | ||
|
|
faf92a0965 | ||
|
|
a6f16dcc11 | ||
|
|
cc311e20fe | ||
|
|
574289c3fb | ||
|
|
0a134ace63 | ||
|
|
8300cc17af | ||
|
|
6273385618 | ||
|
|
3232605524 | ||
|
|
18834a9191 | ||
|
|
d90dc1fbf9 | ||
|
|
dbd0c552d5 | ||
|
|
cc00a2c453 | ||
|
|
ce2107bc52 | ||
|
|
12a1a71cc1 | ||
|
|
de113806bb | ||
|
|
8c8ff5d0cb | ||
|
|
9e8733ff65 | ||
|
|
5bc5d6a1f2 | ||
|
|
baedb25936 | ||
|
|
6909d845b0 | ||
|
|
efc8e3c523 | ||
|
|
51291015a5 | ||
|
|
099e44005b | ||
|
|
787fcfba0c | ||
|
|
6afb25fd08 | ||
|
|
653d51694a | ||
|
|
91a49eecea | ||
|
|
c4c447507d | ||
|
|
8a00807f60 | ||
|
|
b69eb7bf38 | ||
|
|
68928bf3df | ||
|
|
e8936c9cb3 | ||
|
|
3f52a97f9b | ||
|
|
364789c24c | ||
|
|
08320cfcf4 | ||
|
|
f65930b34d | ||
|
|
266327642b |
2
.github/workflows/main.yml
vendored
2
.github/workflows/main.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
run: |
|
||||
go get -u golang.org/x/lint/golint
|
||||
go get -u github.com/kisielk/errcheck
|
||||
go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
|
||||
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.27.0
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@master
|
||||
- name: Build
|
||||
|
||||
10
Makefile
10
Makefile
@@ -141,7 +141,15 @@ install-qtc:
|
||||
|
||||
|
||||
golangci-lint: install-golangci-lint
|
||||
golangci-lint run --exclude '(SA4003|SA1019):' -D errcheck -D structcheck --timeout 2m
|
||||
golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m
|
||||
|
||||
install-golangci-lint:
|
||||
which golangci-lint || GO111MODULE=off go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
|
||||
|
||||
docs-sync:
|
||||
cp app/vmagent/README.md docs/vmagent.md
|
||||
cp app/vmalert/README.md docs/vmalert.md
|
||||
cp app/vmauth/README.md docs/vmauth.md
|
||||
cp app/vmbackup/README.md docs/vmbackup.md
|
||||
cp app/vmrestore/README.md docs/vmrestore.md
|
||||
cp README.md docs/Single-server-VictoriaMetrics.md
|
||||
|
||||
95
README.md
95
README.md
@@ -10,17 +10,26 @@
|
||||
|
||||
## VictoriaMetrics
|
||||
|
||||
VictoriaMetrics is fast, cost-effective and scalable time-series database. It can be used as long-term remote storage for Prometheus.
|
||||
VictoriaMetrics is fast, cost-effective and scalable time-series database.
|
||||
|
||||
It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and
|
||||
in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics). Just download VictoriaMetrics and see [how to start it](#how-to-start-victoriametrics).
|
||||
|
||||
Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
See our [Wiki](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki) for additional documentation.
|
||||
|
||||
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
|
||||
See [features available for enterprise customers](https://github.com/VictoriaMetrics/VictoriaMetrics/issues?q=is%3Aissue+label%3Aenterprise).
|
||||
|
||||
|
||||
## Case studies and talks
|
||||
|
||||
* [Adidas](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#adidas)
|
||||
* [CERN](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#cern)
|
||||
* [COLOPL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#colopl)
|
||||
* [Zerodha](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#zerodha)
|
||||
* [Wix.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wixcom)
|
||||
* [Wedos.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wedoscom)
|
||||
* [Synthesio](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#synthesio)
|
||||
@@ -33,6 +42,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
|
||||
## Prominent features
|
||||
|
||||
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md).
|
||||
See [these docs](#prometheus-setup) for details.
|
||||
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
|
||||
VictoriaMetrics implements [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL) query language, which is inspired by PromQL.
|
||||
* Supports global query view. Multiple Prometheus instances may write data into VictoriaMetrics. Later this data may be used in a single query.
|
||||
@@ -115,6 +126,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
* [Monitoring](#monitoring)
|
||||
* [Troubleshooting](#troubleshooting)
|
||||
* [Backfilling](#backfilling)
|
||||
* [Replication](#replication)
|
||||
* [Backups](#backups)
|
||||
* [Profiling](#profiling)
|
||||
* [Integrations](#integrations)
|
||||
* [Third-party contributions](#third-party-contributions)
|
||||
@@ -137,7 +150,9 @@ The following command-line flags are used the most:
|
||||
|
||||
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in current working directory.
|
||||
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. Default period is 1 month.
|
||||
* `-httpListenAddr` - TCP address to listen to for http requests. By default, it listens port `8428` on all the network interfaces.
|
||||
|
||||
Other flags have good enough default values, so set them only if you really need this.
|
||||
VictoriaMetrics accepts [Prometheus querying API requests](#prometheus-querying-api-usage) on port `8428` by default.
|
||||
|
||||
Pass `-help` to see all the available flags with description and default values.
|
||||
|
||||
@@ -263,6 +278,7 @@ Currently the following [scrape_config](https://prometheus.io/docs/prometheus/la
|
||||
* [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config)
|
||||
* [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config)
|
||||
* [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config)
|
||||
* [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config)
|
||||
|
||||
In the future other `*_sd_config` types will be supported.
|
||||
|
||||
@@ -567,17 +583,18 @@ Run `make package-victoria-metrics`. It builds `victoriametrics/victoria-metrics
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-victoria-metrics`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image for improved debuggability.
|
||||
It is possible to build the package on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
|
||||
For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-victoria-metrics
|
||||
ROOT_IMAGE=scratch make package-victoria-metrics
|
||||
```
|
||||
|
||||
### Start with docker-compose
|
||||
|
||||
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
|
||||
helps to spin up VictoriaMetrics, Prometheus and Grafana with one command.
|
||||
helps to spin up VictoriaMetrics, [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Grafana with one command.
|
||||
More details may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#folder-contains-basic-images-and-tools-for-building-and-running-victoria-metrics-in-docker).
|
||||
|
||||
### Setting up service
|
||||
@@ -759,7 +776,13 @@ The required resources for query path:
|
||||
### High availability
|
||||
|
||||
1) Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
|
||||
2) Add addresses of these instances to `remote_write` section in Prometheus config:
|
||||
2) Pass addresses of these instances to [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) via `-remoteWrite.url` command-line flag:
|
||||
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
|
||||
```
|
||||
|
||||
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
|
||||
|
||||
```yml
|
||||
remote_write:
|
||||
@@ -778,6 +801,8 @@ remote_write:
|
||||
kill -HUP `pidof prometheus`
|
||||
```
|
||||
|
||||
It is recommended to use [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) instead of Prometheus for highly loaded setups.
|
||||
|
||||
4) Now Prometheus should write data into all the configured `remote_write` urls in parallel.
|
||||
5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
|
||||
6) Set up Prometheus datasource in Grafana that points to Promxy.
|
||||
@@ -788,6 +813,7 @@ to write data to `victoriametrics-addr-1`, while each `r2` should write data to
|
||||
Another option is to write data simultaneously from Prometheus HA pair to a pair of VictoriaMetrics instances
|
||||
with the enabled de-duplication. See [this section](#deduplication) for details.
|
||||
|
||||
|
||||
### Deduplication
|
||||
|
||||
VictoriaMetrics de-duplicates data points if `-dedup.minScrapeInterval` command-line flag
|
||||
@@ -805,6 +831,8 @@ Data is split in per-month subdirectories inside `<-storageDataPath>/data/small`
|
||||
Directories for months outside the configured retention are deleted on the first day of new month.
|
||||
In order to keep data according to `-retentionPeriod` max disk space usage is going to be `-retentionPeriod` + 1 month.
|
||||
For example if `-retentionPeriod` is set to 1, data for January is deleted on March 1st.
|
||||
It is safe to extend `-retentionPeriod` on existing data. If `-retentionPeriod` is set to lower
|
||||
value than before then data outside the configured period will be eventually deleted.
|
||||
|
||||
### Multiple retentions
|
||||
|
||||
@@ -814,6 +842,11 @@ Just start multiple VictoriaMetrics instances with distinct values for the follo
|
||||
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
|
||||
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
|
||||
|
||||
Then set up [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md) in front of VictoriaMetrics instances,
|
||||
so it could route requests from particular user to VictoriaMetrics with the desired retention.
|
||||
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
|
||||
|
||||
|
||||
### Downsampling
|
||||
|
||||
There is no downsampling support at the moment, but:
|
||||
@@ -870,6 +903,10 @@ Consider setting the following command-line flags:
|
||||
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
|
||||
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`.
|
||||
|
||||
Prefer authorizing all the incoming requests from untrusted networks with [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md)
|
||||
or similar auth proxy.
|
||||
|
||||
|
||||
### Tuning
|
||||
|
||||
* There is no need for VictoriaMetrics tuning since it uses reasonable defaults for command-line flags,
|
||||
@@ -889,7 +926,8 @@ mkfs.ext4 ... -O 64bit,huge_file,extent -T huge
|
||||
### Monitoring
|
||||
|
||||
VictoriaMetrics exports internal metrics in Prometheus format at `/metrics` page.
|
||||
These metrics may be collected via Prometheus by adding the corresponding scrape config to it.
|
||||
These metrics may be collected by [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md)
|
||||
or Prometheus by adding the corresponding scrape config to it.
|
||||
Alternatively they can be self-scraped by setting `-selfScrapeInterval` command-line flag to duration greater than 0.
|
||||
For example, `-selfScrapeInterval=10s` would enable self-scraping of `/metrics` page with 10 seconds interval.
|
||||
|
||||
@@ -900,13 +938,17 @@ The most interesting metrics are:
|
||||
|
||||
* `vm_cache_entries{type="storage/hour_metric_ids"}` - the number of time series with new data points during the last hour
|
||||
aka active time series.
|
||||
* `rate(vm_new_timeseries_created_total[5m])` - time series churn rate.
|
||||
* `vm_rows{type="indexdb"}` - the number of rows in inverted index. High value for this number usually mean high churn rate for time series.
|
||||
* Sum of `vm_rows{type="storage/big"}` and `vm_rows{type="storage/small"}` - total number of `(timestamp, value)` data points
|
||||
in the database.
|
||||
* `vm_rows_inserted_total` - the total number of inserted rows since VictoriaMetrics start.
|
||||
* `increase(vm_new_timeseries_created_total[1h])` - time series churn rate during the previous hour.
|
||||
* `sum(vm_rows{type=~"storage/.*"})` - total number of `(timestamp, value)` data points in the database.
|
||||
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
|
||||
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
|
||||
* `sum(vm_data_size_bytes)` - the total data size on disk.
|
||||
* `sum(vm_data_size_bytes)` - the total size of data on disk.
|
||||
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
|
||||
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
|
||||
of the current number of active time series.
|
||||
* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
|
||||
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
|
||||
of the current number of active time series.
|
||||
|
||||
|
||||
### Troubleshooting
|
||||
@@ -919,8 +961,9 @@ The most interesting metrics are:
|
||||
|
||||
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
|
||||
then it is likely you have too many active time series for the current amount of RAM.
|
||||
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
|
||||
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
|
||||
ingestion performance.
|
||||
ingestion and query performance in this case.
|
||||
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
|
||||
option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
|
||||
|
||||
@@ -951,6 +994,10 @@ The most interesting metrics are:
|
||||
VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` args on this page. By default `date` equals to the current date,
|
||||
while `topN` equals to 10.
|
||||
|
||||
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
|
||||
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
|
||||
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
|
||||
|
||||
|
||||
### Backfilling
|
||||
|
||||
@@ -967,6 +1014,24 @@ the query cache, which could contain incomplete data cached during the backfilli
|
||||
Yet another solution is to increase `-search.cacheTimestampOffset` flag value in order to disable caching
|
||||
for data with timestamps close to the current time.
|
||||
|
||||
|
||||
### Replication
|
||||
|
||||
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
|
||||
|
||||
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
|
||||
|
||||
See also [high availability docs](#high-availability) and [backup docs](#backups).
|
||||
|
||||
|
||||
### Backups
|
||||
|
||||
VictoriaMetrics supports backups via [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md)
|
||||
and [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md) tools.
|
||||
We also provide provide `vmbackuper` tool for paid enterprise subscribers - see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for details.
|
||||
|
||||
|
||||
### Profiling
|
||||
|
||||
VictoriaMetrics provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
|
||||
@@ -25,6 +26,8 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
|
||||
@@ -52,8 +52,9 @@ publish-vmagent:
|
||||
APP_NAME=vmagent $(MAKE) publish-via-docker
|
||||
|
||||
run-vmagent:
|
||||
mkdir -p vmagent-data
|
||||
DOCKER_OPTS='-v $(shell pwd)/vmagent-data:/vmagent-data' \
|
||||
mkdir -p vmagent-remotewrite-data
|
||||
DOCKER_OPTS='-v $(shell pwd)/vmagent-remotewrite-data:/vmagent-remotewrite-data' \
|
||||
ARGS='-remoteWrite.url=http://localhost:8428/api/v1/write' \
|
||||
APP_NAME=vmagent \
|
||||
$(MAKE) run-via-docker
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
## vmagent
|
||||
|
||||
`vmagent` is a tiny but brave agent, which helps you collecting metrics from various sources
|
||||
and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage system that supports `remote_write` protocol.
|
||||
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
|
||||
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
|
||||
|
||||
<img alt="vmagent" src="vmagent.png">
|
||||
|
||||
@@ -11,7 +11,7 @@ or any other Prometheus-compatible storage system that supports `remote_write` p
|
||||
|
||||
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
|
||||
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
|
||||
Also, we found that users’ infrastructure is like snowflakes - never alike, and we decided to add more flexibility
|
||||
Also, we found that users’ infrastructure are snowflakes - no two are alike, and we decided to add more flexibility
|
||||
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
|
||||
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
|
||||
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
|
||||
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth comparing to Prometheus.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
|
||||
|
||||
|
||||
### Quick Start
|
||||
@@ -40,8 +40,7 @@ Just download `vmutils-*` archive from [releases page](https://github.com/Victor
|
||||
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
|
||||
|
||||
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. Multiple `-remoteWrite.url` args can be set in parallel
|
||||
in order to replicate data concurrently to multiple remote storage systems.
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary amount of remote storage systems.
|
||||
|
||||
Example command line:
|
||||
|
||||
@@ -49,7 +48,7 @@ Example command line:
|
||||
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
```
|
||||
|
||||
If you need collecting only Influx data, then the following command line would be enough:
|
||||
If you only need to collect Influx data, then the following is sufficient:
|
||||
|
||||
```
|
||||
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
@@ -57,7 +56,7 @@ If you need collecting only Influx data, then the following command line would b
|
||||
|
||||
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
|
||||
|
||||
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/).
|
||||
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags).
|
||||
|
||||
Pass `-help` to `vmagent` in order to see the full list of supported command-line flags with their descriptions.
|
||||
|
||||
@@ -79,14 +78,14 @@ See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/Victor
|
||||
#### Drop-in replacement for Prometheus
|
||||
|
||||
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such setup.
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
|
||||
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
|
||||
|
||||
|
||||
#### Replication and high availability
|
||||
|
||||
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
|
||||
If a single remote storage instance temporarily goes out of service, then the collected data remains available in another remote storage instances.
|
||||
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instances.
|
||||
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
|
||||
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
|
||||
|
||||
@@ -94,13 +93,13 @@ Then it sends the buffered data to the remote storage in order to prevent data g
|
||||
#### Relabeling and filtering
|
||||
|
||||
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
|
||||
it can remove unneeded samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
See [these docs](#relabeling) for details.
|
||||
|
||||
|
||||
#### Splitting data streams among multiple systems
|
||||
|
||||
`vmagent` supports splitting of the collected data among muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
`vmagent` supports splitting the collected data between muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
|
||||
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
|
||||
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
|
||||
@@ -133,22 +132,24 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
|
||||
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details.
|
||||
* `kubernetes_sd_configs` - for scraping targets in Kubernetes (k8s).
|
||||
See [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config) for details.
|
||||
* `ec2_sd_configs` - for scraping targets in Amazone EC2.
|
||||
* `ec2_sd_configs` - for scraping targets in Amazon EC2.
|
||||
See [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) for details.
|
||||
`vmagent` doesn't support `role_arn` config param yet.
|
||||
* `gce_sd_configs` - for scraping targets in Google Compute Engine (GCE).
|
||||
See [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) for details.
|
||||
`vmagent` provides the following additional functionality `gce_sd_config`:
|
||||
`vmagent` provides the following additional functionality for `gce_sd_config`:
|
||||
* if `project` arg is missing, then `vmagent` uses the project for the instance where it runs;
|
||||
* if `zone` arg is missing, then `vmagent` uses the zone for the instance where it runs;
|
||||
* if `zone` arg equals to `"*"`, then `vmagent` discovers all the zones for the given project;
|
||||
* `zone` may contain arbitrary number of zones, i.e. `zone: [us-east1-a, us-east1-b]`.
|
||||
* `consul_sd_configs` - for scraping targets registered in Consul.
|
||||
See [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config) for details.
|
||||
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
|
||||
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
|
||||
|
||||
The following service discovery mechanisms will be added to `vmagent` soon:
|
||||
|
||||
* [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config)
|
||||
Note that `vmagent` doesn't support `refresh_interval` option these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
|
||||
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
|
||||
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
|
||||
|
||||
|
||||
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
|
||||
@@ -169,12 +170,13 @@ Additionally it provides the following extra actions:
|
||||
|
||||
* `replace_all`: replaces all the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
|
||||
* `labelmap_all`: replaces all the occurences of `regex` in all the label names with the `replacement`.
|
||||
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
|
||||
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
|
||||
|
||||
The relabeling can be defined in the following places:
|
||||
|
||||
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to targets when parsing the file during `vmagent` startup
|
||||
or during config reload after sending `SIGHUP` signal to `vmagent` via `kill -HUP`.
|
||||
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to metrics after each scrape for the configured targets.
|
||||
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels.
|
||||
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to all the scraped metrics in the given `scrape_config`.
|
||||
* At `-remoteWrite.relabelConfig` file. This relabeling is aplied to all the collected metrics before sending them to remote storage.
|
||||
* At `-remoteWrite.urlRelabelConfig` files. This relabeling is applied to metrics before sending them to the corresponding `-remoteWrite.url`.
|
||||
|
||||
@@ -203,12 +205,20 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
|
||||
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
|
||||
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`.
|
||||
|
||||
* It is recommended increasing `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
|
||||
and `vmagent_remotewrite_pending_data_bytes` metric exported by `vmagent` at `/metrics` page constantly grows.
|
||||
* It is recommended to increase `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
|
||||
and `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
|
||||
|
||||
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
|
||||
The directory can grow big when remote storage is unavailable during extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
|
||||
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
|
||||
|
||||
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports.
|
||||
Just add the following relabeling rule to `relabel_configs` section in order to filter out targets with unneeded ports:
|
||||
|
||||
```yml
|
||||
- action: keep_if_equal
|
||||
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
|
||||
```
|
||||
|
||||
|
||||
### How to build from sources
|
||||
@@ -234,9 +244,30 @@ Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmagent
|
||||
ROOT_IMAGE=scratch make package-vmagent
|
||||
```
|
||||
|
||||
|
||||
### Profiling
|
||||
|
||||
`vmagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
* Memory profile. It can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmagent-host>:8429/debug/pprof/heap > mem.pprof
|
||||
```
|
||||
|
||||
* CPU profile. It can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmagent-host>:8429/debug/pprof/profile > cpu.pprof
|
||||
```
|
||||
|
||||
The command for collecting CPU profile waits for 30 seconds before returning.
|
||||
|
||||
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -39,6 +40,8 @@ var (
|
||||
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
|
||||
"Usually :4242 must be set. Doesn't work if empty")
|
||||
opentsdbHTTPListenAddr = flag.String("opentsdbHTTPListenAddr", "", "TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty")
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmagent. The following files are checked: "+
|
||||
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . See also -promscrape.config.dryRun")
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -49,9 +52,27 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
|
||||
if *dryRun {
|
||||
if err := flag.Set("promscrape.config.strictParse", "true"); err != nil {
|
||||
logger.Panicf("BUG: cannot set promscrape.config.strictParse=true: %s", err)
|
||||
}
|
||||
if err := remotewrite.CheckRelabelConfigs(); err != nil {
|
||||
logger.Fatalf("error when checking relabel configs: %s", err)
|
||||
}
|
||||
if err := promscrape.CheckConfig(); err != nil {
|
||||
logger.Fatalf("error when checking Prometheus config: %s", err)
|
||||
}
|
||||
logger.Infof("all the configs are ok; exitting with 0 status code")
|
||||
return
|
||||
}
|
||||
|
||||
logger.Infof("starting vmagent at %q...", *httpListenAddr)
|
||||
startTime := time.Now()
|
||||
remotewrite.Init()
|
||||
@@ -160,7 +181,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
case "/-/reload":
|
||||
promscrapeConfigReloadRequests.Inc()
|
||||
procutil.SelfSIGHUP()
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
@@ -185,3 +206,15 @@ var (
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
|
||||
)
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
@@ -2,18 +2,17 @@ package remotewrite
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"encoding/base64"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/fasthttp"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
@@ -22,14 +21,21 @@ var (
|
||||
sendTimeout = flag.Duration("remoteWrite.sendTimeout", time.Minute, "Timeout for sending a single block of data to -remoteWrite.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
|
||||
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
|
||||
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
|
||||
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
|
||||
"By default system CA is used")
|
||||
tlsCertFile = flagutil.NewArray("remoteWrite.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
tlsKeyFile = flagutil.NewArray("remoteWrite.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
tlsCAFile = flagutil.NewArray("remoteWrite.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
|
||||
"By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
tlsServerName = flagutil.NewArray("remoteWrite.tlsServerName", "Optional TLS server name to use for connections to -remoteWrite.url. "+
|
||||
"By default the server name from -remoteWrite.url is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
|
||||
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username to use for -remoteWrite.url")
|
||||
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password to use for -remoteWrite.url")
|
||||
bearerToken = flag.String("remoteWrite.bearerToken", "", "Optional bearer auth token to use for -remoteWrite.url")
|
||||
basicAuthUsername = flagutil.NewArray("remoteWrite.basicAuth.username", "Optional basic auth username to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
basicAuthPassword = flagutil.NewArray("remoteWrite.basicAuth.password", "Optional basic auth password to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
bearerToken = flagutil.NewArray("remoteWrite.bearerToken", "Optional bearer auth token to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
)
|
||||
|
||||
type client struct {
|
||||
@@ -50,19 +56,22 @@ type client struct {
|
||||
stopCh chan struct{}
|
||||
}
|
||||
|
||||
func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQueue, concurrency int) *client {
|
||||
func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQueue, concurrency int) *client {
|
||||
authHeader := ""
|
||||
if len(*basicAuthUsername) > 0 || len(*basicAuthPassword) > 0 {
|
||||
username := basicAuthUsername.GetOptionalArg(argIdx)
|
||||
password := basicAuthPassword.GetOptionalArg(argIdx)
|
||||
if len(username) > 0 || len(password) > 0 {
|
||||
// See https://en.wikipedia.org/wiki/Basic_access_authentication
|
||||
token := *basicAuthUsername + ":" + *basicAuthPassword
|
||||
token := username + ":" + password
|
||||
token64 := base64.StdEncoding.EncodeToString([]byte(token))
|
||||
authHeader = "Basic " + token64
|
||||
}
|
||||
if len(*bearerToken) > 0 {
|
||||
token := bearerToken.GetOptionalArg(argIdx)
|
||||
if len(token) > 0 {
|
||||
if authHeader != "" {
|
||||
logger.Panicf("FATAL: `-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", *bearerToken)
|
||||
logger.Fatalf("`-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", token)
|
||||
}
|
||||
authHeader = "Bearer " + *bearerToken
|
||||
authHeader = "Bearer " + token
|
||||
}
|
||||
|
||||
readTimeout := *sendTimeout
|
||||
@@ -76,18 +85,18 @@ func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQue
|
||||
switch scheme {
|
||||
case "http", "https":
|
||||
default:
|
||||
logger.Panicf("FATAL: unsupported scheme in -remoteWrite.url=%q: %q. It must be http or https", remoteWriteURL, scheme)
|
||||
logger.Fatalf("unsupported scheme in -remoteWrite.url=%q: %q. It must be http or https", remoteWriteURL, scheme)
|
||||
}
|
||||
host := string(u.Host())
|
||||
if len(host) == 0 {
|
||||
logger.Panicf("FATAL: invalid -remoteWrite.url=%q: host cannot be empty. Make sure the url looks like `http://host:port/path`", remoteWriteURL)
|
||||
logger.Fatalf("invalid -remoteWrite.url=%q: host cannot be empty. Make sure the url looks like `http://host:port/path`", remoteWriteURL)
|
||||
}
|
||||
requestURI := string(u.RequestURI())
|
||||
isTLS := scheme == "https"
|
||||
var tlsCfg *tls.Config
|
||||
if isTLS {
|
||||
var err error
|
||||
tlsCfg, err = getTLSConfig()
|
||||
tlsCfg, err = getTLSConfig(argIdx)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
|
||||
}
|
||||
@@ -104,7 +113,6 @@ func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQue
|
||||
Addr: host,
|
||||
Name: "vmagent",
|
||||
Dial: statDial,
|
||||
DialDualStack: netutil.TCP6Enabled(),
|
||||
IsTLS: isTLS,
|
||||
TLSConfig: tlsCfg,
|
||||
MaxConns: maxConns,
|
||||
@@ -144,34 +152,19 @@ func (c *client) MustStop() {
|
||||
logger.Infof("stopped client for -remoteWrite.url=%q", c.remoteWriteURL)
|
||||
}
|
||||
|
||||
func getTLSConfig() (*tls.Config, error) {
|
||||
var tlsRootCA *x509.CertPool
|
||||
var tlsCertificate *tls.Certificate
|
||||
if *tlsCertFile != "" || *tlsKeyFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(*tlsCertFile, *tlsKeyFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load TLS certificate for -remoteWrite.tlsCertFile=%q and -remoteWrite.tlsKeyFile=%q: %s", *tlsCertFile, *tlsKeyFile, err)
|
||||
}
|
||||
tlsCertificate = &cert
|
||||
func getTLSConfig(argIdx int) (*tls.Config, error) {
|
||||
tlsConfig := &promauth.TLSConfig{
|
||||
CAFile: tlsCAFile.GetOptionalArg(argIdx),
|
||||
CertFile: tlsCertFile.GetOptionalArg(argIdx),
|
||||
KeyFile: tlsKeyFile.GetOptionalArg(argIdx),
|
||||
ServerName: tlsServerName.GetOptionalArg(argIdx),
|
||||
InsecureSkipVerify: *tlsInsecureSkipVerify,
|
||||
}
|
||||
if *tlsCAFile != "" {
|
||||
data, err := ioutil.ReadFile(*tlsCAFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read -remoteWrite.tlsCAFile=%q: %s", *tlsCAFile, err)
|
||||
}
|
||||
tlsRootCA = x509.NewCertPool()
|
||||
if !tlsRootCA.AppendCertsFromPEM(data) {
|
||||
return nil, fmt.Errorf("cannot parse data -remoteWrite.tlsCAFile=%q", *tlsCAFile)
|
||||
}
|
||||
cfg, err := promauth.NewConfig(".", nil, "", "", tlsConfig)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot populate TLS config: %w", err)
|
||||
}
|
||||
tlsCfg := &tls.Config{
|
||||
RootCAs: tlsRootCA,
|
||||
ClientSessionCache: tls.NewLRUClientSessionCache(0),
|
||||
}
|
||||
if tlsCertificate != nil {
|
||||
tlsCfg.Certificates = []tls.Certificate{*tlsCertificate}
|
||||
}
|
||||
tlsCfg.InsecureSkipVerify = *tlsInsecureSkipVerify
|
||||
tlsCfg := cfg.NewTLSConfig()
|
||||
return tlsCfg, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -15,7 +15,8 @@ import (
|
||||
|
||||
var (
|
||||
flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
|
||||
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage")
|
||||
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage. "+
|
||||
"Minimum supported interval is 1 second")
|
||||
maxUnpackedBlockSize = flag.Int("remoteWrite.maxBlockSize", 32*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
|
||||
"It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics")
|
||||
)
|
||||
@@ -55,6 +56,10 @@ func (ps *pendingSeries) Push(tss []prompbmarshal.TimeSeries) {
|
||||
}
|
||||
|
||||
func (ps *pendingSeries) periodicFlusher() {
|
||||
flushSeconds := int64(flushInterval.Seconds())
|
||||
if flushSeconds <= 0 {
|
||||
flushSeconds = 1
|
||||
}
|
||||
ticker := time.NewTicker(*flushInterval)
|
||||
defer ticker.Stop()
|
||||
mustStop := false
|
||||
@@ -63,7 +68,7 @@ func (ps *pendingSeries) periodicFlusher() {
|
||||
case <-ps.stopCh:
|
||||
mustStop = true
|
||||
case <-ticker.C:
|
||||
if time.Since(ps.wr.lastFlushTime) < *flushInterval/2 {
|
||||
if fasttime.UnixTimestamp()-ps.wr.lastFlushTime < uint64(flushSeconds) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
@@ -76,7 +81,7 @@ func (ps *pendingSeries) periodicFlusher() {
|
||||
type writeRequest struct {
|
||||
wr prompbmarshal.WriteRequest
|
||||
pushBlock func(block []byte)
|
||||
lastFlushTime time.Time
|
||||
lastFlushTime uint64
|
||||
|
||||
tss []prompbmarshal.TimeSeries
|
||||
|
||||
@@ -108,7 +113,7 @@ func (wr *writeRequest) reset() {
|
||||
|
||||
func (wr *writeRequest) flush() {
|
||||
wr.wr.Timeseries = wr.tss
|
||||
wr.lastFlushTime = time.Now()
|
||||
wr.lastFlushTime = fasttime.UnixTimestamp()
|
||||
pushWriteRequest(&wr.wr, wr.pushBlock)
|
||||
wr.reset()
|
||||
}
|
||||
@@ -144,13 +149,8 @@ func (wr *writeRequest) copyTimeSeries(dst, src *prompbmarshal.TimeSeries) {
|
||||
}
|
||||
dst.Labels = labelsDst[labelsLen:]
|
||||
|
||||
samplesDst = append(samplesDst, prompbmarshal.Sample{})
|
||||
dstSample := &samplesDst[len(samplesDst)-1]
|
||||
if len(src.Samples) != 1 {
|
||||
logger.Panicf("BUG: unexpected number of samples in time series; got %d; want 1", len(src.Samples))
|
||||
}
|
||||
*dstSample = src.Samples[0]
|
||||
dst.Samples = samplesDst[len(samplesDst)-1:]
|
||||
samplesDst = append(samplesDst, src.Samples...)
|
||||
dst.Samples = samplesDst[len(samplesDst)-len(src.Samples):]
|
||||
|
||||
wr.samples = samplesDst
|
||||
wr.labels = labelsDst
|
||||
|
||||
@@ -2,6 +2,7 @@ package remotewrite
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
@@ -16,35 +17,60 @@ var (
|
||||
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
|
||||
relabelConfigPathGlobal = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
|
||||
"before sending them to -remoteWrite.url. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config for details")
|
||||
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
|
||||
)
|
||||
|
||||
var labelsGlobal []prompbmarshal.Label
|
||||
var prcsGlobal []promrelabel.ParsedRelabelConfig
|
||||
|
||||
// initRelabelGlobal must be called after parsing command-line flags.
|
||||
func initRelabelGlobal() {
|
||||
// CheckRelabelConfigs checks -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig.
|
||||
func CheckRelabelConfigs() error {
|
||||
_, err := loadRelabelConfigs()
|
||||
return err
|
||||
}
|
||||
|
||||
func loadRelabelConfigs() (*relabelConfigs, error) {
|
||||
var rcs relabelConfigs
|
||||
if *relabelConfigPathGlobal != "" {
|
||||
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
rcs.global = global
|
||||
}
|
||||
if len(*relabelConfigPaths) > len(*remoteWriteURLs) {
|
||||
return nil, fmt.Errorf("too many -remoteWrite.urlRelabelConfig args: %d; it mustn't exceed the number of -remoteWrite.url args: %d",
|
||||
len(*relabelConfigPaths), len(*remoteWriteURLs))
|
||||
}
|
||||
rcs.perURL = make([][]promrelabel.ParsedRelabelConfig, len(*remoteWriteURLs))
|
||||
for i, path := range *relabelConfigPaths {
|
||||
prc, err := promrelabel.LoadRelabelConfigs(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
|
||||
}
|
||||
rcs.perURL[i] = prc
|
||||
}
|
||||
return &rcs, nil
|
||||
}
|
||||
|
||||
type relabelConfigs struct {
|
||||
global []promrelabel.ParsedRelabelConfig
|
||||
perURL [][]promrelabel.ParsedRelabelConfig
|
||||
}
|
||||
|
||||
// initLabelsGlobal must be called after parsing command-line flags.
|
||||
func initLabelsGlobal() {
|
||||
// Init labelsGlobal
|
||||
labelsGlobal = nil
|
||||
for _, s := range *unparsedLabelsGlobal {
|
||||
n := strings.IndexByte(s, '=')
|
||||
if n < 0 {
|
||||
logger.Panicf("FATAL: missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
|
||||
logger.Fatalf("missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
|
||||
}
|
||||
labelsGlobal = append(labelsGlobal, prompbmarshal.Label{
|
||||
Name: s[:n],
|
||||
Value: s[n+1:],
|
||||
})
|
||||
}
|
||||
|
||||
// Init prcsGlobal
|
||||
prcsGlobal = nil
|
||||
if len(*relabelConfigPathGlobal) > 0 {
|
||||
var err error
|
||||
prcsGlobal, err = promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label, prcs []promrelabel.ParsedRelabelConfig) []prompbmarshal.TimeSeries {
|
||||
|
||||
@@ -3,6 +3,7 @@ package remotewrite
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
@@ -10,8 +11,8 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
xxhash "github.com/cespare/xxhash/v2"
|
||||
)
|
||||
@@ -20,9 +21,8 @@ var (
|
||||
remoteWriteURLs = flagutil.NewArray("remoteWrite.url", "Remote storage URL to write data to. It must support Prometheus remote_write API. "+
|
||||
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
|
||||
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
|
||||
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
|
||||
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
|
||||
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
|
||||
"isn't enough for sending high volume of collected data to remote storage")
|
||||
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
|
||||
"It is hidden by default, since it can contain sensistive auth info")
|
||||
@@ -34,6 +34,9 @@ var (
|
||||
|
||||
var rwctxs []*remoteWriteCtx
|
||||
|
||||
// Contains the current relabelConfigs.
|
||||
var allRelabelConfigs atomic.Value
|
||||
|
||||
// Init initializes remotewrite.
|
||||
//
|
||||
// It must be called after flag.Parse().
|
||||
@@ -41,14 +44,19 @@ var rwctxs []*remoteWriteCtx
|
||||
// Stop must be called for graceful shutdown.
|
||||
func Init() {
|
||||
if len(*remoteWriteURLs) == 0 {
|
||||
logger.Panicf("FATAL: at least one `-remoteWrite.url` must be set")
|
||||
logger.Fatalf("at least one `-remoteWrite.url` must be set")
|
||||
}
|
||||
|
||||
if !*showRemoteWriteURL {
|
||||
// remoteWrite.url can contain authentication codes, so hide it at `/metrics` output.
|
||||
httpserver.RegisterSecretFlag("remoteWrite.url")
|
||||
}
|
||||
initRelabelGlobal()
|
||||
initLabelsGlobal()
|
||||
rcs, err := loadRelabelConfigs()
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot load relabel configs: %s", err)
|
||||
}
|
||||
allRelabelConfigs.Store(rcs)
|
||||
|
||||
maxInmemoryBlocks := memory.Allowed() / len(*remoteWriteURLs) / maxRowsPerBlock / 100
|
||||
if maxInmemoryBlocks > 200 {
|
||||
@@ -61,23 +69,47 @@ func Init() {
|
||||
maxInmemoryBlocks = 2
|
||||
}
|
||||
for i, remoteWriteURL := range *remoteWriteURLs {
|
||||
relabelConfigPath := ""
|
||||
if i < len(*relabelConfigPaths) {
|
||||
relabelConfigPath = (*relabelConfigPaths)[i]
|
||||
}
|
||||
urlLabelValue := fmt.Sprintf("secret-url-%d", i+1)
|
||||
if *showRemoteWriteURL {
|
||||
urlLabelValue = remoteWriteURL
|
||||
}
|
||||
rwctx := newRemoteWriteCtx(remoteWriteURL, relabelConfigPath, maxInmemoryBlocks, urlLabelValue)
|
||||
rwctx := newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, urlLabelValue)
|
||||
rwctxs = append(rwctxs, rwctx)
|
||||
}
|
||||
|
||||
// Start config reloader.
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
configReloaderWG.Add(1)
|
||||
go func() {
|
||||
defer configReloaderWG.Done()
|
||||
for {
|
||||
select {
|
||||
case <-sighupCh:
|
||||
case <-stopCh:
|
||||
return
|
||||
}
|
||||
logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
|
||||
rcs, err := loadRelabelConfigs()
|
||||
if err != nil {
|
||||
logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err)
|
||||
continue
|
||||
}
|
||||
allRelabelConfigs.Store(rcs)
|
||||
logger.Infof("Successfully reloaded relabel configs")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var stopCh = make(chan struct{})
|
||||
var configReloaderWG sync.WaitGroup
|
||||
|
||||
// Stop stops remotewrite.
|
||||
//
|
||||
// It is expected that nobody calls Push during and after the call to this func.
|
||||
func Stop() {
|
||||
close(stopCh)
|
||||
configReloaderWG.Wait()
|
||||
|
||||
for _, rwctx := range rwctxs {
|
||||
rwctx.MustStop()
|
||||
}
|
||||
@@ -86,9 +118,11 @@ func Stop() {
|
||||
|
||||
// Push sends wr to remote storage systems set via `-remoteWrite.url`.
|
||||
//
|
||||
// Each timeseries in wr.Timeseries must contain one sample.
|
||||
// Note that wr may be modified by Push due to relabeling.
|
||||
func Push(wr *prompbmarshal.WriteRequest) {
|
||||
var rctx *relabelCtx
|
||||
rcs := allRelabelConfigs.Load().(*relabelConfigs)
|
||||
prcsGlobal := rcs.global
|
||||
if len(prcsGlobal) > 0 || len(labelsGlobal) > 0 {
|
||||
rctx = getRelabelCtx()
|
||||
}
|
||||
@@ -122,16 +156,18 @@ func Push(wr *prompbmarshal.WriteRequest) {
|
||||
var globalRelabelMetricsDropped = metrics.NewCounter("vmagent_remotewrite_global_relabel_metrics_dropped_total")
|
||||
|
||||
type remoteWriteCtx struct {
|
||||
idx int
|
||||
fq *persistentqueue.FastQueue
|
||||
c *client
|
||||
prcs []promrelabel.ParsedRelabelConfig
|
||||
pss []*pendingSeries
|
||||
pssNextIdx uint64
|
||||
|
||||
tss []prompbmarshal.TimeSeries
|
||||
|
||||
relabelMetricsDropped *metrics.Counter
|
||||
}
|
||||
|
||||
func newRemoteWriteCtx(remoteWriteURL, relabelConfigPath string, maxInmemoryBlocks int, urlLabelValue string) *remoteWriteCtx {
|
||||
func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int, urlLabelValue string) *remoteWriteCtx {
|
||||
h := xxhash.Sum64([]byte(remoteWriteURL))
|
||||
path := fmt.Sprintf("%s/persistent-queue/%016X", *tmpDataPath, h)
|
||||
fq := persistentqueue.MustOpenFastQueue(path, remoteWriteURL, maxInmemoryBlocks, *maxPendingBytesPerURL)
|
||||
@@ -141,24 +177,16 @@ func newRemoteWriteCtx(remoteWriteURL, relabelConfigPath string, maxInmemoryBloc
|
||||
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_inmemory_blocks{path=%q, url=%q}`, path, urlLabelValue), func() float64 {
|
||||
return float64(fq.GetInmemoryQueueLen())
|
||||
})
|
||||
c := newClient(remoteWriteURL, urlLabelValue, fq, *queues)
|
||||
var prcs []promrelabel.ParsedRelabelConfig
|
||||
if len(relabelConfigPath) > 0 {
|
||||
var err error
|
||||
prcs, err = promrelabel.LoadRelabelConfigs(relabelConfigPath)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", relabelConfigPath, err)
|
||||
}
|
||||
}
|
||||
c := newClient(argIdx, remoteWriteURL, urlLabelValue, fq, *queues)
|
||||
pss := make([]*pendingSeries, *queues)
|
||||
for i := range pss {
|
||||
pss[i] = newPendingSeries(fq.MustWriteBlock)
|
||||
}
|
||||
return &remoteWriteCtx{
|
||||
fq: fq,
|
||||
c: c,
|
||||
prcs: prcs,
|
||||
pss: pss,
|
||||
idx: argIdx,
|
||||
fq: fq,
|
||||
c: c,
|
||||
pss: pss,
|
||||
|
||||
relabelMetricsDropped: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q, url=%q}`, path, urlLabelValue)),
|
||||
}
|
||||
@@ -168,10 +196,10 @@ func (rwctx *remoteWriteCtx) MustStop() {
|
||||
for _, ps := range rwctx.pss {
|
||||
ps.MustStop()
|
||||
}
|
||||
rwctx.idx = 0
|
||||
rwctx.pss = nil
|
||||
rwctx.fq.MustClose()
|
||||
rwctx.fq = nil
|
||||
rwctx.prcs = nil
|
||||
rwctx.c.MustStop()
|
||||
rwctx.c = nil
|
||||
|
||||
@@ -180,10 +208,17 @@ func (rwctx *remoteWriteCtx) MustStop() {
|
||||
|
||||
func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
|
||||
var rctx *relabelCtx
|
||||
if len(rwctx.prcs) > 0 {
|
||||
rcs := allRelabelConfigs.Load().(*relabelConfigs)
|
||||
prcs := rcs.perURL[rwctx.idx]
|
||||
if len(prcs) > 0 {
|
||||
// Make a copy of tss before applying relabeling in order to prevent
|
||||
// from affecting time series for other remoteWrite.url configs.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467 for details.
|
||||
rwctx.tss = append(rwctx.tss[:0], tss...)
|
||||
tss = rwctx.tss
|
||||
rctx = getRelabelCtx()
|
||||
tssLen := len(tss)
|
||||
tss = rctx.applyRelabeling(tss, nil, rwctx.prcs)
|
||||
tss = rctx.applyRelabeling(tss, nil, prcs)
|
||||
rwctx.relabelMetricsDropped.Add(tssLen - len(tss))
|
||||
}
|
||||
pss := rwctx.pss
|
||||
@@ -191,5 +226,7 @@ func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
|
||||
pss[idx].Push(tss)
|
||||
if rctx != nil {
|
||||
putRelabelCtx(rctx)
|
||||
// Zero rwctx.tss in order to free up GC references.
|
||||
rwctx.tss = prompbmarshal.ResetTimeSeries(rwctx.tss)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,12 +4,17 @@ import (
|
||||
"net"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
|
||||
"github.com/VictoriaMetrics/fasthttp"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
func statDial(addr string) (net.Conn, error) {
|
||||
conn, err := fasthttp.Dial(addr)
|
||||
func statDial(addr string) (conn net.Conn, err error) {
|
||||
if netutil.TCP6Enabled() {
|
||||
conn, err = fasthttp.DialDualStack(addr)
|
||||
} else {
|
||||
conn, err = fasthttp.Dial(addr)
|
||||
}
|
||||
dialsTotal.Inc()
|
||||
if err != nil {
|
||||
dialErrors.Inc()
|
||||
|
||||
@@ -52,14 +52,18 @@ publish-vmalert:
|
||||
APP_NAME=vmalert $(MAKE) publish-via-docker
|
||||
|
||||
test-vmalert:
|
||||
go test -race -cover ./app/vmalert
|
||||
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
|
||||
go test -v -race -cover ./app/vmalert/datasource
|
||||
go test -v -race -cover ./app/vmalert/notifier
|
||||
go test -v -race -cover ./app/vmalert/config
|
||||
|
||||
run-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/testdata/rules0-good.rules \
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-notifier.url=http://localhost:9093 \
|
||||
-remotewrite.url=http://localhost:8428 \
|
||||
-remoteread.url=http://localhost:8428 \
|
||||
-notifier.url=http://127.0.0.1:9093 \
|
||||
-remoteWrite.url=http://localhost:8428 \
|
||||
-remoteRead.url=http://localhost:8428 \
|
||||
-evaluationInterval=3s
|
||||
|
||||
vmalert-amd64:
|
||||
|
||||
@@ -1,19 +1,27 @@
|
||||
## VM Alert
|
||||
## vmalert
|
||||
|
||||
`vmalert` executes a list of given MetricsQL expressions (rules) and
|
||||
sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
|
||||
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
rules against configured address.
|
||||
|
||||
### Features:
|
||||
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
|
||||
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
expressions validation;
|
||||
support and expressions validation;
|
||||
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
|
||||
support;
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
|
||||
* Lightweight without extra dependencies.
|
||||
|
||||
### TODO:
|
||||
* Configuration hot reload.
|
||||
### Limitations:
|
||||
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
|
||||
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
|
||||
may fail;
|
||||
* by default, rules execution is sequential within one group, but persisting of execution results to remote
|
||||
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
|
||||
recording rule is reused in next one;
|
||||
* there is no `query` function support in templates yet;
|
||||
* `vmalert` has no UI, just an API for getting groups and rules statuses.
|
||||
|
||||
### QuickStart
|
||||
|
||||
@@ -26,10 +34,12 @@ make vmalert
|
||||
The build binary will be placed to `VictoriaMetrics/bin` folder.
|
||||
|
||||
To start using `vmalert` you will need the following things:
|
||||
* list of alert rules - PromQL/MetricsQL expressions to execute;
|
||||
* list of rules - PromQL/MetricsQL expressions to execute;
|
||||
* datasource address - reachable VictoriaMetrics instance for rules execution;
|
||||
* notifier address - reachable Alertmanager instance for processing,
|
||||
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
aggregating alerts and sending notifications.
|
||||
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
|
||||
compatible storage address for storing recording rules results and alerts state in for of timeseries. This is optional.
|
||||
|
||||
Then configure `vmalert` accordingly:
|
||||
```
|
||||
@@ -38,20 +48,105 @@ Then configure `vmalert` accordingly:
|
||||
-notifier.url=http://localhost:9093
|
||||
```
|
||||
|
||||
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata/rules0-good.rules)
|
||||
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
|
||||
similar to Prometheus rules and configured using YAML. Configuration examples may be found
|
||||
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
|
||||
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
|
||||
```yaml
|
||||
groups:
|
||||
[ - <rule_group> ]
|
||||
```
|
||||
|
||||
`vmalert` runs evaluation for every group in a separate goroutine.
|
||||
Rules in group evaluated one-by-one sequentially.
|
||||
#### Groups
|
||||
|
||||
`vmalert` also runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
Each group has following attributes:
|
||||
```yaml
|
||||
# The name of the group. Must be unique within a file.
|
||||
name: <string>
|
||||
|
||||
# How often rules in the group are evaluated.
|
||||
[ interval: <duration> | default = global.evaluation_interval ]
|
||||
|
||||
# How many rules execute at once. Increasing concurrency may speed
|
||||
# up round execution speed.
|
||||
[ concurrency: <integer> | default = 1 ]
|
||||
|
||||
rules:
|
||||
[ - <rule> ... ]
|
||||
```
|
||||
|
||||
#### Rules
|
||||
|
||||
There are two types of Rules:
|
||||
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
|
||||
Alerting rules allows to define alert conditions via [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
|
||||
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
|
||||
Recording rules allow you to precompute frequently needed or computationally expensive expressions
|
||||
and save their result as a new set of time series.
|
||||
|
||||
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
|
||||
within one group.
|
||||
|
||||
##### Alerting rules
|
||||
|
||||
The syntax for alerting rule is following:
|
||||
```yaml
|
||||
# The name of the alert. Must be a valid metric name.
|
||||
alert: <string>
|
||||
|
||||
# The MetricsQL expression to evaluate.
|
||||
expr: <string>
|
||||
|
||||
# Alerts are considered firing once they have been returned for this long.
|
||||
# Alerts which have not yet fired for long enough are considered pending.
|
||||
[ for: <duration> | default = 0s ]
|
||||
|
||||
# Labels to add or overwrite for each alert.
|
||||
labels:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
|
||||
# Annotations to add to each alert.
|
||||
annotations:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
```
|
||||
|
||||
`vmalert` has no local storage and alerts state is stored in process memory. Hence, after reloading of `vmalert` process
|
||||
alerts state will be lost. To avoid this situation, `vmalert` may be configured via following flags:
|
||||
* `-remoteWrite.url` - URL to Victoria Metrics or VMInsert. `vmalert` will persist alerts state into the configured
|
||||
address in form of timeseries with name `ALERTS` via remote-write protocol.
|
||||
* `-remoteRead.url` - URL to Victoria Metrics or VMSelect. `vmalert` will try to restore alerts state from configured
|
||||
address by querying `ALERTS` timeseries.
|
||||
|
||||
|
||||
##### Recording rules
|
||||
|
||||
The syntax for recording rules is following:
|
||||
```yaml
|
||||
# The name of the time series to output to. Must be a valid metric name.
|
||||
record: <string>
|
||||
|
||||
# The MetricsQL expression to evaluate.
|
||||
expr: <string>
|
||||
|
||||
# Labels to add or overwrite before storing the result.
|
||||
labels:
|
||||
[ <labelname>: <labelvalue> ]
|
||||
```
|
||||
|
||||
For recording rules to work `-remoteWrite.url` must specified.
|
||||
|
||||
|
||||
#### WEB
|
||||
|
||||
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
|
||||
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
|
||||
Used as alert source in AlertManager.
|
||||
* `http://<vmalert-addr>/metrics` - application metrics.
|
||||
|
||||
`vmalert` may be configured with `-remotewrite` flag to write alerts state in form of timeseries
|
||||
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
|
||||
may be used to recover alerts state on `vmalert` restarts if `-remoteread` is configured.
|
||||
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
|
||||
|
||||
|
||||
### Configuration
|
||||
@@ -63,37 +158,79 @@ Usage of vmalert:
|
||||
Optional basic auth password for -datasource.url
|
||||
-datasource.basicAuth.username string
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used.
|
||||
-datasource.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url.
|
||||
-datasource.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
-datasource.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url.
|
||||
-datasource.tlsServerName value
|
||||
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used.
|
||||
-datasource.url string
|
||||
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules. Default 1m (default 1m0s)
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
Address to listen for http connections (default ":8880")
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-notifier.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used.
|
||||
-notifier.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -notifier.url.
|
||||
-notifier.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -notifier.url
|
||||
-notifier.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -notifier.url.
|
||||
-notifier.tlsServerName value
|
||||
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used.
|
||||
-notifier.url string
|
||||
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
|
||||
-remoteread.basicAuth.password string
|
||||
Optional basic auth password for -remoteread.url
|
||||
-remoteread.basicAuth.username string
|
||||
Optional basic auth username for -remoteread.url
|
||||
-remoteread.lookback duration
|
||||
-remoteRead.basicAuth.password string
|
||||
Optional basic auth password for -remoteRead.url
|
||||
-remoteRead.basicAuth.username string
|
||||
Optional basic auth username for -remoteRead.url
|
||||
-remoteRead.lookback duration
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
-remoteread.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remotewrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remotewrite.basicAuth.password string
|
||||
Optional basic auth password for -remotewrite.url
|
||||
-remotewrite.basicAuth.username string
|
||||
Optional basic auth username for -remotewrite.url
|
||||
-remotewrite.url string
|
||||
-remoteRead.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used.
|
||||
-remoteRead.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url.
|
||||
-remoteRead.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteRead.url
|
||||
-remoteRead.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url.
|
||||
-remoteRead.tlsServerName value
|
||||
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used.
|
||||
-remoteRead.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remoteWrite.basicAuth.password string
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.username string
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of readers that concurrently write into remote storage (default 1)
|
||||
-remoteWrite.flushInterval duration
|
||||
Defines interval of flushes to remote write endpoint (default 5s)
|
||||
-remoteWrite.maxBatchSize int
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
-remoteWrite.maxQueueSize int
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
-remoteWrite.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used.
|
||||
-remoteWrite.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url.
|
||||
-remoteWrite.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url.
|
||||
-remoteWrite.tlsServerName value
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used.
|
||||
-remoteWrite.url string
|
||||
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
|
||||
-rule value
|
||||
Path to the file with alert rules.
|
||||
@@ -102,15 +239,39 @@ Usage of vmalert:
|
||||
-rule /path/to/file. Path to a single file with alerting rules
|
||||
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
-rule.validateExpressions
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
-rule.validateTemplates
|
||||
Indicates to validate annotation and label templates (default true)
|
||||
Whether to validate annotation and label templates (default true)
|
||||
```
|
||||
|
||||
Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
command-line flags with their descriptions.
|
||||
|
||||
To reload configuration without `vmalert` restart send SIGHUP signal
|
||||
or send GET request to `/-/reload` endpoint.
|
||||
|
||||
### Contributing
|
||||
|
||||
`vmalert` is mostly designed and built by VictoriaMetrics community.
|
||||
Feel free to share your experience and ideas for improving this
|
||||
software. Please keep simplicity as the main priority.
|
||||
|
||||
### How to build from sources
|
||||
|
||||
It is recommended using
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
- `vmalert` is located in `vmutils-*` archives there.
|
||||
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmalert` from the root folder of the repository.
|
||||
It builds `vmalert` binary and puts it into the `bin` folder.
|
||||
|
||||
#### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmalert-prod` from the root folder of the repository.
|
||||
It builds `vmalert-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
374
app/vmalert/alerting.go
Normal file
374
app/vmalert/alerting.go
Normal file
@@ -0,0 +1,374 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// AlertingRule is basic alert entity
|
||||
type AlertingRule struct {
|
||||
RuleID uint64
|
||||
Name string
|
||||
Expr string
|
||||
For time.Duration
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
GroupID uint64
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
func newAlertingRule(gID uint64, cfg config.Rule) *AlertingRule {
|
||||
return &AlertingRule{
|
||||
RuleID: cfg.ID,
|
||||
Name: cfg.Alert,
|
||||
Expr: cfg.Expr,
|
||||
For: cfg.For,
|
||||
Labels: cfg.Labels,
|
||||
Annotations: cfg.Annotations,
|
||||
GroupID: gID,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
}
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
func (ar *AlertingRule) String() string {
|
||||
return ar.Name
|
||||
}
|
||||
|
||||
// ID returns unique Rule ID
|
||||
// within the parent Group.
|
||||
func (ar *AlertingRule) ID() uint64 {
|
||||
return ar.RuleID
|
||||
}
|
||||
|
||||
// Exec executes AlertingRule expression via the given Querier.
|
||||
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
||||
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := q.Query(ctx, ar.Expr)
|
||||
ar.mu.Lock()
|
||||
defer ar.mu.Unlock()
|
||||
|
||||
ar.lastExecError = err
|
||||
ar.lastExecTime = time.Now()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||
}
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
if a.State == notifier.StateInactive {
|
||||
delete(ar.alerts, h)
|
||||
}
|
||||
}
|
||||
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
h := hash(m)
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
if a.Value != m.Value {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Value
|
||||
// and re-exec template since Value can be used
|
||||
// in templates
|
||||
err = ar.template(a)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := ar.newAlert(m, ar.lastExecTime)
|
||||
if err != nil {
|
||||
ar.lastExecError = err
|
||||
return nil, fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
ar.alerts[h] = a
|
||||
}
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// if alert wasn't updated in this iteration
|
||||
// means it is resolved already
|
||||
if _, ok := updated[h]; !ok {
|
||||
if a.State == notifier.StatePending {
|
||||
// alert was in Pending state - it is not
|
||||
// active anymore
|
||||
delete(ar.alerts, h)
|
||||
continue
|
||||
}
|
||||
a.State = notifier.StateInactive
|
||||
continue
|
||||
}
|
||||
if a.State == notifier.StatePending && time.Since(a.Start) >= ar.For {
|
||||
a.State = notifier.StateFiring
|
||||
alertsFired.Inc()
|
||||
}
|
||||
}
|
||||
if series {
|
||||
return ar.toTimeSeries(ar.lastExecTime), nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
ts := ar.alertToTimeSeries(a, timestamp)
|
||||
tss = append(tss, ts...)
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (ar *AlertingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
||||
}
|
||||
ar.Expr = nr.Expr
|
||||
ar.For = nr.For
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(m datasource.Metric) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := m.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifier.Alert, error) {
|
||||
a := ¬ifier.Alert{
|
||||
GroupID: ar.GroupID,
|
||||
Name: ar.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Value,
|
||||
Start: start,
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
a.Labels[l.Name] = l.Value
|
||||
}
|
||||
return a, ar.template(a)
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) template(a *notifier.Alert) error {
|
||||
// 1. template rule labels with data labels
|
||||
rLabels, err := a.ExecTemplate(ar.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 2. merge data labels and rule labels
|
||||
// metric labels may be overridden by
|
||||
// rule labels
|
||||
for k, v := range rLabels {
|
||||
a.Labels[k] = v
|
||||
}
|
||||
|
||||
// 3. template merged labels
|
||||
a.Labels, err = a.ExecTemplate(a.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
a.Annotations, err = a.ExecTemplate(ar.Annotations)
|
||||
return err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIAlertingRule
|
||||
func (ar *AlertingRule) RuleAPI() APIAlertingRule {
|
||||
var lastErr string
|
||||
if ar.lastExecError != nil {
|
||||
lastErr = ar.lastExecError.Error()
|
||||
}
|
||||
return APIAlertingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
Name: ar.Name,
|
||||
Expression: ar.Expr,
|
||||
For: ar.For.String(),
|
||||
LastError: lastErr,
|
||||
LastExec: ar.lastExecTime,
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
}
|
||||
}
|
||||
|
||||
// AlertsAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.mu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.mu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
return &APIAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: ar.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.Start,
|
||||
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
// AlertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
// AlertForStateMetricName is the metric name for 'for' state of alert.
|
||||
alertForStateMetricName = "ALERTS_FOR_STATE"
|
||||
|
||||
// AlertNameLabel is the label name indicating the name of an alert.
|
||||
alertNameLabel = "alertname"
|
||||
// AlertStateLabel is the label name indicating the state of an alert.
|
||||
alertStateLabel = "alertstate"
|
||||
)
|
||||
|
||||
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(ar.Name, a, timestamp))
|
||||
if ar.For > 0 {
|
||||
tss = append(tss, alertForToTimeSeries(ar.Name, a, timestamp))
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertMetricName
|
||||
labels[alertNameLabel] = name
|
||||
labels[alertStateLabel] = a.State.String()
|
||||
return newTimeSeries(1, labels, timestamp)
|
||||
}
|
||||
|
||||
// alertForToTimeSeries returns a timeseries that represents
|
||||
// state of active alerts, where value is time when alert become active
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
labels[alertNameLabel] = name
|
||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||
// to be updated on next Exec, as well as Value field.
|
||||
// Only rules with For > 0 will be restored.
|
||||
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
if q == nil {
|
||||
return fmt.Errorf("querier is nil")
|
||||
}
|
||||
// Get the last datapoint in range via MetricsQL `last_over_time`.
|
||||
// We don't use plain PromQL since Prometheus doesn't support
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
|
||||
alertForStateMetricName, ar.Name, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
labels := m.Labels
|
||||
m.Labels = make([]datasource.Label, 0)
|
||||
// drop all extra labels, so hash key will
|
||||
// be identical to timeseries received in Exec
|
||||
for _, l := range labels {
|
||||
if l.Name == alertNameLabel {
|
||||
continue
|
||||
}
|
||||
// drop all overridden labels
|
||||
if _, ok := ar.Labels[l.Name]; ok {
|
||||
continue
|
||||
}
|
||||
m.Labels = append(m.Labels, l)
|
||||
}
|
||||
|
||||
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
a.ID = hash(m)
|
||||
a.State = notifier.StatePending
|
||||
ar.alerts[a.ID] = a
|
||||
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -10,30 +10,15 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestRule_Validate(t *testing.T) {
|
||||
if err := (&Rule{}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty name error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert"}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty expr error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert", Expr: "test{"}).Validate(); err == nil {
|
||||
t.Errorf("exptected invalid expr error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert", Expr: "test>0"}).Validate(); err != nil {
|
||||
t.Errorf("exptected valid rule got %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
alert *notifier.Alert
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
newTestRule("instant", 0),
|
||||
newTestAlertingRule("instant", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -44,7 +29,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("instant extra labels", 0),
|
||||
newTestAlertingRule("instant extra labels", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
||||
"job": "foo",
|
||||
"instance": "bar",
|
||||
@@ -60,7 +45,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("instant labels override", 0),
|
||||
newTestAlertingRule("instant labels override", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
||||
alertStateLabel: "foo",
|
||||
"__name__": "bar",
|
||||
@@ -74,7 +59,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for", time.Second),
|
||||
newTestAlertingRule("for", time.Second),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -89,7 +74,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for pending", 10*time.Second),
|
||||
newTestAlertingRule("for pending", 10*time.Second),
|
||||
¬ifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -106,58 +91,29 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
tss := tc.rule.AlertToTimeSeries(tc.alert, timestamp)
|
||||
if len(tc.expTS) != len(tss) {
|
||||
t.Fatalf("expected number of timeseries %d; got %d", len(tc.expTS), len(tss))
|
||||
}
|
||||
for i := range tc.expTS {
|
||||
expTS, gotTS := tc.expTS[i], tss[i]
|
||||
if len(expTS.Samples) != len(gotTS.Samples) {
|
||||
t.Fatalf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
|
||||
}
|
||||
for i, exp := range expTS.Samples {
|
||||
got := gotTS.Samples[i]
|
||||
if got.Value != exp.Value {
|
||||
t.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
|
||||
}
|
||||
if got.Timestamp != exp.Timestamp {
|
||||
t.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
|
||||
}
|
||||
}
|
||||
if len(expTS.Labels) != len(gotTS.Labels) {
|
||||
t.Fatalf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
|
||||
}
|
||||
for i, exp := range expTS.Labels {
|
||||
got := gotTS.Labels[i]
|
||||
if got.Name != exp.Name {
|
||||
t.Errorf("expected label name %q; got %q", exp.Name, got.Name)
|
||||
}
|
||||
if got.Value != exp.Value {
|
||||
t.Errorf("expected label value %q; got %q", exp.Value, got.Value)
|
||||
}
|
||||
}
|
||||
tc.rule.alerts[tc.alert.ID] = tc.alert
|
||||
tss := tc.rule.toTimeSeries(timestamp)
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func newTestRule(name string, waitFor time.Duration) *Rule {
|
||||
return &Rule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
|
||||
}
|
||||
|
||||
func TestRule_Exec(t *testing.T) {
|
||||
func TestAlertingRule_Exec(t *testing.T) {
|
||||
const defaultStep = 5 * time.Millisecond
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
steps [][]datasource.Metric
|
||||
expAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
{
|
||||
newTestRule("empty", 0),
|
||||
newTestAlertingRule("empty", 0),
|
||||
[][]datasource.Metric{},
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestRule("empty labels", 0),
|
||||
newTestAlertingRule("empty labels", 0),
|
||||
[][]datasource.Metric{
|
||||
{datasource.Metric{}},
|
||||
},
|
||||
@@ -166,7 +122,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing", 0),
|
||||
newTestAlertingRule("single-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
@@ -175,7 +131,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive", 0),
|
||||
newTestAlertingRule("single-firing=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -185,7 +141,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -196,7 +152,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -208,7 +164,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -219,7 +175,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -233,7 +189,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("multiple-firing", 0),
|
||||
newTestAlertingRule("multiple-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{
|
||||
metricWithLabels(t, "name", "foo"),
|
||||
@@ -248,7 +204,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("multiple-steps-firing", 0),
|
||||
newTestAlertingRule("multiple-steps-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo1")},
|
||||
@@ -263,7 +219,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("duplicate", 0),
|
||||
newTestAlertingRule("duplicate", 0),
|
||||
[][]datasource.Metric{
|
||||
{
|
||||
// metrics with the same labelset should result in one alert
|
||||
@@ -276,7 +232,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending", time.Minute),
|
||||
newTestAlertingRule("for-pending", time.Minute),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
@@ -285,7 +241,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-fired", time.Millisecond),
|
||||
newTestAlertingRule("for-fired", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -295,7 +251,17 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>inactive", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>empty", time.Second),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset and delete pending alerts
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending=>firing=>inactive", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -307,19 +273,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive", time.Millisecond),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset pending alerts
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive=>pending", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>firing=>inactive=>pending", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -332,7 +286,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive=>pending=>firing", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -350,15 +304,15 @@ func TestRule_Exec(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.group = fakeGroup
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
for _, step := range tc.steps {
|
||||
fq.reset()
|
||||
fq.add(step...)
|
||||
if err := tc.rule.Exec(context.TODO(), fq); err != nil {
|
||||
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
// artificial delay between applying steps
|
||||
time.Sleep(time.Millisecond)
|
||||
time.Sleep(defaultStep)
|
||||
}
|
||||
if len(tc.rule.alerts) != len(tc.expAlerts) {
|
||||
t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
|
||||
@@ -376,40 +330,9 @@ func TestRule_Exec(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
type fakeQuerier struct {
|
||||
metrics []datasource.Metric
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) reset() {
|
||||
fq.metrics = fq.metrics[:0]
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
}
|
||||
|
||||
func (fq fakeQuerier) Query(ctx context.Context, query string) ([]datasource.Metric, error) {
|
||||
return fq.metrics, nil
|
||||
}
|
||||
|
||||
func TestRule_Restore(t *testing.T) {
|
||||
func TestAlertingRule_Restore(t *testing.T) {
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
metrics []datasource.Metric
|
||||
expAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
@@ -494,7 +417,7 @@ func TestRule_Restore(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.group = fakeGroup
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
fq.add(tc.metrics...)
|
||||
if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
@@ -518,8 +441,8 @@ func TestRule_Restore(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func newTestRuleWithLabels(name string, labels ...string) *Rule {
|
||||
r := newTestRule(name, 0)
|
||||
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
|
||||
r := newTestAlertingRule(name, 0)
|
||||
r.Labels = make(map[string]string)
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
r.Labels[labels[i]] = labels[i+1]
|
||||
@@ -527,9 +450,6 @@ func newTestRuleWithLabels(name string, labels ...string) *Rule {
|
||||
return r
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Value = value
|
||||
return m
|
||||
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
||||
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
|
||||
}
|
||||
@@ -1,70 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gopkg.in/yaml.v2"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
// Parse parses rule configs from given file patterns
|
||||
func Parse(pathPatterns []string, validateAnnotations bool) ([]Group, error) {
|
||||
var fp []string
|
||||
for _, pattern := range pathPatterns {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading file patther %s:%v", pattern, err)
|
||||
}
|
||||
fp = append(fp, matches...)
|
||||
}
|
||||
var groups []Group
|
||||
for _, file := range fp {
|
||||
groupsNames := map[string]struct{}{}
|
||||
gr, err := parseFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("file %s: %w", file, err)
|
||||
}
|
||||
for _, group := range gr {
|
||||
if _, ok := groupsNames[group.Name]; ok {
|
||||
return nil, fmt.Errorf("one file can not contain groups with the same name %s, filepath:%s", file, group.Name)
|
||||
}
|
||||
groupsNames[group.Name] = struct{}{}
|
||||
for _, rule := range group.Rules {
|
||||
if err = rule.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("invalid rule filepath:%s, group %s:%w", file, group.Name, err)
|
||||
}
|
||||
// TODO: this init looks weird here
|
||||
rule.alerts = make(map[uint64]*notifier.Alert)
|
||||
if validateAnnotations {
|
||||
if err = notifier.ValidateTemplates(rule.Annotations); err != nil {
|
||||
return nil, fmt.Errorf("invalid annotations filepath:%s, group %s:%w", file, group.Name, err)
|
||||
}
|
||||
if err = notifier.ValidateTemplates(rule.Labels); err != nil {
|
||||
return nil, fmt.Errorf("invalid labels filepath:%s, group %s:%w", file, group.Name, err)
|
||||
}
|
||||
}
|
||||
rule.group = group
|
||||
}
|
||||
}
|
||||
groups = append(groups, gr...)
|
||||
}
|
||||
if len(groups) < 1 {
|
||||
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
|
||||
}
|
||||
return groups, nil
|
||||
}
|
||||
|
||||
func parseFile(path string) ([]Group, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading alert rule file: %w", err)
|
||||
}
|
||||
g := struct {
|
||||
Groups []Group `yaml:"groups"`
|
||||
}{}
|
||||
err = yaml.Unmarshal(data, &g)
|
||||
return g.Groups, err
|
||||
}
|
||||
195
app/vmalert/config/config.go
Normal file
195
app/vmalert/config/config.go
Normal file
@@ -0,0 +1,195 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
// Group contains list of Rules grouped into
|
||||
// entity with one name and evaluation interval
|
||||
type Group struct {
|
||||
File string
|
||||
Name string `yaml:"name"`
|
||||
Interval time.Duration `yaml:"interval,omitempty"`
|
||||
Rules []Rule `yaml:"rules"`
|
||||
Concurrency int `yaml:"concurrency"`
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// Validate check for internal Group or Rule configuration errors
|
||||
func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
|
||||
if g.Name == "" {
|
||||
return fmt.Errorf("group name must be set")
|
||||
}
|
||||
if len(g.Rules) == 0 {
|
||||
return fmt.Errorf("group %q can't contain no rules", g.Name)
|
||||
}
|
||||
uniqueRules := map[uint64]struct{}{}
|
||||
for _, r := range g.Rules {
|
||||
ruleName := r.Record
|
||||
if r.Alert != "" {
|
||||
ruleName = r.Alert
|
||||
}
|
||||
if _, ok := uniqueRules[r.ID]; ok {
|
||||
return fmt.Errorf("rule %q duplicate", ruleName)
|
||||
}
|
||||
uniqueRules[r.ID] = struct{}{}
|
||||
if err := r.Validate(); err != nil {
|
||||
return fmt.Errorf("invalid rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
if validateExpressions {
|
||||
if _, err := metricsql.Parse(r.Expr); err != nil {
|
||||
return fmt.Errorf("invalid expression for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
}
|
||||
if validateAnnotations {
|
||||
if err := notifier.ValidateTemplates(r.Annotations); err != nil {
|
||||
return fmt.Errorf("invalid annotations for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
if err := notifier.ValidateTemplates(r.Labels); err != nil {
|
||||
return fmt.Errorf("invalid labels for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return checkOverflow(g.XXX, fmt.Sprintf("group %q", g.Name))
|
||||
}
|
||||
|
||||
// Rule describes entity that represent either
|
||||
// recording rule or alerting rule.
|
||||
type Rule struct {
|
||||
ID uint64
|
||||
Record string `yaml:"record,omitempty"`
|
||||
Alert string `yaml:"alert,omitempty"`
|
||||
Expr string `yaml:"expr"`
|
||||
For time.Duration `yaml:"for,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
type rule Rule
|
||||
if err := unmarshal((*rule)(r)); err != nil {
|
||||
return err
|
||||
}
|
||||
r.ID = HashRule(*r)
|
||||
return nil
|
||||
}
|
||||
|
||||
// HashRule hashes significant Rule fields into
|
||||
// unique hash value
|
||||
func HashRule(r Rule) uint64 {
|
||||
h := fnv.New64a()
|
||||
h.Write([]byte(r.Expr))
|
||||
if r.Record != "" {
|
||||
h.Write([]byte("recording"))
|
||||
h.Write([]byte(r.Record))
|
||||
} else {
|
||||
h.Write([]byte("alerting"))
|
||||
h.Write([]byte(r.Alert))
|
||||
}
|
||||
type item struct {
|
||||
key, value string
|
||||
}
|
||||
var kv []item
|
||||
for k, v := range r.Labels {
|
||||
kv = append(kv, item{key: k, value: v})
|
||||
}
|
||||
sort.Slice(kv, func(i, j int) bool {
|
||||
return kv[i].key < kv[j].key
|
||||
})
|
||||
for _, i := range kv {
|
||||
h.Write([]byte(i.key))
|
||||
h.Write([]byte(i.value))
|
||||
h.Write([]byte("\xff"))
|
||||
}
|
||||
return h.Sum64()
|
||||
}
|
||||
|
||||
// Validate check for Rule configuration errors
|
||||
func (r *Rule) Validate() error {
|
||||
if (r.Record == "" && r.Alert == "") || (r.Record != "" && r.Alert != "") {
|
||||
return fmt.Errorf("either `record` or `alert` must be set")
|
||||
}
|
||||
if r.Expr == "" {
|
||||
return fmt.Errorf("expression can't be empty")
|
||||
}
|
||||
return checkOverflow(r.XXX, "rule")
|
||||
}
|
||||
|
||||
// Parse parses rule configs from given file patterns
|
||||
func Parse(pathPatterns []string, validateAnnotations, validateExpressions bool) ([]Group, error) {
|
||||
var fp []string
|
||||
for _, pattern := range pathPatterns {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading file pattern %s: %w", pattern, err)
|
||||
}
|
||||
fp = append(fp, matches...)
|
||||
}
|
||||
var groups []Group
|
||||
for _, file := range fp {
|
||||
uniqueGroups := map[string]struct{}{}
|
||||
gr, err := parseFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse file %q: %w", file, err)
|
||||
}
|
||||
for _, g := range gr {
|
||||
if err := g.Validate(validateAnnotations, validateExpressions); err != nil {
|
||||
return nil, fmt.Errorf("invalid group %q in file %q: %w", g.Name, file, err)
|
||||
}
|
||||
if _, ok := uniqueGroups[g.Name]; ok {
|
||||
return nil, fmt.Errorf("group name %q duplicate in file %q", g.Name, file)
|
||||
}
|
||||
uniqueGroups[g.Name] = struct{}{}
|
||||
g.File = file
|
||||
groups = append(groups, g)
|
||||
}
|
||||
}
|
||||
if len(groups) < 1 {
|
||||
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
|
||||
}
|
||||
return groups, nil
|
||||
}
|
||||
|
||||
func parseFile(path string) ([]Group, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading alert rule file: %w", err)
|
||||
}
|
||||
g := struct {
|
||||
Groups []Group `yaml:"groups"`
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}{}
|
||||
err = yaml.Unmarshal(data, &g)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return g.Groups, checkOverflow(g.XXX, "config")
|
||||
}
|
||||
|
||||
func checkOverflow(m map[string]interface{}, ctx string) error {
|
||||
if len(m) > 0 {
|
||||
var keys []string
|
||||
for k := range m {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
return fmt.Errorf("unknown fields in %s: %s", ctx, strings.Join(keys, ", "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
326
app/vmalert/config/config_test.go
Normal file
326
app/vmalert/config/config_test.go
Normal file
@@ -0,0 +1,326 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestParseGood(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true, true); err != nil {
|
||||
t.Errorf("error parsing files %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBad(t *testing.T) {
|
||||
testCases := []struct {
|
||||
path []string
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
[]string{"testdata/rules0-bad.rules"},
|
||||
"unexpected token",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules0-bad.rules"},
|
||||
"error parsing annotation",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules1-bad.rules"},
|
||||
"duplicate in file",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules2-bad.rules"},
|
||||
"function \"value\" not defined",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules3-bad.rules"},
|
||||
"either `record` or `alert` must be set",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules4-bad.rules"},
|
||||
"either `record` or `alert` must be set",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/*.yaml"},
|
||||
"no groups found",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
_, err := Parse(tc.path, true, true)
|
||||
if err == nil {
|
||||
t.Errorf("expected to get error")
|
||||
return
|
||||
}
|
||||
if !strings.Contains(err.Error(), tc.expErr) {
|
||||
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRule_Validate(t *testing.T) {
|
||||
if err := (&Rule{}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty name error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert"}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty expr error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert", Expr: "test>0"}).Validate(); err != nil {
|
||||
t.Errorf("exptected valid rule; got %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroup_Validate(t *testing.T) {
|
||||
testCases := []struct {
|
||||
group *Group
|
||||
rules []Rule
|
||||
validateAnnotations bool
|
||||
validateExpressions bool
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
group: &Group{},
|
||||
expErr: "group name must be set",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test"},
|
||||
expErr: "contain no rules",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Record: "record",
|
||||
Expr: "up | 0",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Record: "record",
|
||||
Expr: "up | 0",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "invalid expression",
|
||||
validateExpressions: true,
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "error parsing annotation",
|
||||
validateAnnotations: true,
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
},
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Record: "record", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Record: "record", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"description": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Record: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
err := tc.group.Validate(tc.validateAnnotations, tc.validateExpressions)
|
||||
if err == nil {
|
||||
if tc.expErr != "" {
|
||||
t.Errorf("expected to get err %q; got nil insted", tc.expErr)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if !strings.Contains(err.Error(), tc.expErr) {
|
||||
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHashRule(t *testing.T) {
|
||||
testCases := []struct {
|
||||
a, b Rule
|
||||
equal bool
|
||||
}{
|
||||
{
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
"foo": "bar",
|
||||
}},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", For: time.Minute},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 2"},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
"foo": "baz",
|
||||
}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
false,
|
||||
},
|
||||
}
|
||||
for i, tc := range testCases {
|
||||
aID, bID := HashRule(tc.a), HashRule(tc.b)
|
||||
if tc.equal != (aID == bID) {
|
||||
t.Fatalf("missmatch for rule %d", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ groups:
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
expr: "{{ $expr|queryEscape }}"
|
||||
annotations:
|
||||
summary: "{{ $value|humanize }}"
|
||||
description: "{{$labels}}"
|
||||
@@ -9,5 +9,3 @@ groups:
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
|
||||
|
||||
5
app/vmalert/config/testdata/dir/rules3-bad.rules
vendored
Normal file
5
app/vmalert/config/testdata/dir/rules3-bad.rules
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- for: 5m
|
||||
expr: vm_rows > 0
|
||||
7
app/vmalert/config/testdata/dir/rules4-bad.rules
vendored
Normal file
7
app/vmalert/config/testdata/dir/rules4-bad.rules
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: rows
|
||||
record: record
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
7
app/vmalert/config/testdata/dir/rules5-bad.rules
vendored
Normal file
7
app/vmalert/config/testdata/dir/rules5-bad.rules
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: rows
|
||||
expr: vm_rows > 0
|
||||
- record: rows
|
||||
expr: sum(vm_rows)
|
||||
1727
app/vmalert/config/testdata/kube-good.rules
vendored
Normal file
1727
app/vmalert/config/testdata/kube-good.rules
vendored
Normal file
File diff suppressed because it is too large
Load Diff
11
app/vmalert/config/testdata/rules1-good.rules
vendored
Normal file
11
app/vmalert/config/testdata/rules1-good.rules
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
groups:
|
||||
- name: groupTest
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 1ms
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
35
app/vmalert/config/testdata/rules2-good.rules
vendored
Normal file
35
app/vmalert/config/testdata/rules2-good.rules
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
groups:
|
||||
- name: TestGroup
|
||||
interval: 2s
|
||||
concurrency: 2
|
||||
rules:
|
||||
- alert: Conns
|
||||
expr: sum(vm_tcplistener_conns) by(instance) > 1
|
||||
for: 3m
|
||||
annotations:
|
||||
summary: "Too high connection number for {{$labels.instance}}"
|
||||
description: "It is {{ $value }} connections for {{$labels.instance}}"
|
||||
- alert: ExampleAlertAlwaysFiring
|
||||
expr: sum by(job)
|
||||
(up == 1)
|
||||
- record: handler:requests:rate5m
|
||||
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
|
||||
labels:
|
||||
recording: true
|
||||
- record: code:requests:rate5m
|
||||
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
|
||||
labels:
|
||||
env: dev
|
||||
recording: true
|
||||
- record: code:requests:rate5m
|
||||
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
|
||||
labels:
|
||||
env: staging
|
||||
recording: true
|
||||
- record: successful_requests:ratio_rate5m
|
||||
labels:
|
||||
recording: true
|
||||
expr: |2
|
||||
sum(code:requests:rate5m{code="200"})
|
||||
/
|
||||
sum(code:requests:rate5m)
|
||||
@@ -1,39 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestParseGood(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true); err != nil {
|
||||
t.Errorf("error parsing files %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBad(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/rules0-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected syntaxt error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules0-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected template annotation error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules1-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected same group error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules2-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected template label error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/*.yaml"}, true); err == nil {
|
||||
t.Errorf("expected empty group")
|
||||
}
|
||||
}
|
||||
38
app/vmalert/datasource/init.go
Normal file
38
app/vmalert/datasource/init.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package datasource
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
|
||||
tlsCertFile = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
|
||||
tlsKeyFile = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
|
||||
tlsCAFile = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
|
||||
"By default the server name from -datasource.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Querier from provided flag values.
|
||||
func Init() (Querier, error) {
|
||||
if *addr == "" {
|
||||
flag.PrintDefaults()
|
||||
return nil, fmt.Errorf("datasource.url is empty")
|
||||
}
|
||||
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &http.Client{Transport: tr}
|
||||
return NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, c), nil
|
||||
}
|
||||
@@ -32,7 +32,7 @@ func (r response) metrics() ([]Metric, error) {
|
||||
for i, res := range r.Data.Result {
|
||||
f, err = strconv.ParseFloat(res.TV[1].(string), 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %s", res, res.TV[1], err)
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
|
||||
}
|
||||
m.Labels = nil
|
||||
for k, v := range r.Data.Result[i].Labels {
|
||||
@@ -49,9 +49,10 @@ const queryPath = "/api/v1/query?query="
|
||||
|
||||
// VMStorage represents vmstorage entity with ability to read and write metrics
|
||||
type VMStorage struct {
|
||||
c *http.Client
|
||||
queryURL string
|
||||
basicAuthUser, basicAuthPass string
|
||||
c *http.Client
|
||||
queryURL string
|
||||
basicAuthUser string
|
||||
basicAuthPass string
|
||||
}
|
||||
|
||||
// NewVMStorage is a constructor for VMStorage
|
||||
@@ -79,25 +80,25 @@ func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
|
||||
}
|
||||
resp, err := s.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting response from %s:%s", req.URL, err)
|
||||
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := ioutil.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("datasource returns unxeprected response code %d for %s with err %s. Reponse body %s", resp.StatusCode, req.URL, err, body)
|
||||
return nil, fmt.Errorf("datasource returns unexpected response code %d for %s with err %w. Reponse body %s", resp.StatusCode, req.URL, err, body)
|
||||
}
|
||||
r := &response{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return nil, fmt.Errorf("error parsing metrics for %s:%s", req.URL, err)
|
||||
return nil, fmt.Errorf("error parsing metrics for %s: %w", req.URL, err)
|
||||
}
|
||||
if r.Status == statusError {
|
||||
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
|
||||
}
|
||||
if r.Status != statusSuccess {
|
||||
return nil, fmt.Errorf("unkown status:%s, Expected success or error ", r.Status)
|
||||
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
|
||||
}
|
||||
if r.Data.ResultType != rtVector {
|
||||
return nil, fmt.Errorf("unkown restul type:%s. Expected vector", r.Data.ResultType)
|
||||
return nil, fmt.Errorf("unknown restul type:%s. Expected vector", r.Data.ResultType)
|
||||
}
|
||||
return r.metrics()
|
||||
}
|
||||
|
||||
300
app/vmalert/group.go
Normal file
300
app/vmalert/group.go
Normal file
@@ -0,0 +1,300 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
type Group struct {
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Interval time.Duration
|
||||
Concurrency int
|
||||
|
||||
doneCh chan struct{}
|
||||
finishedCh chan struct{}
|
||||
// channel accepts new Group obj
|
||||
// which supposed to update current group
|
||||
updateCh chan *Group
|
||||
}
|
||||
|
||||
func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
|
||||
g := &Group{
|
||||
Name: cfg.Name,
|
||||
File: cfg.File,
|
||||
Interval: cfg.Interval,
|
||||
Concurrency: cfg.Concurrency,
|
||||
doneCh: make(chan struct{}),
|
||||
finishedCh: make(chan struct{}),
|
||||
updateCh: make(chan *Group),
|
||||
}
|
||||
if g.Interval == 0 {
|
||||
g.Interval = defaultInterval
|
||||
}
|
||||
if g.Concurrency < 1 {
|
||||
g.Concurrency = 1
|
||||
}
|
||||
rules := make([]Rule, len(cfg.Rules))
|
||||
for i, r := range cfg.Rules {
|
||||
rules[i] = g.newRule(r)
|
||||
}
|
||||
g.Rules = rules
|
||||
return g
|
||||
}
|
||||
|
||||
func (g *Group) newRule(rule config.Rule) Rule {
|
||||
if rule.Alert != "" {
|
||||
return newAlertingRule(g.ID(), rule)
|
||||
}
|
||||
return newRecordingRule(g.ID(), rule)
|
||||
}
|
||||
|
||||
// ID return unique group ID that consists of
|
||||
// rules file and group name
|
||||
func (g *Group) ID() uint64 {
|
||||
hash := fnv.New64a()
|
||||
hash.Write([]byte(g.File))
|
||||
hash.Write([]byte("\xff"))
|
||||
hash.Write([]byte(g.Name))
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
// Restore restores alerts state for group rules
|
||||
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
for _, rule := range g.Rules {
|
||||
rr, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if rr.For < 1 {
|
||||
continue
|
||||
}
|
||||
if err := rr.Restore(ctx, q, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateWith updates existing group with
|
||||
// passed group object. This function ignores group
|
||||
// evaluation interval change. It supposed to be updated
|
||||
// in group.start function.
|
||||
// Not thread-safe.
|
||||
func (g *Group) updateWith(newGroup *Group) error {
|
||||
rulesRegistry := make(map[uint64]Rule)
|
||||
for _, nr := range newGroup.Rules {
|
||||
rulesRegistry[nr.ID()] = nr
|
||||
}
|
||||
|
||||
for i, or := range g.Rules {
|
||||
nr, ok := rulesRegistry[or.ID()]
|
||||
if !ok {
|
||||
// old rule is not present in the new list
|
||||
// so we mark it for removing
|
||||
g.Rules[i] = nil
|
||||
continue
|
||||
}
|
||||
if err := or.UpdateWith(nr); err != nil {
|
||||
return err
|
||||
}
|
||||
delete(rulesRegistry, nr.ID())
|
||||
}
|
||||
|
||||
var newRules []Rule
|
||||
for _, r := range g.Rules {
|
||||
if r == nil {
|
||||
// skip nil rules
|
||||
continue
|
||||
}
|
||||
newRules = append(newRules, r)
|
||||
}
|
||||
// add the rest of rules from registry
|
||||
for _, nr := range rulesRegistry {
|
||||
newRules = append(newRules, nr)
|
||||
}
|
||||
g.Concurrency = newGroup.Concurrency
|
||||
g.Rules = newRules
|
||||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
iterationTotal = metrics.NewCounter(`vmalert_iteration_total`)
|
||||
iterationDuration = metrics.NewSummary(`vmalert_iteration_duration_seconds`)
|
||||
|
||||
execTotal = metrics.NewCounter(`vmalert_execution_total`)
|
||||
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
||||
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
|
||||
|
||||
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
|
||||
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
|
||||
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
|
||||
|
||||
remoteWriteSent = metrics.NewCounter(`vmalert_remotewrite_sent_total`)
|
||||
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||
)
|
||||
|
||||
func (g *Group) close() {
|
||||
if g.doneCh == nil {
|
||||
return
|
||||
}
|
||||
close(g.doneCh)
|
||||
<-g.finishedCh
|
||||
}
|
||||
|
||||
func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []notifier.Notifier, rw *remotewrite.Client) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
e := &executor{querier, nts, rw}
|
||||
t := time.NewTicker(g.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Infof("group %q: context cancelled", g.Name)
|
||||
return
|
||||
case <-g.doneCh:
|
||||
logger.Infof("group %q: received stop signal", g.Name)
|
||||
return
|
||||
case ng := <-g.updateCh:
|
||||
g.mu.Lock()
|
||||
err := g.updateWith(ng)
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: failed to update: %s", g.Name, err)
|
||||
g.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
if g.Interval != ng.Interval {
|
||||
g.Interval = ng.Interval
|
||||
t.Stop()
|
||||
t = time.NewTicker(g.Interval)
|
||||
}
|
||||
g.mu.Unlock()
|
||||
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
case <-t.C:
|
||||
iterationTotal.Inc()
|
||||
iterationStart := time.Now()
|
||||
|
||||
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: %s", g.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
iterationDuration.UpdateDuration(iterationStart)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type executor struct {
|
||||
querier datasource.Querier
|
||||
notifiers []notifier.Notifier
|
||||
rw *remotewrite.Client
|
||||
}
|
||||
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
|
||||
res := make(chan error, len(rules))
|
||||
var returnSeries bool
|
||||
if e.rw != nil {
|
||||
returnSeries = true
|
||||
}
|
||||
|
||||
if concurrency == 1 {
|
||||
// fast path
|
||||
for _, rule := range rules {
|
||||
res <- e.exec(ctx, rule, returnSeries, interval)
|
||||
}
|
||||
close(res)
|
||||
return res
|
||||
}
|
||||
|
||||
sem := make(chan struct{}, concurrency)
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
for _, rule := range rules {
|
||||
sem <- struct{}{}
|
||||
wg.Add(1)
|
||||
go func(r Rule) {
|
||||
res <- e.exec(ctx, r, returnSeries, interval)
|
||||
<-sem
|
||||
wg.Done()
|
||||
}(rule)
|
||||
}
|
||||
wg.Wait()
|
||||
close(res)
|
||||
}()
|
||||
return res
|
||||
}
|
||||
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, interval time.Duration) error {
|
||||
execTotal.Inc()
|
||||
execStart := time.Now()
|
||||
defer func() {
|
||||
execDuration.UpdateDuration(execStart)
|
||||
}()
|
||||
|
||||
tss, err := rule.Exec(ctx, e.querier, returnSeries)
|
||||
if err != nil {
|
||||
execErrors.Inc()
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
|
||||
}
|
||||
|
||||
if len(tss) > 0 && e.rw != nil {
|
||||
remoteWriteSent.Add(len(tss))
|
||||
for _, ts := range tss {
|
||||
if err := e.rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
return fmt.Errorf("rule %q: remote write failure: %w", rule, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
var alerts []notifier.Alert
|
||||
for _, a := range ar.alerts {
|
||||
switch a.State {
|
||||
case notifier.StateFiring:
|
||||
// set End to execStart + 3 intervals
|
||||
// so notifier can resolve it automatically if `vmalert`
|
||||
// won't be able to send resolve for some reason
|
||||
a.End = time.Now().Add(3 * interval)
|
||||
alerts = append(alerts, *a)
|
||||
case notifier.StateInactive:
|
||||
// set End to execStart to notify
|
||||
// that it was just resolved
|
||||
a.End = time.Now()
|
||||
alerts = append(alerts, *a)
|
||||
}
|
||||
}
|
||||
if len(alerts) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
alertsSent.Add(len(alerts))
|
||||
errGr := new(utils.ErrGroup)
|
||||
for _, nt := range e.notifiers {
|
||||
if err := nt.Send(ctx, alerts); err != nil {
|
||||
alertsSendErrors.Inc()
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
|
||||
}
|
||||
}
|
||||
return errGr.Err()
|
||||
}
|
||||
207
app/vmalert/group_test.go
Normal file
207
app/vmalert/group_test.go
Normal file
@@ -0,0 +1,207 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestUpdateWith(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
currentRules []config.Rule
|
||||
newRules []config.Rule
|
||||
}{
|
||||
{
|
||||
"new rule",
|
||||
nil,
|
||||
[]config.Rule{{Alert: "bar"}},
|
||||
},
|
||||
{
|
||||
"update alerting rule",
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 0",
|
||||
For: time.Second,
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value|humanize }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}},
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 10",
|
||||
For: time.Second,
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "none",
|
||||
},
|
||||
}},
|
||||
},
|
||||
{
|
||||
"update recording rule",
|
||||
[]config.Rule{{
|
||||
Record: "foo",
|
||||
Expr: "max(up)",
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
}},
|
||||
[]config.Rule{{
|
||||
Record: "foo",
|
||||
Expr: "min(up)",
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
}},
|
||||
},
|
||||
{
|
||||
"empty rule",
|
||||
[]config.Rule{{Alert: "foo"}, {Record: "bar"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"multiple rules",
|
||||
[]config.Rule{
|
||||
{Alert: "bar"},
|
||||
{Alert: "baz"},
|
||||
{Alert: "foo"},
|
||||
},
|
||||
[]config.Rule{
|
||||
{Alert: "baz"},
|
||||
{Record: "foo"},
|
||||
},
|
||||
},
|
||||
{
|
||||
"replace rule",
|
||||
[]config.Rule{{Alert: "foo1"}},
|
||||
[]config.Rule{{Alert: "foo2"}},
|
||||
},
|
||||
{
|
||||
"replace multiple rules",
|
||||
[]config.Rule{
|
||||
{Alert: "foo1"},
|
||||
{Record: "foo2"},
|
||||
{Alert: "foo3"},
|
||||
},
|
||||
[]config.Rule{
|
||||
{Alert: "foo3"},
|
||||
{Alert: "foo4"},
|
||||
{Record: "foo5"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
g := &Group{Name: "test"}
|
||||
for _, r := range tc.currentRules {
|
||||
r.ID = config.HashRule(r)
|
||||
g.Rules = append(g.Rules, g.newRule(r))
|
||||
}
|
||||
|
||||
ng := &Group{Name: "test"}
|
||||
for _, r := range tc.newRules {
|
||||
r.ID = config.HashRule(r)
|
||||
ng.Rules = append(ng.Rules, ng.newRule(r))
|
||||
}
|
||||
|
||||
err := g.updateWith(ng)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(g.Rules) != len(tc.newRules) {
|
||||
t.Fatalf("expected to have %d rules; got: %d",
|
||||
len(g.Rules), len(tc.newRules))
|
||||
}
|
||||
sort.Slice(g.Rules, func(i, j int) bool {
|
||||
return g.Rules[i].ID() < g.Rules[j].ID()
|
||||
})
|
||||
sort.Slice(ng.Rules, func(i, j int) bool {
|
||||
return ng.Rules[i].ID() < ng.Rules[j].ID()
|
||||
})
|
||||
for i, r := range g.Rules {
|
||||
got, want := r, ng.Rules[i]
|
||||
if got.ID() != want.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want, got)
|
||||
}
|
||||
if err := compareRules(t, got, want); err != nil {
|
||||
t.Fatalf("comparsion error: %s", err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroupStart(t *testing.T) {
|
||||
// TODO: make parsing from string instead of file
|
||||
groups, err := config.Parse([]string{"config/testdata/rules1-good.rules"}, true, true)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], evalInterval)
|
||||
g.Concurrency = 2
|
||||
|
||||
fn := &fakeNotifier{}
|
||||
fs := &fakeQuerier{}
|
||||
|
||||
const inst1, inst2, job = "foo", "bar", "baz"
|
||||
m1 := metricWithLabels(t, "instance", inst1, "job", job)
|
||||
m2 := metricWithLabels(t, "instance", inst2, "job", job)
|
||||
|
||||
r := g.Rules[0].(*AlertingRule)
|
||||
alert1, err := r.newAlert(m1, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
alert1.State = notifier.StateFiring
|
||||
alert1.ID = hash(m1)
|
||||
|
||||
alert2, err := r.newAlert(m2, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
alert2.State = notifier.StateFiring
|
||||
alert2.ID = hash(m2)
|
||||
|
||||
finished := make(chan struct{})
|
||||
fs.add(m1)
|
||||
fs.add(m2)
|
||||
go func() {
|
||||
g.start(context.Background(), fs, []notifier.Notifier{fn}, nil)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
// wait for multiple evals
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts := fn.getAlerts()
|
||||
expectedAlerts := []notifier.Alert{*alert1, *alert2}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
// reset previous data
|
||||
fs.reset()
|
||||
// and set only one datapoint for response
|
||||
fs.add(m1)
|
||||
|
||||
// wait for multiple evals
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts = fn.getAlerts()
|
||||
expectedAlerts = []notifier.Alert{*alert1}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
g.close()
|
||||
<-finished
|
||||
}
|
||||
232
app/vmalert/helpers_test.go
Normal file
232
app/vmalert/helpers_test.go
Normal file
@@ -0,0 +1,232 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
type fakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []datasource.Metric
|
||||
err error
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) setErr(err error) {
|
||||
fq.Lock()
|
||||
fq.err = err
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) reset() {
|
||||
fq.Lock()
|
||||
fq.err = nil
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
return nil, fq.err
|
||||
}
|
||||
cp := make([]datasource.Metric, len(fq.metrics))
|
||||
copy(cp, fq.metrics)
|
||||
return cp, nil
|
||||
}
|
||||
|
||||
type fakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []notifier.Alert
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Value = value
|
||||
return m
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func compareGroups(t *testing.T, a, b *Group) {
|
||||
t.Helper()
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.File != b.File {
|
||||
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
|
||||
}
|
||||
if a.Interval != b.Interval {
|
||||
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
|
||||
}
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if a.ID() != b.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
|
||||
}
|
||||
if err := compareRules(t, want, got); err != nil {
|
||||
t.Fatalf("comparsion error: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func compareRules(t *testing.T, a, b Rule) error {
|
||||
t.Helper()
|
||||
switch v := a.(type) {
|
||||
case *AlertingRule:
|
||||
br, ok := b.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("rule %q supposed to be of type AlertingRule", b.ID())
|
||||
}
|
||||
return compareAlertingRules(t, v, br)
|
||||
case *RecordingRule:
|
||||
br, ok := b.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("rule %q supposed to be of type RecordingRule", b.ID())
|
||||
}
|
||||
return compareRecordingRules(t, v, br)
|
||||
default:
|
||||
return fmt.Errorf("unexpected rule type received %T", a)
|
||||
}
|
||||
}
|
||||
|
||||
func compareRecordingRules(t *testing.T, a, b *RecordingRule) error {
|
||||
t.Helper()
|
||||
if a.Expr != b.Expr {
|
||||
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
|
||||
t.Helper()
|
||||
if a.Expr != b.Expr {
|
||||
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
|
||||
}
|
||||
if a.For != b.For {
|
||||
return fmt.Errorf("expected to have for %q; got %q", a.For, b.For)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
|
||||
return fmt.Errorf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
|
||||
t.Helper()
|
||||
if len(a) != len(b) {
|
||||
return fmt.Errorf("expected number of timeseries %d; got %d", len(a), len(b))
|
||||
}
|
||||
for i := range a {
|
||||
expTS, gotTS := a[i], b[i]
|
||||
if len(expTS.Samples) != len(gotTS.Samples) {
|
||||
return fmt.Errorf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
|
||||
}
|
||||
for i, exp := range expTS.Samples {
|
||||
got := gotTS.Samples[i]
|
||||
if got.Value != exp.Value {
|
||||
return fmt.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
|
||||
}
|
||||
// timestamp validation isn't always correct for now.
|
||||
// this must be improved with time mock.
|
||||
/*if got.Timestamp != exp.Timestamp {
|
||||
return fmt.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
|
||||
}*/
|
||||
}
|
||||
if len(expTS.Labels) != len(gotTS.Labels) {
|
||||
return fmt.Errorf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
|
||||
}
|
||||
for i, exp := range expTS.Labels {
|
||||
got := gotTS.Labels[i]
|
||||
if got.Name != exp.Name {
|
||||
return fmt.Errorf("expected label name %q; got %q", exp.Name, got.Name)
|
||||
}
|
||||
if got.Value != exp.Value {
|
||||
return fmt.Errorf("expected label value %q; got %q", exp.Value, got.Value)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
|
||||
t.Helper()
|
||||
if len(as) != len(bs) {
|
||||
t.Fatalf("expected to have length %d; got %d", len(as), len(bs))
|
||||
}
|
||||
sort.Slice(as, func(i, j int) bool {
|
||||
return as[i].ID < as[j].ID
|
||||
})
|
||||
sort.Slice(bs, func(i, j int) bool {
|
||||
return bs[i].ID < bs[j].ID
|
||||
})
|
||||
for i := range as {
|
||||
a, b := as[i], bs[i]
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected t have Name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.State != b.State {
|
||||
t.Fatalf("expected t have State %q; got %q", a.State, b.State)
|
||||
}
|
||||
if a.Value != b.Value {
|
||||
t.Fatalf("expected t have Value %f; got %f", a.Value, b.Value)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
|
||||
t.Fatalf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
t.Fatalf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,18 +4,19 @@ import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
@@ -30,92 +31,60 @@ Examples:
|
||||
-rule /path/to/file. Path to a single file with alerting rules
|
||||
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.`)
|
||||
validateTemplates = flag.Bool("rule.validateTemplates", true, "Indicates to validate annotation and label templates")
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
||||
|
||||
datasourceURL = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
|
||||
|
||||
remoteWriteURL = flag.String("remotewrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
|
||||
" in form of timeseries. E.g. http://127.0.0.1:8428")
|
||||
remoteWriteUsername = flag.String("remotewrite.basicAuth.username", "", "Optional basic auth username for -remotewrite.url")
|
||||
remoteWritePassword = flag.String("remotewrite.basicAuth.password", "", "Optional basic auth password for -remotewrite.url")
|
||||
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
|
||||
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
|
||||
|
||||
remoteReadURL = flag.String("remoteread.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
|
||||
" state. This configuration makes sense only if `vmalert` was configured with `remotewrite.url` before and has been successfully persisted its state."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
remoteReadUsername = flag.String("remoteread.basicAuth.username", "", "Optional basic auth username for -remoteread.url")
|
||||
remoteReadPassword = flag.String("remoteread.basicAuth.password", "", "Optional basic auth password for -remoteread.url")
|
||||
remoteReadLookBack = flag.Duration("remoteread.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules. Default 1m")
|
||||
notifierURL = flag.String("notifier.url", "", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||
)
|
||||
|
||||
// TODO: hot configuration reload
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
checkFlags()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
|
||||
manager, err := newManager(ctx)
|
||||
if err != nil {
|
||||
logger.Fatalf("can not get external url:%s ", err)
|
||||
logger.Fatalf("failed to init: %s", err)
|
||||
}
|
||||
notifier.InitTemplateFunc(eu)
|
||||
|
||||
logger.Infof("reading alert rules configuration file from %s", strings.Join(*rulePath, ";"))
|
||||
groups, err := Parse(*rulePath, *validateTemplates)
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot parse configuration file: %s", err)
|
||||
if err := manager.start(ctx, *rulePath, *validateTemplates, *validateExpressions); err != nil {
|
||||
logger.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
w := &watchdog{
|
||||
storage: datasource.NewVMStorage(*datasourceURL, *basicAuthUsername, *basicAuthPassword, &http.Client{}),
|
||||
alertProvider: notifier.NewAlertManager(*notifierURL, func(group, name string) string {
|
||||
return fmt.Sprintf("%s/api/v1/%s/%s/status", eu, group, name)
|
||||
}, &http.Client{}),
|
||||
}
|
||||
|
||||
if *remoteWriteURL != "" {
|
||||
c, err := remotewrite.NewClient(ctx, remotewrite.Config{
|
||||
Addr: *remoteWriteURL,
|
||||
FlushInterval: *evaluationInterval,
|
||||
BasicAuthUser: *remoteWriteUsername,
|
||||
BasicAuthPass: *remoteWritePassword,
|
||||
})
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to init remotewrite client: %s", err)
|
||||
}
|
||||
w.rw = c
|
||||
}
|
||||
|
||||
var restoreDS *datasource.VMStorage
|
||||
if *remoteReadURL != "" {
|
||||
restoreDS = datasource.NewVMStorage(*remoteReadURL, *remoteReadUsername, *remoteReadPassword, &http.Client{})
|
||||
}
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
for _, g := range groups {
|
||||
if restoreDS != nil {
|
||||
err := g.Restore(ctx, restoreDS, *remoteReadLookBack)
|
||||
if err != nil {
|
||||
logger.Errorf("error while restoring state for group %q: %s", g.Name, err)
|
||||
go func() {
|
||||
// init reload metrics with positive values to improve alerting conditions
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
sigHup := procutil.NewSighupChan()
|
||||
for {
|
||||
<-sigHup
|
||||
configReloads.Inc()
|
||||
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
|
||||
if err := manager.update(ctx, *rulePath, *validateTemplates, *validateExpressions, false); err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("error while reloading rules: %s", err)
|
||||
continue
|
||||
}
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
||||
}
|
||||
wg.Add(1)
|
||||
go func(group Group) {
|
||||
w.run(ctx, group, *evaluationInterval)
|
||||
wg.Done()
|
||||
}(g)
|
||||
}
|
||||
}()
|
||||
|
||||
go httpserver.Serve(*httpListenAddr, (&requestHandler{groups: groups}).handler)
|
||||
rh := &requestHandler{m: manager}
|
||||
go httpserver.Serve(*httpListenAddr, rh.handler)
|
||||
|
||||
sig := procutil.WaitForSigterm()
|
||||
logger.Infof("service received signal %s", sig)
|
||||
@@ -123,89 +92,52 @@ func main() {
|
||||
logger.Fatalf("cannot stop the webservice: %s", err)
|
||||
}
|
||||
cancel()
|
||||
if w.rw != nil {
|
||||
err := w.rw.Close()
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot stop the remotewrite: %s", err)
|
||||
}
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
type watchdog struct {
|
||||
storage *datasource.VMStorage
|
||||
alertProvider notifier.Notifier
|
||||
rw *remotewrite.Client
|
||||
manager.close()
|
||||
}
|
||||
|
||||
var (
|
||||
iterationTotal = metrics.NewCounter(`vmalert_iteration_total`)
|
||||
iterationDuration = metrics.NewSummary(`vmalert_iteration_duration_seconds`)
|
||||
|
||||
execTotal = metrics.NewCounter(`vmalert_execution_total`)
|
||||
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
||||
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
|
||||
|
||||
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
|
||||
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
|
||||
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
|
||||
|
||||
remoteWriteSent = metrics.NewCounter(`vmalert_remotewrite_sent_total`)
|
||||
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||
configReloads = metrics.NewCounter(`vmalert_config_last_reload_total`)
|
||||
configReloadErrors = metrics.NewCounter(`vmalert_config_last_reload_errors_total`)
|
||||
configSuccess = metrics.NewCounter(`vmalert_config_last_reload_successful`)
|
||||
configTimestamp = metrics.NewCounter(`vmalert_config_last_reload_success_timestamp_seconds`)
|
||||
)
|
||||
|
||||
func (w *watchdog) run(ctx context.Context, group Group, evaluationInterval time.Duration) {
|
||||
logger.Infof("watchdog for %s has been started", group.Name)
|
||||
t := time.NewTicker(evaluationInterval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
|
||||
select {
|
||||
case <-t.C:
|
||||
iterationTotal.Inc()
|
||||
iterationStart := time.Now()
|
||||
for _, rule := range group.Rules {
|
||||
execTotal.Inc()
|
||||
|
||||
execStart := time.Now()
|
||||
err := rule.Exec(ctx, w.storage)
|
||||
execDuration.UpdateDuration(execStart)
|
||||
|
||||
if err != nil {
|
||||
execErrors.Inc()
|
||||
logger.Errorf("failed to execute rule %q.%q: %s", group.Name, rule.Name, err)
|
||||
continue
|
||||
}
|
||||
|
||||
var alertsToSend []notifier.Alert
|
||||
for _, a := range rule.alerts {
|
||||
if a.State != notifier.StatePending {
|
||||
alertsToSend = append(alertsToSend, *a)
|
||||
}
|
||||
if a.State == notifier.StateInactive || w.rw == nil {
|
||||
continue
|
||||
}
|
||||
tss := rule.AlertToTimeSeries(a, execStart)
|
||||
for _, ts := range tss {
|
||||
remoteWriteSent.Inc()
|
||||
if err := w.rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
alertsSent.Add(len(alertsToSend))
|
||||
if err := w.alertProvider.Send(alertsToSend); err != nil {
|
||||
alertsSendErrors.Inc()
|
||||
logger.Errorf("failed to send alert for rule %q.%q: %s", group.Name, rule.Name, err)
|
||||
}
|
||||
}
|
||||
iterationDuration.UpdateDuration(iterationStart)
|
||||
case <-ctx.Done():
|
||||
logger.Infof("%s received stop signal", group.Name)
|
||||
return
|
||||
}
|
||||
func newManager(ctx context.Context) (*manager, error) {
|
||||
q, err := datasource.Init()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init datasource: %w", err)
|
||||
}
|
||||
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init `external.url`: %w", err)
|
||||
}
|
||||
notifier.InitTemplateFunc(eu)
|
||||
aug, err := getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init `external.alert.source`: %w", err)
|
||||
}
|
||||
nts, err := notifier.Init(aug)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init notifier: %w", err)
|
||||
}
|
||||
|
||||
manager := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querier: q,
|
||||
notifiers: nts,
|
||||
}
|
||||
rw, err := remotewrite.Init(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
|
||||
}
|
||||
manager.rw = rw
|
||||
|
||||
rr, err := remoteread.Init()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init remoteRead: %w", err)
|
||||
}
|
||||
manager.rr = rr
|
||||
return manager, nil
|
||||
}
|
||||
|
||||
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
|
||||
@@ -227,13 +159,39 @@ func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL
|
||||
return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
|
||||
}
|
||||
|
||||
func checkFlags() {
|
||||
if *notifierURL == "" {
|
||||
flag.PrintDefaults()
|
||||
logger.Fatalf("notifier.url is empty")
|
||||
func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, validateTemplate bool) (notifier.AlertURLGenerator, error) {
|
||||
if externalAlertSource == "" {
|
||||
return func(alert notifier.Alert) string {
|
||||
return fmt.Sprintf("%s/api/v1/%s/%s/status", externalURL, strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10))
|
||||
}, nil
|
||||
}
|
||||
if *datasourceURL == "" {
|
||||
flag.PrintDefaults()
|
||||
logger.Fatalf("datasource.url is empty")
|
||||
if validateTemplate {
|
||||
if err := notifier.ValidateTemplates(map[string]string{
|
||||
"tpl": externalAlertSource,
|
||||
}); err != nil {
|
||||
return nil, fmt.Errorf("error validating source template %s: %w", externalAlertSource, err)
|
||||
}
|
||||
}
|
||||
m := map[string]string{
|
||||
"tpl": externalAlertSource,
|
||||
}
|
||||
return func(alert notifier.Alert) string {
|
||||
templated, err := alert.ExecTemplate(m)
|
||||
if err != nil {
|
||||
logger.Errorf("can not exec source template %s", err)
|
||||
}
|
||||
return fmt.Sprintf("%s/%s", externalURL, templated["tpl"])
|
||||
}, nil
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmalert processes alerts and recording rules.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
53
app/vmalert/main_test.go
Normal file
53
app/vmalert/main_test.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestGetExternalURL(t *testing.T) {
|
||||
expURL := "https://vicotriametrics.com/path"
|
||||
u, err := getExternalURL(expURL, "", false)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if u.String() != expURL {
|
||||
t.Errorf("unexpected url want %s, got %s", expURL, u.String())
|
||||
}
|
||||
h, _ := os.Hostname()
|
||||
expURL = fmt.Sprintf("https://%s:4242", h)
|
||||
u, err = getExternalURL("", "0.0.0.0:4242", true)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if u.String() != expURL {
|
||||
t.Errorf("unexpected url want %s, got %s", expURL, u.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetAlertURLGenerator(t *testing.T) {
|
||||
testAlert := notifier.Alert{GroupID: 42, ID: 2, Value: 4}
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
fn, err := getAlertURLGenerator(u, "", false)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if exp := "https://victoriametrics.com/path/api/v1/42/2/status"; exp != fn(testAlert) {
|
||||
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
|
||||
}
|
||||
_, err = getAlertURLGenerator(nil, "foo?{{invalid}}", true)
|
||||
if err == nil {
|
||||
t.Errorf("exptected tempalte validation error got nil")
|
||||
}
|
||||
fn, err = getAlertURLGenerator(u, "foo?query={{$value}}", true)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if exp := "https://victoriametrics.com/path/foo?query=4"; exp != fn(testAlert) {
|
||||
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
|
||||
}
|
||||
}
|
||||
135
app/vmalert/manager.go
Normal file
135
app/vmalert/manager.go
Normal file
@@ -0,0 +1,135 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// manager controls group states
|
||||
type manager struct {
|
||||
querier datasource.Querier
|
||||
notifiers []notifier.Notifier
|
||||
|
||||
rw *remotewrite.Client
|
||||
rr datasource.Querier
|
||||
|
||||
wg sync.WaitGroup
|
||||
|
||||
groupsMu sync.RWMutex
|
||||
groups map[uint64]*Group
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
||||
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
|
||||
g, ok := m.groups[gID]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("can't find group with id %q", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
|
||||
return apiAlert, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
|
||||
}
|
||||
|
||||
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
|
||||
return m.update(ctx, path, validateTpl, validateExpr, true)
|
||||
}
|
||||
|
||||
func (m *manager) close() {
|
||||
if m.rw != nil {
|
||||
err := m.rw.Close()
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot stop the remotewrite: %s", err)
|
||||
}
|
||||
}
|
||||
m.wg.Wait()
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
|
||||
if restore && m.rr != nil {
|
||||
err := group.Restore(ctx, m.rr, *remoteReadLookBack)
|
||||
if err != nil {
|
||||
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
m.wg.Add(1)
|
||||
id := group.ID()
|
||||
go func() {
|
||||
group.start(ctx, m.querier, m.notifiers, m.rw)
|
||||
m.wg.Done()
|
||||
}()
|
||||
m.groups[id] = group
|
||||
}
|
||||
|
||||
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
|
||||
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
|
||||
groupsCfg, err := config.Parse(path, validateTpl, validateExpr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse configuration file: %w", err)
|
||||
}
|
||||
|
||||
groupsRegistry := make(map[uint64]*Group)
|
||||
for _, cfg := range groupsCfg {
|
||||
ng := newGroup(cfg, *evaluationInterval)
|
||||
groupsRegistry[ng.ID()] = ng
|
||||
}
|
||||
|
||||
m.groupsMu.Lock()
|
||||
for _, og := range m.groups {
|
||||
ng, ok := groupsRegistry[og.ID()]
|
||||
if !ok {
|
||||
// old group is not present in new list
|
||||
// and must be stopped and deleted
|
||||
og.close()
|
||||
delete(m.groups, og.ID())
|
||||
og = nil
|
||||
continue
|
||||
}
|
||||
og.updateCh <- ng
|
||||
delete(groupsRegistry, ng.ID())
|
||||
}
|
||||
|
||||
for _, ng := range groupsRegistry {
|
||||
m.startGroup(ctx, ng, restore)
|
||||
}
|
||||
m.groupsMu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) toAPI() APIGroup {
|
||||
ag := APIGroup{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
Name: g.Name,
|
||||
File: g.File,
|
||||
Interval: g.Interval.String(),
|
||||
Concurrency: g.Concurrency,
|
||||
}
|
||||
for _, r := range g.Rules {
|
||||
switch v := r.(type) {
|
||||
case *AlertingRule:
|
||||
ag.AlertingRules = append(ag.AlertingRules, v.RuleAPI())
|
||||
case *RecordingRule:
|
||||
ag.RecordingRules = append(ag.RecordingRules, v.RuleAPI())
|
||||
}
|
||||
}
|
||||
return ag
|
||||
}
|
||||
211
app/vmalert/manager_test.go
Normal file
211
app/vmalert/manager_test.go
Normal file
@@ -0,0 +1,211 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math/rand"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestManagerUpdateError(t *testing.T) {
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
path := []string{"foo/bar"}
|
||||
err := m.update(context.Background(), path, true, true, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to have err; got nil instead")
|
||||
}
|
||||
expErr := "no groups found"
|
||||
if !strings.Contains(err.Error(), expErr) {
|
||||
t.Fatalf("expected to got err %s; got %s", expErr, err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestManagerUpdateConcurrent supposed to test concurrent
|
||||
// execution of configuration update.
|
||||
// Should be executed with -race flag
|
||||
func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querier: &fakeQuerier{},
|
||||
notifiers: []notifier.Notifier{&fakeNotifier{}},
|
||||
}
|
||||
paths := []string{
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
"config/testdata/dir/rules0-bad.rules",
|
||||
"config/testdata/dir/rules1-good.rules",
|
||||
"config/testdata/dir/rules1-bad.rules",
|
||||
"config/testdata/rules0-good.rules",
|
||||
"config/testdata/rules1-good.rules",
|
||||
"config/testdata/rules2-good.rules",
|
||||
}
|
||||
*evaluationInterval = time.Millisecond
|
||||
if err := m.start(context.Background(), []string{paths[0]}, true, true); err != nil {
|
||||
t.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
const workers = 500
|
||||
const iterations = 10
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < iterations; i++ {
|
||||
rnd := rand.Intn(len(paths))
|
||||
path := []string{paths[rnd]}
|
||||
_ = m.update(context.Background(), path, true, true, false)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// TestManagerUpdate tests sequential configuration
|
||||
// updates.
|
||||
func TestManagerUpdate(t *testing.T) {
|
||||
const defaultEvalInterval = time.Second * 30
|
||||
currentEvalInterval := *evaluationInterval
|
||||
*evaluationInterval = defaultEvalInterval
|
||||
defer func() {
|
||||
*evaluationInterval = currentEvalInterval
|
||||
}()
|
||||
|
||||
var (
|
||||
VMRows = &AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 10 * time.Second,
|
||||
Labels: map[string]string{
|
||||
"label": "bar",
|
||||
"host": "{{ $labels.instance }}",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value|humanize }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}
|
||||
Conns = &AlertingRule{
|
||||
Name: "Conns",
|
||||
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
|
||||
Annotations: map[string]string{
|
||||
"summary": "Too high connection number for {{$labels.instance}}",
|
||||
"description": "It is {{ $value }} connections for {{$labels.instance}}",
|
||||
},
|
||||
}
|
||||
ExampleAlertAlwaysFiring = &AlertingRule{
|
||||
Name: "ExampleAlertAlwaysFiring",
|
||||
Expr: "sum by(job) (up == 1)",
|
||||
}
|
||||
)
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
initPath string
|
||||
updatePath string
|
||||
want []*Group
|
||||
}{
|
||||
{
|
||||
name: "update good rules",
|
||||
initPath: "config/testdata/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules1-good.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "config/testdata/dir/rules1-good.rules",
|
||||
Name: "duplicatedGroupDiffFiles",
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{
|
||||
&AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 5 * time.Minute,
|
||||
Labels: map[string]string{"label": "bar"},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update good rules from 1 to 2 groups",
|
||||
initPath: "config/testdata/dir/rules1-good.rules",
|
||||
updatePath: "config/testdata/rules0-good.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Rules: []Rule{VMRows},
|
||||
Interval: defaultEvalInterval,
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update with one bad rule file",
|
||||
initPath: "config/testdata/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules2-bad.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.TODO())
|
||||
m := &manager{groups: make(map[uint64]*Group), querier: &fakeQuerier{}}
|
||||
path := []string{tc.initPath}
|
||||
if err := m.update(ctx, path, true, true, false); err != nil {
|
||||
t.Fatalf("failed to complete initial rules update: %s", err)
|
||||
}
|
||||
|
||||
path = []string{tc.updatePath}
|
||||
_ = m.update(ctx, path, true, true, false)
|
||||
if len(tc.want) != len(m.groups) {
|
||||
t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups))
|
||||
}
|
||||
|
||||
for _, wantG := range tc.want {
|
||||
gotG, ok := m.groups[wantG.ID()]
|
||||
if !ok {
|
||||
t.Fatalf("expected to have group %q", wantG.Name)
|
||||
}
|
||||
compareGroups(t, wantG, gotG)
|
||||
}
|
||||
|
||||
cancel()
|
||||
m.close()
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -7,17 +7,20 @@ import (
|
||||
"strings"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
// Alert the triggered alert
|
||||
// TODO: Looks like alert name isn't unique
|
||||
type Alert struct {
|
||||
Group string
|
||||
GroupID uint64
|
||||
Name string
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
State AlertState
|
||||
|
||||
Expr string
|
||||
Start time.Time
|
||||
End time.Time
|
||||
Value float64
|
||||
@@ -52,14 +55,15 @@ func (as AlertState) String() string {
|
||||
type alertTplData struct {
|
||||
Labels map[string]string
|
||||
Value float64
|
||||
Expr string
|
||||
}
|
||||
|
||||
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}`
|
||||
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}{{ $expr := .Expr }}`
|
||||
|
||||
// ExecTemplate executes the Alert template for give
|
||||
// map of annotations.
|
||||
func (a *Alert) ExecTemplate(annotations map[string]string) (map[string]string, error) {
|
||||
tplData := alertTplData{Value: a.Value, Labels: a.Labels}
|
||||
tplData := alertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
|
||||
return templateAnnotations(annotations, tplHeader, tplData)
|
||||
}
|
||||
|
||||
@@ -75,7 +79,7 @@ func ValidateTemplates(annotations map[string]string) error {
|
||||
func templateAnnotations(annotations map[string]string, header string, data alertTplData) (map[string]string, error) {
|
||||
var builder strings.Builder
|
||||
var buf bytes.Buffer
|
||||
eg := errGroup{}
|
||||
eg := new(utils.ErrGroup)
|
||||
r := make(map[string]string, len(annotations))
|
||||
for key, text := range annotations {
|
||||
r[key] = text
|
||||
@@ -85,21 +89,21 @@ func templateAnnotations(annotations map[string]string, header string, data aler
|
||||
builder.WriteString(header)
|
||||
builder.WriteString(text)
|
||||
if err := templateAnnotation(&buf, builder.String(), data); err != nil {
|
||||
eg.errs = append(eg.errs, fmt.Sprintf("key %s, template %s:%s", key, text, err))
|
||||
eg.Add(fmt.Errorf("key %q, template %q: %w", key, text, err))
|
||||
continue
|
||||
}
|
||||
r[key] = buf.String()
|
||||
}
|
||||
return r, eg.err()
|
||||
return r, eg.Err()
|
||||
}
|
||||
|
||||
func templateAnnotation(dst io.Writer, text string, data alertTplData) error {
|
||||
tpl, err := template.New("").Funcs(tmplFunc).Option("missingkey=zero").Parse(text)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing annotation:%w", err)
|
||||
return fmt.Errorf("error parsing annotation: %w", err)
|
||||
}
|
||||
if err = tpl.Execute(dst, data); err != nil {
|
||||
return fmt.Errorf("error evaluating annotation template:%w", err)
|
||||
return fmt.Errorf("error evaluating annotation template: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,22 +1,24 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestAlert_ExecTemplate(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
alert *Alert
|
||||
annotations map[string]string
|
||||
expTpl map[string]string
|
||||
}{
|
||||
{
|
||||
name: "empty-alert",
|
||||
alert: &Alert{},
|
||||
annotations: map[string]string{},
|
||||
expTpl: map[string]string{},
|
||||
},
|
||||
{
|
||||
name: "no-template",
|
||||
alert: &Alert{
|
||||
Value: 1e4,
|
||||
Labels: map[string]string{
|
||||
@@ -27,6 +29,7 @@ func TestAlert_ExecTemplate(t *testing.T) {
|
||||
expTpl: map[string]string{},
|
||||
},
|
||||
{
|
||||
name: "label-template",
|
||||
alert: &Alert{
|
||||
Value: 1e4,
|
||||
Labels: map[string]string{
|
||||
@@ -43,10 +46,24 @@ func TestAlert_ExecTemplate(t *testing.T) {
|
||||
"description": "It is 10000 connections for localhost",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "expression-template",
|
||||
alert: &Alert{
|
||||
Expr: `vm_rows{"label"="bar"}>0`,
|
||||
},
|
||||
annotations: map[string]string{
|
||||
"exprEscapedQuery": "{{ $expr|quotesEscape|queryEscape }}",
|
||||
"exprEscapedPath": "{{ $expr|quotesEscape|pathEscape }}",
|
||||
},
|
||||
expTpl: map[string]string{
|
||||
"exprEscapedQuery": "vm_rows%7B%5C%22label%5C%22%3D%5C%22bar%5C%22%7D%3E0",
|
||||
"exprEscapedPath": "vm_rows%7B%5C%22label%5C%22=%5C%22bar%5C%22%7D%3E0",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range testCases {
|
||||
t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
tpl, err := tc.alert.ExecTemplate(tc.annotations)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
|
||||
@@ -2,6 +2,7 @@ package notifier
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
@@ -11,25 +12,38 @@ import (
|
||||
// AlertManager represents integration provider with Prometheus alert manager
|
||||
// https://github.com/prometheus/alertmanager
|
||||
type AlertManager struct {
|
||||
alertURL string
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
alertURL string
|
||||
basicAuthUser string
|
||||
basicAuthPass string
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
// Send an alert or resolve message
|
||||
func (am *AlertManager) Send(alerts []Alert) error {
|
||||
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
|
||||
b := &bytes.Buffer{}
|
||||
writeamRequest(b, alerts, am.argFunc)
|
||||
resp, err := am.client.Post(am.alertURL, "application/json", b)
|
||||
|
||||
req, err := http.NewRequest("POST", am.alertURL, b)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req = req.WithContext(ctx)
|
||||
if am.basicAuthPass != "" {
|
||||
req.SetBasicAuth(am.basicAuthUser, am.basicAuthPass)
|
||||
}
|
||||
resp, err := am.client.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read response from %q: %s", am.alertURL, err)
|
||||
return fmt.Errorf("failed to read response from %q: %w", am.alertURL, err)
|
||||
}
|
||||
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.alertURL, string(body))
|
||||
}
|
||||
@@ -37,15 +51,18 @@ func (am *AlertManager) Send(alerts []Alert) error {
|
||||
}
|
||||
|
||||
// AlertURLGenerator returns URL to single alert by given name
|
||||
type AlertURLGenerator func(group, id string) string
|
||||
type AlertURLGenerator func(Alert) string
|
||||
|
||||
const alertManagerPath = "/api/v2/alerts"
|
||||
|
||||
// NewAlertManager is a constructor for AlertManager
|
||||
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, c *http.Client) *AlertManager {
|
||||
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
|
||||
addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
|
||||
return &AlertManager{
|
||||
alertURL: strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath,
|
||||
argFunc: fn,
|
||||
client: c,
|
||||
alertURL: addr,
|
||||
argFunc: fn,
|
||||
client: c,
|
||||
basicAuthUser: user,
|
||||
basicAuthPass: pass,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
{% import (
|
||||
"strconv"
|
||||
"time"
|
||||
) %}
|
||||
{% stripspace %}
|
||||
|
||||
{% func amRequest(alerts []Alert, generatorURL func(string, string) string) %}
|
||||
{% func amRequest(alerts []Alert, generatorURL func(Alert) string) %}
|
||||
[
|
||||
{% for i, alert := range alerts %}
|
||||
{
|
||||
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
|
||||
"generatorURL": {%q= generatorURL(alert.Group, strconv.FormatUint(alert.ID, 10)) %},
|
||||
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
|
||||
"generatorURL": {%q= generatorURL(alert) %},
|
||||
{% if !alert.End.IsZero() %}
|
||||
"endsAt":{%q= alert.End.Format(time.RFC3339Nano) %},
|
||||
{% endif %}
|
||||
|
||||
@@ -6,126 +6,125 @@ package notifier
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:1
|
||||
import (
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:7
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
import (
|
||||
qtio422016 "io"
|
||||
|
||||
qt422016 "github.com/valyala/quicktemplate"
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:7
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
var (
|
||||
_ = qtio422016.Copy
|
||||
_ = qt422016.AcquireByteBuffer
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:7
|
||||
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:7
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
qw422016.N().S(`[`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:9
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:8
|
||||
for i, alert := range alerts {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:9
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:8
|
||||
qw422016.N().S(`{"startsAt":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:10
|
||||
qw422016.N().Q(alert.Start.Format(time.RFC3339Nano))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:10
|
||||
qw422016.N().S(`,"generatorURL":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
qw422016.N().Q(generatorURL(alert.Group, strconv.FormatUint(alert.ID, 10)))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
qw422016.N().Q(generatorURL(alert))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
if !alert.End.IsZero() {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
qw422016.N().S(`"endsAt":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:14
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
qw422016.N().Q(alert.End.Format(time.RFC3339Nano))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:14
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:15
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:15
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:14
|
||||
qw422016.N().S(`"labels": {"alertname":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:17
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:16
|
||||
qw422016.N().Q(alert.Name)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:17
|
||||
for k, v := range alert.Labels {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:17
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().Q(k)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().S(`:`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().Q(v)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:20
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:20
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
qw422016.N().S(`},"annotations": {`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:23
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:22
|
||||
c := len(alert.Annotations)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:24
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:23
|
||||
for k, v := range alert.Annotations {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:24
|
||||
c = c - 1
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().Q(k)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().S(`:`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().Q(v)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
if c > 0 {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:27
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:27
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
qw422016.N().S(`}}`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
if i != len(alerts)-1 {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:31
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:31
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
qw422016.N().S(`]`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
streamamRequest(qw422016, alerts, generatorURL)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qt422016.ReleaseWriter(qw422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
func amRequest(alerts []Alert, generatorURL func(string, string) string) string {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
func amRequest(alerts []Alert, generatorURL func(Alert) string) string {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
writeamRequest(qb422016, alerts, generatorURL)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qs422016 := string(qb422016.B)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qt422016.ReleaseByteBuffer(qb422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
return qs422016
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:33
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
|
||||
@@ -1,20 +1,31 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestAlertManager_Send(t *testing.T) {
|
||||
const baUser, baPass = "foo", "bar"
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
|
||||
t.Errorf("should not be called")
|
||||
})
|
||||
c := -1
|
||||
mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) {
|
||||
user, pass, ok := r.BasicAuth()
|
||||
if !ok {
|
||||
t.Errorf("unauthorized request")
|
||||
}
|
||||
if user != baUser || pass != baPass {
|
||||
t.Errorf("wrong creds %q:%q; expected %q:%q",
|
||||
user, pass, baUser, baPass)
|
||||
}
|
||||
c++
|
||||
if r.Method != http.MethodPost {
|
||||
t.Errorf("expected POST method got %s", r.Method)
|
||||
@@ -40,8 +51,8 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
if len(a) != 1 {
|
||||
t.Errorf("expected 1 alert in array got %d", len(a))
|
||||
}
|
||||
if a[0].GeneratorURL != "group0" {
|
||||
t.Errorf("exptected alert0 as generatorURL got %s", a[0].GeneratorURL)
|
||||
if a[0].GeneratorURL != "0/0" {
|
||||
t.Errorf("exptected 0/0 as generatorURL got %s", a[0].GeneratorURL)
|
||||
}
|
||||
if a[0].Labels["alertname"] != "alert0" {
|
||||
t.Errorf("exptected alert0 as alert name got %s", a[0].Labels["alertname"])
|
||||
@@ -56,17 +67,17 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
})
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
am := NewAlertManager(srv.URL, func(group, name string) string {
|
||||
return group + name
|
||||
am := NewAlertManager(srv.URL, baUser, baPass, func(alert Alert) string {
|
||||
return strconv.FormatUint(alert.GroupID, 10) + "/" + strconv.FormatUint(alert.ID, 10)
|
||||
}, srv.Client())
|
||||
if err := am.Send([]Alert{{}, {}}); err == nil {
|
||||
if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {
|
||||
t.Error("expected connection error got nil")
|
||||
}
|
||||
if err := am.Send([]Alert{}); err == nil {
|
||||
if err := am.Send(context.Background(), []Alert{}); err == nil {
|
||||
t.Error("expected wrong http code error got nil")
|
||||
}
|
||||
if err := am.Send([]Alert{{
|
||||
Group: "group",
|
||||
if err := am.Send(context.Background(), []Alert{{
|
||||
GroupID: 0,
|
||||
Name: "alert0",
|
||||
Start: time.Now().UTC(),
|
||||
End: time.Now().UTC(),
|
||||
|
||||
47
app/vmalert/notifier/init.go
Normal file
47
app/vmalert/notifier/init.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
)
|
||||
|
||||
var (
|
||||
addrs = flagutil.NewArray("notifier.url", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
|
||||
basicAuthUsername = flagutil.NewArray("notifier.basicAuth.username", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flagutil.NewArray("notifier.basicAuth.password", "Optional basic auth password for -datasource.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("notifier.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -notifier.url")
|
||||
tlsCertFile = flagutil.NewArray("notifier.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
|
||||
tlsKeyFile = flagutil.NewArray("notifier.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -notifier.url")
|
||||
tlsCAFile = flagutil.NewArray("notifier.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -notifier.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flagutil.NewArray("notifier.tlsServerName", "Optional TLS server name to use for connections to -notifier.url. "+
|
||||
"By default the server name from -notifier.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Notifier object based on provided flags.
|
||||
func Init(gen AlertURLGenerator) ([]Notifier, error) {
|
||||
if len(*addrs) == 0 {
|
||||
flag.PrintDefaults()
|
||||
return nil, fmt.Errorf("at least one `-notifier.url` must be set")
|
||||
}
|
||||
|
||||
var notifiers []Notifier
|
||||
for i, addr := range *addrs {
|
||||
cert, key := tlsCertFile.GetOptionalArg(i), tlsKeyFile.GetOptionalArg(i)
|
||||
ca, serverName := tlsCAFile.GetOptionalArg(i), tlsServerName.GetOptionalArg(i)
|
||||
tr, err := utils.Transport(addr, cert, key, ca, serverName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
user, pass := basicAuthUsername.GetOptionalArg(i), basicAuthPassword.GetOptionalArg(i)
|
||||
am := NewAlertManager(addr, user, pass, gen, &http.Client{Transport: tr})
|
||||
notifiers = append(notifiers, am)
|
||||
}
|
||||
|
||||
return notifiers, nil
|
||||
}
|
||||
@@ -1,6 +1,8 @@
|
||||
package notifier
|
||||
|
||||
import "context"
|
||||
|
||||
// Notifier is common interface for alert manager provider
|
||||
type Notifier interface {
|
||||
Send(alerts []Alert) error
|
||||
Send(ctx context.Context, alerts []Alert) error
|
||||
}
|
||||
|
||||
13
app/vmalert/notifier/package_test.go
Normal file
13
app/vmalert/notifier/package_test.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
@@ -142,6 +142,15 @@ func InitTemplateFunc(externalURL *url.URL) {
|
||||
"externalURL": func() string {
|
||||
return externalURL.String()
|
||||
},
|
||||
"pathEscape": func(u string) string {
|
||||
return url.PathEscape(u)
|
||||
},
|
||||
"queryEscape": func(q string) string {
|
||||
return url.QueryEscape(q)
|
||||
},
|
||||
"quotesEscape": func(q string) string {
|
||||
return strings.Replace(q, `"`, `\"`, -1)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type errGroup struct {
|
||||
errs []string
|
||||
}
|
||||
|
||||
func (eg *errGroup) err() error {
|
||||
if eg == nil || len(eg.errs) == 0 {
|
||||
return nil
|
||||
}
|
||||
return eg
|
||||
}
|
||||
|
||||
func (eg *errGroup) Error() string {
|
||||
return fmt.Sprintf("errors:%s", strings.Join(eg.errs, "\n"))
|
||||
}
|
||||
149
app/vmalert/recording.go
Normal file
149
app/vmalert/recording.go
Normal file
@@ -0,0 +1,149 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// RecordingRule is a Rule that supposed
|
||||
// to evaluate configured Expression and
|
||||
// return TimeSeries as result.
|
||||
type RecordingRule struct {
|
||||
RuleID uint64
|
||||
Name string
|
||||
Expr string
|
||||
Labels map[string]string
|
||||
GroupID uint64
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
func (rr *RecordingRule) String() string {
|
||||
return rr.Name
|
||||
}
|
||||
|
||||
// ID returns unique Rule ID
|
||||
// within the parent Group.
|
||||
func (rr *RecordingRule) ID() uint64 {
|
||||
return rr.RuleID
|
||||
}
|
||||
|
||||
func newRecordingRule(gID uint64, cfg config.Rule) *RecordingRule {
|
||||
return &RecordingRule{
|
||||
RuleID: cfg.ID,
|
||||
Name: cfg.Record,
|
||||
Expr: cfg.Expr,
|
||||
Labels: cfg.Labels,
|
||||
GroupID: gID,
|
||||
}
|
||||
}
|
||||
|
||||
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")
|
||||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
if !series {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
qMetrics, err := q.Query(ctx, rr.Expr)
|
||||
|
||||
rr.mu.Lock()
|
||||
defer rr.mu.Unlock()
|
||||
|
||||
rr.lastExecTime = time.Now()
|
||||
rr.lastExecError = err
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
}
|
||||
|
||||
duplicates := make(map[uint64]prompbmarshal.TimeSeries, len(qMetrics))
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, r := range qMetrics {
|
||||
ts := rr.toTimeSeries(r, rr.lastExecTime)
|
||||
h := hashTimeSeries(ts)
|
||||
if _, ok := duplicates[h]; ok {
|
||||
rr.lastExecError = errDuplicate
|
||||
return nil, errDuplicate
|
||||
}
|
||||
duplicates[h] = ts
|
||||
tss = append(tss, ts)
|
||||
}
|
||||
return tss, nil
|
||||
}
|
||||
|
||||
func hashTimeSeries(ts prompbmarshal.TimeSeries) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := ts.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for _, l := range m.Labels {
|
||||
labels[l.Name] = l.Value
|
||||
}
|
||||
labels["__name__"] = rr.Name
|
||||
// override existing labels with configured ones
|
||||
for k, v := range rr.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
return newTimeSeries(m.Value, labels, timestamp)
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
|
||||
}
|
||||
rr.Expr = nr.Expr
|
||||
rr.Labels = nr.Labels
|
||||
return nil
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIRecordingRule
|
||||
func (rr *RecordingRule) RuleAPI() APIRecordingRule {
|
||||
var lastErr string
|
||||
if rr.lastExecError != nil {
|
||||
lastErr = rr.lastExecError.Error()
|
||||
}
|
||||
return APIRecordingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
Name: rr.Name,
|
||||
Expression: rr.Expr,
|
||||
LastError: lastErr,
|
||||
LastExec: rr.lastExecTime,
|
||||
Labels: rr.Labels,
|
||||
}
|
||||
}
|
||||
121
app/vmalert/recording_test.go
Normal file
121
app/vmalert/recording_test.go
Normal file
@@ -0,0 +1,121 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *RecordingRule
|
||||
metrics []datasource.Metric
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
&RecordingRule{Name: "foo"},
|
||||
[]datasource.Metric{metricWithValueAndLabels(t, 10,
|
||||
"__name__", "bar",
|
||||
)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(10, map[string]string{
|
||||
"__name__": "foo",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "foobarbaz"},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
|
||||
metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "foo",
|
||||
}, timestamp),
|
||||
newTimeSeries(2, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "bar",
|
||||
}, timestamp),
|
||||
newTimeSeries(3, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "baz",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(2, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "foo",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "bar",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
tss, err := tc.rule.Exec(context.TODO(), fq, true)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
|
||||
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"job": "test",
|
||||
}}
|
||||
|
||||
fq := &fakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
|
||||
_, err := rr.Exec(context.TODO(), fq, true)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), expErr) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
|
||||
}
|
||||
|
||||
fq.reset()
|
||||
|
||||
// add metrics which differs only by `job` label
|
||||
// which will be overridden by rule
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.Exec(context.TODO(), fq, true)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), errDuplicate.Error()) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
|
||||
}
|
||||
}
|
||||
39
app/vmalert/remoteread/init.go
Normal file
39
app/vmalert/remoteread/init.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package remoteread
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
|
||||
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
|
||||
basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
|
||||
tlsCertFile = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
|
||||
tlsKeyFile = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
|
||||
tlsCAFile = flag.String("remoteRead.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
|
||||
"By default the server name from -remoteRead.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Querier from provided flag values.
|
||||
// Returns nil if addr flag wasn't set.
|
||||
func Init() (datasource.Querier, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &http.Client{Transport: tr}
|
||||
return datasource.NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, c), nil
|
||||
}
|
||||
54
app/vmalert/remotewrite/init.go
Normal file
54
app/vmalert/remotewrite/init.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
|
||||
" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
|
||||
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
|
||||
|
||||
maxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
|
||||
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines defines max number of timeseries to be flushed at once")
|
||||
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
|
||||
flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
|
||||
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
|
||||
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
|
||||
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
|
||||
"By default the server name from -remoteWrite.url is used")
|
||||
)
|
||||
|
||||
// Init creates Client object from given flags.
|
||||
// Returns nil if addr flag wasn't set.
|
||||
func Init(ctx context.Context) (*Client, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
|
||||
return NewClient(ctx, Config{
|
||||
Addr: *addr,
|
||||
Concurrency: *concurrency,
|
||||
MaxQueueSize: *maxQueueSize,
|
||||
MaxBatchSize: *maxBatchSize,
|
||||
FlushInterval: *flushInterval,
|
||||
BasicAuthUser: *basicAuthUsername,
|
||||
BasicAuthPass: *basicAuthPassword,
|
||||
Transport: t,
|
||||
})
|
||||
}
|
||||
@@ -38,23 +38,30 @@ type Config struct {
|
||||
BasicAuthUser string
|
||||
BasicAuthPass string
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// WriteTimeout defines timeout for HTTP write request
|
||||
// to remote storage
|
||||
WriteTimeout time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
const (
|
||||
defaultConcurrency = 4
|
||||
defaultMaxBatchSize = 1e3
|
||||
defaultMaxQueueSize = 100
|
||||
defaultFlushInterval = 5 * time.Second
|
||||
defaultMaxQueueSize = 1e5
|
||||
defaultFlushInterval = time.Second
|
||||
defaultWriteTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
@@ -80,7 +87,8 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: cfg.WriteTimeout,
|
||||
Timeout: cfg.WriteTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/") + writePath,
|
||||
baUser: cfg.BasicAuthUser,
|
||||
@@ -90,7 +98,13 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
c.run(ctx)
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
@@ -103,7 +117,8 @@ func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries)",
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
@@ -127,7 +142,10 @@ func (c *Client) run(ctx context.Context) {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), time.Second*10)
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
|
||||
@@ -2,339 +2,23 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// Group grouping array of alert
|
||||
type Group struct {
|
||||
Name string
|
||||
Rules []*Rule
|
||||
}
|
||||
|
||||
// Restore restores alerts state for all group rules with For > 0
|
||||
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
for _, rule := range g.Rules {
|
||||
if rule.For == 0 {
|
||||
return nil
|
||||
}
|
||||
if err := rule.Restore(ctx, q, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %s", rule.Name, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Rule is basic alert entity
|
||||
type Rule struct {
|
||||
Name string `yaml:"alert"`
|
||||
Expr string `yaml:"expr"`
|
||||
For time.Duration `yaml:"for"`
|
||||
Labels map[string]string `yaml:"labels"`
|
||||
Annotations map[string]string `yaml:"annotations"`
|
||||
|
||||
group Group
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
// Validate validates rule
|
||||
func (r *Rule) Validate() error {
|
||||
if r.Name == "" {
|
||||
return errors.New("rule name can not be empty")
|
||||
}
|
||||
if r.Expr == "" {
|
||||
return fmt.Errorf("expression for rule %q can't be empty", r.Name)
|
||||
}
|
||||
if _, err := metricsql.Parse(r.Expr); err != nil {
|
||||
return fmt.Errorf("invalid expression for rule %q: %w", r.Name, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Exec executes Rule expression via the given Querier.
|
||||
// Based on the Querier results Rule maintains notifier.Alerts
|
||||
func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
|
||||
qMetrics, err := q.Query(ctx, r.Expr)
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
r.lastExecError = err
|
||||
r.lastExecTime = time.Now()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to execute query %q: %s", r.Expr, err)
|
||||
}
|
||||
|
||||
for h, a := range r.alerts {
|
||||
// cleanup inactive alerts from previous Eval
|
||||
if a.State == notifier.StateInactive {
|
||||
delete(r.alerts, h)
|
||||
}
|
||||
}
|
||||
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
h := hash(m)
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := r.alerts[h]; ok {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Value
|
||||
continue
|
||||
}
|
||||
a, err := r.newAlert(m)
|
||||
if err != nil {
|
||||
r.lastExecError = err
|
||||
return fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
r.alerts[h] = a
|
||||
}
|
||||
|
||||
for h, a := range r.alerts {
|
||||
// if alert wasn't updated in this iteration
|
||||
// means it is resolved already
|
||||
if _, ok := updated[h]; !ok {
|
||||
a.State = notifier.StateInactive
|
||||
// set endTime to last execution time
|
||||
// so it can be sent by notifier on next step
|
||||
a.End = r.lastExecTime
|
||||
continue
|
||||
}
|
||||
if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
|
||||
a.State = notifier.StateFiring
|
||||
alertsFired.Inc()
|
||||
}
|
||||
if a.State == notifier.StateFiring {
|
||||
a.End = r.lastExecTime.Add(3 * *evaluationInterval)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(m datasource.Metric) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := m.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
|
||||
a := ¬ifier.Alert{
|
||||
Group: r.group.Name,
|
||||
Name: r.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Value,
|
||||
Start: time.Now(),
|
||||
// TODO: support End time
|
||||
}
|
||||
|
||||
// 1. use data labels
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
a.Labels[l.Name] = l.Value
|
||||
}
|
||||
|
||||
// 2. template rule labels with data labels
|
||||
rLabels, err := a.ExecTemplate(r.Labels)
|
||||
if err != nil {
|
||||
return a, err
|
||||
}
|
||||
|
||||
// 3. merge data labels and rule labels
|
||||
// metric labels may be overridden by
|
||||
// rule labels
|
||||
for k, v := range rLabels {
|
||||
a.Labels[k] = v
|
||||
}
|
||||
|
||||
// 4. template merged labels
|
||||
a.Labels, err = a.ExecTemplate(a.Labels)
|
||||
if err != nil {
|
||||
return a, err
|
||||
}
|
||||
|
||||
a.Annotations, err = a.ExecTemplate(r.Annotations)
|
||||
return a, err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (r *Rule) AlertAPI(id uint64) *APIAlert {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
a, ok := r.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return r.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// AlertsAPI generates list of APIAlert objects from existing alerts
|
||||
func (r *Rule) AlertsAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
r.mu.RLock()
|
||||
for _, a := range r.alerts {
|
||||
alerts = append(alerts, r.newAlertAPI(*a))
|
||||
}
|
||||
r.mu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (r *Rule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
return &APIAlert{
|
||||
// encode as string to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
Name: a.Name,
|
||||
Group: a.Group,
|
||||
Expression: r.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.Start,
|
||||
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
// AlertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
// AlertForStateMetricName is the metric name for 'for' state of alert.
|
||||
alertForStateMetricName = "ALERTS_FOR_STATE"
|
||||
|
||||
// AlertNameLabel is the label name indicating the name of an alert.
|
||||
alertNameLabel = "alertname"
|
||||
// AlertStateLabel is the label name indicating the state of an alert.
|
||||
alertStateLabel = "alertstate"
|
||||
)
|
||||
|
||||
// AlertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
func (r *Rule) AlertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(r.Name, a, timestamp))
|
||||
if r.For > 0 {
|
||||
tss = append(tss, alertForToTimeSeries(r.Name, a, timestamp))
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertMetricName
|
||||
labels[alertNameLabel] = name
|
||||
labels[alertStateLabel] = a.State.String()
|
||||
return newTimeSeries(1, labels, timestamp)
|
||||
}
|
||||
|
||||
// alertForToTimeSeries returns a timeseries that represents
|
||||
// state of active alerts, where value is time when alert become active
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
labels[alertNameLabel] = name
|
||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
}
|
||||
|
||||
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{}
|
||||
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
||||
Value: value,
|
||||
Timestamp: timestamp.UnixNano() / 1e6,
|
||||
})
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
||||
Name: key,
|
||||
Value: labels[key],
|
||||
})
|
||||
}
|
||||
return ts
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||
// to be updated on next Eval, as well as Value field.
|
||||
func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
// Get the last datapoint in range via MetricsQL `last_over_time`.
|
||||
// We don't use plain PromQL since Prometheus doesn't support
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
|
||||
alertForStateMetricName, r.Name, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
labels := m.Labels
|
||||
m.Labels = make([]datasource.Label, 0)
|
||||
// drop all extra labels, so hash key will
|
||||
// be identical to timeseries received in Eval
|
||||
for _, l := range labels {
|
||||
if l.Name == alertNameLabel {
|
||||
continue
|
||||
}
|
||||
// drop all overridden labels
|
||||
if _, ok := r.Labels[l.Name]; ok {
|
||||
continue
|
||||
}
|
||||
m.Labels = append(m.Labels, l)
|
||||
}
|
||||
|
||||
a, err := r.newAlert(m)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
a.ID = hash(m)
|
||||
a.State = notifier.StatePending
|
||||
a.Start = time.Unix(int64(m.Value), 0)
|
||||
r.alerts[a.ID] = a
|
||||
logger.Infof("alert %q.%q restored to state at %v", a.Group, a.Name, a.Start)
|
||||
}
|
||||
return nil
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// Returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// Exec executes the rule with given context
|
||||
// and Querier. If returnSeries is true, Exec
|
||||
// may return TimeSeries as result of execution
|
||||
Exec(ctx context.Context, q datasource.Querier, returnSeries bool) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
}
|
||||
|
||||
28
app/vmalert/utils.go
Normal file
28
app/vmalert/utils.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{}
|
||||
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
||||
Value: value,
|
||||
Timestamp: timestamp.UnixNano() / 1e6,
|
||||
})
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
||||
Name: key,
|
||||
Value: labels[key],
|
||||
})
|
||||
}
|
||||
return ts
|
||||
}
|
||||
43
app/vmalert/utils/err_group.go
Normal file
43
app/vmalert/utils/err_group.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ErrGroup accumulates multiple errors
|
||||
// and produces single error message.
|
||||
type ErrGroup struct {
|
||||
errs []error
|
||||
}
|
||||
|
||||
// Add adds a new error to group.
|
||||
// Isn't thread-safe.
|
||||
func (eg *ErrGroup) Add(err error) {
|
||||
eg.errs = append(eg.errs, err)
|
||||
}
|
||||
|
||||
// Err checks if group contains at least
|
||||
// one error.
|
||||
func (eg *ErrGroup) Err() error {
|
||||
if eg == nil || len(eg.errs) == 0 {
|
||||
return nil
|
||||
}
|
||||
return eg
|
||||
}
|
||||
|
||||
// Error satisfies Error interface
|
||||
func (eg *ErrGroup) Error() string {
|
||||
if len(eg.errs) == 0 {
|
||||
return ""
|
||||
}
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "errors(%d): ", len(eg.errs))
|
||||
for i, err := range eg.errs {
|
||||
b.WriteString(err.Error())
|
||||
if i != len(eg.errs)-1 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
38
app/vmalert/utils/err_group_test.go
Normal file
38
app/vmalert/utils/err_group_test.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestErrGroup(t *testing.T) {
|
||||
testCases := []struct {
|
||||
errs []error
|
||||
exp string
|
||||
}{
|
||||
{nil, ""},
|
||||
{[]error{errors.New("timeout")}, "errors(1): timeout"},
|
||||
{
|
||||
[]error{errors.New("timeout"), errors.New("deadline")},
|
||||
"errors(2): timeout\ndeadline",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
eg := new(ErrGroup)
|
||||
for _, err := range tc.errs {
|
||||
eg.Add(err)
|
||||
}
|
||||
if len(tc.errs) == 0 {
|
||||
if eg.Err() != nil {
|
||||
t.Fatalf("expected to get nil error")
|
||||
}
|
||||
continue
|
||||
}
|
||||
if eg.Err() == nil {
|
||||
t.Fatalf("expected to get non-nil error")
|
||||
}
|
||||
if eg.Error() != tc.exp {
|
||||
t.Fatalf("expected to have: \n%q\ngot:\n%q", tc.exp, eg.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
58
app/vmalert/utils/tls.go
Normal file
58
app/vmalert/utils/tls.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Transport creates http.Transport object based on provided URL.
|
||||
// Returns Transport with TLS configuration if URL contains `https` prefix
|
||||
func Transport(URL, certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*http.Transport, error) {
|
||||
t := http.DefaultTransport.(*http.Transport).Clone()
|
||||
if !strings.HasPrefix(URL, "https") {
|
||||
return t, nil
|
||||
}
|
||||
tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
t.TLSClientConfig = tlsCfg
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// TLSConfig creates tls.Config object from provided arguments
|
||||
func TLSConfig(certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*tls.Config, error) {
|
||||
var certs []tls.Certificate
|
||||
if certFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load TLS certificate from `cert_file`=%q, `key_file`=%q: %w", certFile, keyFile, err)
|
||||
}
|
||||
|
||||
certs = []tls.Certificate{cert}
|
||||
}
|
||||
|
||||
var rootCAs *x509.CertPool
|
||||
if CAFile != "" {
|
||||
pem, err := ioutil.ReadFile(CAFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read `ca_file` %q: %w", CAFile, err)
|
||||
}
|
||||
|
||||
rootCAs = x509.NewCertPool()
|
||||
if !rootCAs.AppendCertsFromPEM(pem) {
|
||||
return nil, fmt.Errorf("cannot parse data from `ca_file` %q", CAFile)
|
||||
}
|
||||
}
|
||||
|
||||
return &tls.Config{
|
||||
Certificates: certs,
|
||||
InsecureSkipVerify: insecureSkipVerify,
|
||||
RootCAs: rootCAs,
|
||||
ServerName: serverName,
|
||||
}, nil
|
||||
}
|
||||
52
app/vmalert/utils/tls_test.go
Normal file
52
app/vmalert/utils/tls_test.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package utils
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestTLSConfig(t *testing.T) {
|
||||
var certFile, keyFile, CAFile, serverName string
|
||||
var insecureSkipVerify bool
|
||||
serverName = "test"
|
||||
insecureSkipVerify = true
|
||||
tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if tlsCfg == nil {
|
||||
t.Errorf("expected tlsConfig to be set, got nil")
|
||||
}
|
||||
if tlsCfg.ServerName != serverName {
|
||||
t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)
|
||||
}
|
||||
if tlsCfg.InsecureSkipVerify != insecureSkipVerify {
|
||||
t.Errorf("unexpected InsecureSkipVerify, want %v, got %v", insecureSkipVerify, tlsCfg.InsecureSkipVerify)
|
||||
}
|
||||
certFile = "/path/to/nonexisting/cert/file"
|
||||
_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err == nil {
|
||||
t.Errorf("expected keypair error, got nil")
|
||||
}
|
||||
certFile = ""
|
||||
CAFile = "/path/to/nonexisting/cert/file"
|
||||
_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err == nil {
|
||||
t.Errorf("expected read error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransport(t *testing.T) {
|
||||
var certFile, keyFile, CAFile, serverName string
|
||||
var insecureSkipVerify bool
|
||||
URL := "http://victoriametrics.com"
|
||||
_, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
URL = "https://victoriametrics.com"
|
||||
tr, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if tr.TLSClientConfig == nil {
|
||||
t.Errorf("expected TLSClientConfig to be set, got nil")
|
||||
}
|
||||
}
|
||||
@@ -7,33 +7,23 @@ import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
)
|
||||
|
||||
// APIAlert has info for an alert.
|
||||
type APIAlert struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Group string `json:"group"`
|
||||
Expression string `json:"expression"`
|
||||
State string `json:"state"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
ActiveAt time.Time `json:"activeAt"`
|
||||
}
|
||||
|
||||
type requestHandler struct {
|
||||
groups []Group
|
||||
m *manager
|
||||
}
|
||||
|
||||
var pathList = [][]string{
|
||||
{"/api/v1/groups", "list all loaded groups and rules"},
|
||||
{"/api/v1/alerts", "list all active alerts"},
|
||||
{"/api/v1/groupName/alertID/status", "get alert status by ID"},
|
||||
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
|
||||
// /metrics is served by httpserver by default
|
||||
{"/metrics", "list of application metrics"},
|
||||
{"/-/reload", "reload configuration"},
|
||||
}
|
||||
|
||||
func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
@@ -45,8 +35,16 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
|
||||
}
|
||||
return true
|
||||
case "/api/v1/groups":
|
||||
resph.handle(rh.listGroups())
|
||||
return true
|
||||
case "/api/v1/alerts":
|
||||
resph.handle(rh.list())
|
||||
resph.handle(rh.listAlerts())
|
||||
return true
|
||||
case "/-/reload":
|
||||
logger.Infof("api config reload was called, sending sighup")
|
||||
procutil.SelfSIGHUP()
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return true
|
||||
default:
|
||||
// /api/v1/<groupName>/<alertID>/status
|
||||
@@ -58,6 +56,37 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
}
|
||||
|
||||
type listGroupsResponse struct {
|
||||
Data struct {
|
||||
Groups []APIGroup `json:"groups"`
|
||||
} `json:"data"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) listGroups() ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listGroupsResponse{Status: "success"}
|
||||
for _, g := range rh.m.groups {
|
||||
lr.Data.Groups = append(lr.Data.Groups, g.toAPI())
|
||||
}
|
||||
|
||||
// sort list of alerts for deterministic output
|
||||
sort.Slice(lr.Data.Groups, func(i, j int) bool {
|
||||
return lr.Data.Groups[i].Name < lr.Data.Groups[j].Name
|
||||
})
|
||||
|
||||
b, err := json.Marshal(lr)
|
||||
if err != nil {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
|
||||
StatusCode: http.StatusInternalServerError,
|
||||
}
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
type listAlertsResponse struct {
|
||||
Data struct {
|
||||
Alerts []*APIAlert `json:"alerts"`
|
||||
@@ -65,11 +94,18 @@ type listAlertsResponse struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) list() ([]byte, error) {
|
||||
func (rh *requestHandler) listAlerts() ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listAlertsResponse{Status: "success"}
|
||||
for _, g := range rh.groups {
|
||||
for _, g := range rh.m.groups {
|
||||
for _, r := range g.Rules {
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, r.AlertsAPI()...)
|
||||
a, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsAPI()...)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,7 +117,7 @@ func (rh *requestHandler) list() ([]byte, error) {
|
||||
b, err := json.Marshal(lr)
|
||||
if err != nil {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %s`, err),
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
|
||||
StatusCode: http.StatusInternalServerError,
|
||||
}
|
||||
}
|
||||
@@ -89,6 +125,9 @@ func (rh *requestHandler) list() ([]byte, error) {
|
||||
}
|
||||
|
||||
func (rh *requestHandler) alert(path string) ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
parts := strings.SplitN(strings.TrimPrefix(path, "/api/v1/"), "/", 3)
|
||||
if len(parts) != 3 {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
@@ -96,29 +135,20 @@ func (rh *requestHandler) alert(path string) ([]byte, error) {
|
||||
StatusCode: http.StatusBadRequest,
|
||||
}
|
||||
}
|
||||
group := strings.TrimRight(parts[0], "/")
|
||||
idStr := strings.TrimRight(parts[1], "/")
|
||||
id, err := strconv.ParseUint(idStr, 10, 0)
|
||||
|
||||
groupID, err := uint64FromPath(parts[0])
|
||||
if err != nil {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`cannot parse int from %q`, idStr),
|
||||
StatusCode: http.StatusBadRequest,
|
||||
}
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %w`, err))
|
||||
}
|
||||
for _, g := range rh.groups {
|
||||
if g.Name != group {
|
||||
continue
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
if apiAlert := rule.AlertAPI(id); apiAlert != nil {
|
||||
return json.Marshal(apiAlert)
|
||||
}
|
||||
}
|
||||
alertID, err := uint64FromPath(parts[1])
|
||||
if err != nil {
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %w`, err))
|
||||
}
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`cannot find alert %s in %q`, idStr, group),
|
||||
StatusCode: http.StatusNotFound,
|
||||
resp, err := rh.m.AlertAPI(groupID, alertID)
|
||||
if err != nil {
|
||||
return nil, errResponse(err, http.StatusNotFound)
|
||||
}
|
||||
return json.Marshal(resp)
|
||||
}
|
||||
|
||||
// responseHandler wrapper on http.ResponseWriter with sugar
|
||||
@@ -132,3 +162,19 @@ func (w responseHandler) handle(b []byte, err error) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.Write(b)
|
||||
}
|
||||
|
||||
func uint64FromPath(path string) (uint64, error) {
|
||||
s := strings.TrimRight(path, "/")
|
||||
return strconv.ParseUint(s, 10, 0)
|
||||
}
|
||||
|
||||
func badRequest(err error) *httpserver.ErrorWithStatusCode {
|
||||
return errResponse(err, http.StatusBadRequest)
|
||||
}
|
||||
|
||||
func errResponse(err error, sc int) *httpserver.ErrorWithStatusCode {
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: err,
|
||||
StatusCode: sc,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,18 +11,20 @@ import (
|
||||
)
|
||||
|
||||
func TestHandler(t *testing.T) {
|
||||
rule := &Rule{
|
||||
ar := &AlertingRule{
|
||||
Name: "alert",
|
||||
alerts: map[uint64]*notifier.Alert{
|
||||
0: {},
|
||||
},
|
||||
}
|
||||
rh := &requestHandler{
|
||||
groups: []Group{{
|
||||
Name: "group",
|
||||
Rules: []*Rule{rule},
|
||||
}},
|
||||
g := &Group{
|
||||
Name: "group",
|
||||
Rules: []Rule{ar},
|
||||
}
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m.groups[0] = g
|
||||
rh := &requestHandler{m: m}
|
||||
|
||||
getResp := func(url string, to interface{}, code int) {
|
||||
t.Helper()
|
||||
resp, err := http.Get(url)
|
||||
@@ -52,19 +54,26 @@ func TestHandler(t *testing.T) {
|
||||
t.Errorf("expected 1 alert got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/group/0/status", func(t *testing.T) {
|
||||
t.Run("/api/v1/groups", func(t *testing.T) {
|
||||
lr := listGroupsResponse{}
|
||||
getResp(ts.URL+"/api/v1/groups", &lr, 200)
|
||||
if length := len(lr.Data.Groups); length != 1 {
|
||||
t.Errorf("expected 1 group got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/0/0/status", func(t *testing.T) {
|
||||
alert := &APIAlert{}
|
||||
getResp(ts.URL+"/api/v1/group/0/status", alert, 200)
|
||||
expAlert := rule.newAlertAPI(*rule.alerts[0])
|
||||
getResp(ts.URL+"/api/v1/0/0/status", alert, 200)
|
||||
expAlert := ar.newAlertAPI(*ar.alerts[0])
|
||||
if !reflect.DeepEqual(alert, expAlert) {
|
||||
t.Errorf("expected %v is equal to %v", alert, expAlert)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/group/1/status", func(t *testing.T) {
|
||||
getResp(ts.URL+"/api/v1/group/1/status", nil, 404)
|
||||
t.Run("/api/v1/0/1/status", func(t *testing.T) {
|
||||
getResp(ts.URL+"/api/v1/0/1/status", nil, 404)
|
||||
})
|
||||
t.Run("/api/v1/unknown-group/0/status", func(t *testing.T) {
|
||||
getResp(ts.URL+"/api/v1/unknown-group/0/status", nil, 404)
|
||||
t.Run("/api/v1/1/0/status", func(t *testing.T) {
|
||||
getResp(ts.URL+"/api/v1/1/0/status", nil, 404)
|
||||
})
|
||||
t.Run("/", func(t *testing.T) {
|
||||
getResp(ts.URL, nil, 200)
|
||||
|
||||
54
app/vmalert/web_types.go
Normal file
54
app/vmalert/web_types.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// APIAlert represents an notifier.AlertingRule state
|
||||
// for WEB view
|
||||
type APIAlert struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
State string `json:"state"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
ActiveAt time.Time `json:"activeAt"`
|
||||
}
|
||||
|
||||
// APIGroup represents Group for WEB view
|
||||
type APIGroup struct {
|
||||
Name string `json:"name"`
|
||||
ID string `json:"id"`
|
||||
File string `json:"file"`
|
||||
Interval string `json:"interval"`
|
||||
Concurrency int `json:"concurrency"`
|
||||
AlertingRules []APIAlertingRule `json:"alerting_rules"`
|
||||
RecordingRules []APIRecordingRule `json:"recording_rules"`
|
||||
}
|
||||
|
||||
// APIAlertingRule represents AlertingRule for WEB view
|
||||
type APIAlertingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
For string `json:"for"`
|
||||
LastError string `json:"last_error"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
}
|
||||
|
||||
// APIRecordingRule represents RecordingRule for WEB view
|
||||
type APIRecordingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
LastError string `json:"last_error"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
}
|
||||
@@ -53,6 +53,8 @@ publish-vmauth:
|
||||
|
||||
run-vmauth:
|
||||
APP_NAME=vmauth \
|
||||
DOCKER_OPTS='-v $(shell pwd)/app/vmauth/:/app/vmauth' \
|
||||
ARGS='-auth.config=app/vmauth/example_config.yml' \
|
||||
$(MAKE) run-via-docker
|
||||
|
||||
vmauth-amd64:
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
## vmauth
|
||||
|
||||
`vmauth` is a simple auth proxy and router. It reads username and password from [Basic Auth headers](https://en.wikipedia.org/wiki/Basic_access_authentication)
|
||||
and matches them against configs pointed by `-auth.config` command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
|
||||
`vmauth` is a simple auth proxy and router for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It reads username and password from [Basic Auth headers](https://en.wikipedia.org/wiki/Basic_access_authentication),
|
||||
matches them against configs pointed by `-auth.config` command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
|
||||
|
||||
|
||||
### Quick start
|
||||
@@ -18,8 +19,12 @@ The port can be modified via `-httpListenAddr` command-line flag.
|
||||
|
||||
The auth config can be reloaded by passing `SIGHUP` signal to `vmauth`.
|
||||
|
||||
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
|
||||
|
||||
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
|
||||
|
||||
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML, accounting, limits, etc.
|
||||
|
||||
|
||||
### Auth config
|
||||
|
||||
@@ -32,22 +37,31 @@ Auth config is represented in the following simple `yml` format:
|
||||
|
||||
users:
|
||||
|
||||
# The user for querying local single-node VictoriaMetrics
|
||||
# The user for querying local single-node VictoriaMetrics.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vmselect:8481/select/123/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
|
||||
- username: "cluster-select-account-123"
|
||||
password: "***"
|
||||
url_prefix: "http://vmselect:8481/select/123/prometheus"
|
||||
|
||||
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the reuqests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vminsert:8480/insert/42/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "cluster-insert-account-42"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8480/insert/42/prometheus"
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
```
|
||||
|
||||
|
||||
@@ -96,9 +110,30 @@ Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker i
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmauth
|
||||
ROOT_IMAGE=scratch make package-vmauth
|
||||
```
|
||||
|
||||
|
||||
### Profiling
|
||||
|
||||
`vmauth` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
* Memory profile. It can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmauth-host>:8427/debug/pprof/heap > mem.pprof
|
||||
```
|
||||
|
||||
* CPU profile. It can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmauth-host>:8427/debug/pprof/profile > cpu.pprof
|
||||
```
|
||||
|
||||
The command for collecting CPU profile waits for 30 seconds before returning.
|
||||
|
||||
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).
|
||||
|
||||
@@ -36,11 +36,11 @@ type UserInfo struct {
|
||||
|
||||
func initAuthConfig() {
|
||||
if len(*authConfigPath) == 0 {
|
||||
logger.Panicf("FATAL: missing required `-auth.config` command-line flag")
|
||||
logger.Fatalf("missing required `-auth.config` command-line flag")
|
||||
}
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
|
||||
logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
|
||||
}
|
||||
authConfig.Store(m)
|
||||
stopCh = make(chan struct{})
|
||||
@@ -63,12 +63,14 @@ func authConfigReloader() {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-sighupCh:
|
||||
logger.Infof("SIGHUP received; loading -auth.config=%q", *authConfigPath)
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Errorf("failed to load auth config; using the last successfully loaded config; error: %s", err)
|
||||
logger.Errorf("failed to load -auth.config=%q; using the last successfully loaded config; error: %s", *authConfigPath, err)
|
||||
continue
|
||||
}
|
||||
authConfig.Store(m)
|
||||
logger.Infof("Successfully reloaded -auth.config=%q", *authConfigPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -80,11 +82,11 @@ var stopCh chan struct{}
|
||||
func readAuthConfig(path string) (map[string]*UserInfo, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read %q: %s", path, err)
|
||||
return nil, fmt.Errorf("cannot read %q: %w", path, err)
|
||||
}
|
||||
m, err := parseAuthConfig(data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse %q: %s", path, err)
|
||||
return nil, fmt.Errorf("cannot parse %q: %w", path, err)
|
||||
}
|
||||
logger.Infof("Loaded information about %d users from %q", len(m), path)
|
||||
return m, nil
|
||||
@@ -93,7 +95,7 @@ func readAuthConfig(path string) (map[string]*UserInfo, error) {
|
||||
func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
|
||||
var ac AuthConfig
|
||||
if err := yaml.UnmarshalStrict(data, &ac); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %s", err)
|
||||
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %w", err)
|
||||
}
|
||||
uis := ac.Users
|
||||
if len(uis) == 0 {
|
||||
@@ -113,7 +115,7 @@ func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
|
||||
// Validate urlPrefix
|
||||
target, err := url.Parse(urlPrefix)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid `url_prefix: %q`: %s", urlPrefix, err)
|
||||
return nil, fmt.Errorf("invalid `url_prefix: %q`: %w", urlPrefix, err)
|
||||
}
|
||||
if target.Scheme != "http" && target.Scheme != "https" {
|
||||
return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, target.Scheme)
|
||||
|
||||
31
app/vmauth/example_config.yml
Normal file
31
app/vmauth/example_config.yml
Normal file
@@ -0,0 +1,31 @@
|
||||
# Arbitrary number of usernames may be put here.
|
||||
# Usernames must be unique.
|
||||
|
||||
users:
|
||||
|
||||
# The user for querying local single-node VictoriaMetrics.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vmselect:8481/select/123/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
|
||||
- username: "cluster-select-account-123"
|
||||
password: "***"
|
||||
url_prefix: "http://vmselect:8481/select/123/prometheus"
|
||||
|
||||
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the reuqests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vminsert:8480/insert/42/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "cluster-insert-account-42"
|
||||
password: "***"
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
|
||||
@@ -2,9 +2,11 @@ package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"net/url"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
@@ -19,6 +21,9 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
@@ -55,7 +60,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
info.requests.Inc()
|
||||
|
||||
targetURL := info.URLPrefix + r.RequestURI
|
||||
targetURL := createTargetURL(info.URLPrefix, r.URL)
|
||||
if _, err := url.Parse(targetURL); err != nil {
|
||||
httpserver.Errorf(w, "Invalid targetURL=%q: %s", targetURL, err)
|
||||
return true
|
||||
@@ -74,6 +79,26 @@ var reverseProxy = &httputil.ReverseProxy{
|
||||
}
|
||||
r.URL = target
|
||||
},
|
||||
Transport: func() *http.Transport {
|
||||
tr := http.DefaultTransport.(*http.Transport).Clone()
|
||||
// Automatic compression must be disabled in order to fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
|
||||
tr.DisableCompression = true
|
||||
// Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this).
|
||||
tr.ForceAttemptHTTP2 = false
|
||||
return tr
|
||||
}(),
|
||||
FlushInterval: time.Second,
|
||||
ErrorLog: logger.StdErrorLogger(),
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
16
app/vmauth/target_url.go
Normal file
16
app/vmauth/target_url.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"path"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func createTargetURL(prefix string, u *url.URL) string {
|
||||
// Prevent from attacks with using `..` in r.URL.Path
|
||||
u.Path = path.Clean(u.Path)
|
||||
if !strings.HasPrefix(u.Path, "/") {
|
||||
u.Path = "/" + u.Path
|
||||
}
|
||||
return prefix + u.RequestURI()
|
||||
}
|
||||
26
app/vmauth/target_url_test.go
Normal file
26
app/vmauth/target_url_test.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestCreateTargetURL(t *testing.T) {
|
||||
f := func(prefix, requestURI, expectedTarget string) {
|
||||
t.Helper()
|
||||
u, err := url.Parse(requestURI)
|
||||
if err != nil {
|
||||
t.Fatalf("cannot parse %q: %s", requestURI, err)
|
||||
}
|
||||
target := createTargetURL(prefix, u)
|
||||
if target != expectedTarget {
|
||||
t.Fatalf("unexpected target; got %q; want %q", target, expectedTarget)
|
||||
}
|
||||
}
|
||||
f("http://foo.bar", "", "http://foo.bar/.")
|
||||
f("http://foo.bar", "/", "http://foo.bar/")
|
||||
f("http://foo.bar", "a/b?c=d", "http://foo.bar/a/b?c=d")
|
||||
f("https://sss:3894/x/y", "/z", "https://sss:3894/x/y/z")
|
||||
f("https://sss:3894/x/y", "/../../aaa", "https://sss:3894/x/y/aaa")
|
||||
f("https://sss:3894/x/y", "/./asd/../../aaa?a=d&s=s/../d", "https://sss:3894/x/y/aaa?a=d&s=s/../d")
|
||||
}
|
||||
@@ -19,6 +19,9 @@ Backed up data can be restored with [vmrestore](https://github.com/VictoriaMetri
|
||||
|
||||
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
|
||||
|
||||
See also [vmbackuper](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) tool built on top of `vmbackup`. This tool simplifies
|
||||
creation of hourly, daily, weekly and monthly backups.
|
||||
|
||||
|
||||
### Use cases
|
||||
|
||||
@@ -86,6 +89,8 @@ or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when
|
||||
|
||||
Do not forget removing old snapshots and backups when they are no longer needed for saving storage costs.
|
||||
|
||||
See also [vmbackuper tool](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for automating smart backups.
|
||||
|
||||
|
||||
### How does it work?
|
||||
|
||||
@@ -118,6 +123,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
|
||||
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
|
||||
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
|
||||
* Backups created from [single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md) cannot be restored
|
||||
at [cluster VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md) and vice versa.
|
||||
|
||||
|
||||
### Advanced usage
|
||||
@@ -194,9 +201,9 @@ Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` dock
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmbackup
|
||||
ROOT_IMAGE=scratch make package-vmbackup
|
||||
```
|
||||
|
||||
@@ -31,6 +31,8 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
@@ -108,12 +110,12 @@ func newSrcFS() (*fslocal.FS, error) {
|
||||
// Verify the snapshot exists.
|
||||
f, err := os.Open(snapshotPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open snapshot at %q: %s", snapshotPath, err)
|
||||
return nil, fmt.Errorf("cannot open snapshot at %q: %w", snapshotPath, err)
|
||||
}
|
||||
fi, err := f.Stat()
|
||||
_ = f.Close()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot stat %q: %s", snapshotPath, err)
|
||||
return nil, fmt.Errorf("cannot stat %q: %w", snapshotPath, err)
|
||||
}
|
||||
if !fi.IsDir() {
|
||||
return nil, fmt.Errorf("snapshot %q must be a directory", snapshotPath)
|
||||
@@ -124,7 +126,7 @@ func newSrcFS() (*fslocal.FS, error) {
|
||||
MaxBytesPerSecond: *maxBytesPerSecond,
|
||||
}
|
||||
if err := fs.Init(); err != nil {
|
||||
return nil, fmt.Errorf("cannot initialize fs: %s", err)
|
||||
return nil, fmt.Errorf("cannot initialize fs: %w", err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -132,7 +134,7 @@ func newSrcFS() (*fslocal.FS, error) {
|
||||
func newDstFS() (common.RemoteFS, error) {
|
||||
fs, err := actions.NewRemoteFS(*dst)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-dst`=%q: %s", *dst, err)
|
||||
return nil, fmt.Errorf("cannot parse `-dst`=%q: %w", *dst, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -143,7 +145,7 @@ func newOriginFS() (common.RemoteFS, error) {
|
||||
}
|
||||
fs, err := actions.NewRemoteFS(*origin)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-origin`=%q: %s", *origin, err)
|
||||
return nil, fmt.Errorf("cannot parse `-origin`=%q: %w", *origin, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
|
||||
@@ -122,7 +122,7 @@ func (ctx *InsertCtx) AddLabel(name, value string) {
|
||||
func (ctx *InsertCtx) FlushBufs() error {
|
||||
if err := vmstorage.AddRows(ctx.mrs); err != nil {
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf("cannot store metrics: %s", err),
|
||||
Err: fmt.Errorf("cannot store metrics: %w", err),
|
||||
StatusCode: http.StatusServiceUnavailable,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/csvimport"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite"
|
||||
@@ -160,4 +161,14 @@ var (
|
||||
promscrapeTargetsRequests = metrics.NewCounter(`vm_http_requests_total{path="/targets"}`)
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vm_http_requests_total{path="/-/reload"}`)
|
||||
|
||||
_ = metrics.NewGauge(`vm_metrics_with_dropped_labels_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.MetricsWithDroppedLabels))
|
||||
})
|
||||
_ = metrics.NewGauge(`vm_too_long_label_names_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.TooLongLabelNames))
|
||||
})
|
||||
_ = metrics.NewGauge(`vm_too_long_label_values_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.TooLongLabelValues))
|
||||
})
|
||||
)
|
||||
|
||||
@@ -21,7 +21,8 @@ vmrestore -src=gcs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/r
|
||||
* `<local/path/to/restore>` is the path to folder where data will be restored. This folder must be passed
|
||||
to VictoriaMetrics in `-storageDataPath` command-line flag after the restore process is complete.
|
||||
|
||||
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup.
|
||||
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup,
|
||||
i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/questions/476041/how-do-i-make-rsync-delete-files-that-have-been-deleted-from-the-source-folder).
|
||||
|
||||
|
||||
### Troubleshooting
|
||||
@@ -52,7 +53,7 @@ Run `vmrestore -help` in order to see all the available options:
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot data files bigger than 2^32 bytes in memory
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
@@ -68,8 +69,8 @@ Run `vmrestore -help` in order to see all the available options:
|
||||
-src string
|
||||
Source path with backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
-storageDataPath string
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup (default "victoria-metrics-data")
|
||||
-version
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir is synchronized with -src contents, i.e. it works like 'rsync --delete' (default "victoria-metrics-data")
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
@@ -97,9 +98,9 @@ Run `make package-vmrestore`. It builds `victoriametrics/vmrestore:<PKG_TAG>` do
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmrestore`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-vmrestore
|
||||
ROOT_IMAGE=scratch make package-vmrestore
|
||||
```
|
||||
|
||||
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
|
||||
@@ -16,13 +17,16 @@ var (
|
||||
src = flag.String("src", "", "Source path with backup on the remote storage. "+
|
||||
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir")
|
||||
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Destination path where backup must be restored. "+
|
||||
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup")
|
||||
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir "+
|
||||
"is synchronized with -src contents, i.e. it works like 'rsync --delete'")
|
||||
concurrency = flag.Int("concurrency", 10, "The number of concurrent workers. Higher concurrency may reduce restore duration")
|
||||
maxBytesPerSecond = flag.Int("maxBytesPerSecond", 0, "The maximum download speed. There is no limit if it is set to 0")
|
||||
skipBackupCompleteCheck = flag.Bool("skipBackupCompleteCheck", false, "Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file")
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
@@ -67,7 +71,7 @@ func newDstFS() (*fslocal.FS, error) {
|
||||
MaxBytesPerSecond: *maxBytesPerSecond,
|
||||
}
|
||||
if err := fs.Init(); err != nil {
|
||||
return nil, fmt.Errorf("cannot initialize local fs: %s", err)
|
||||
return nil, fmt.Errorf("cannot initialize local fs: %w", err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -75,7 +79,7 @@ func newDstFS() (*fslocal.FS, error) {
|
||||
func newSrcFS() (common.RemoteFS, error) {
|
||||
fs, err := actions.NewRemoteFS(*src)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-src`=%q: %s", *src, err)
|
||||
return nil, fmt.Errorf("cannot parse `-src`=%q: %w", *src, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package vmselect
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
@@ -240,7 +241,8 @@ func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) {
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
statusCode := http.StatusUnprocessableEntity
|
||||
if esc, ok := err.(*httpserver.ErrorWithStatusCode); ok {
|
||||
var esc *httpserver.ErrorWithStatusCode
|
||||
if errors.As(err, &esc) {
|
||||
statusCode = esc.StatusCode
|
||||
}
|
||||
w.WriteHeader(statusCode)
|
||||
|
||||
@@ -7,13 +7,12 @@ import (
|
||||
"runtime"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
@@ -72,6 +71,50 @@ func (rss *Results) mustClose() {
|
||||
rss.sr = nil
|
||||
}
|
||||
|
||||
var timeseriesWorkCh = make(chan *timeseriesWork, gomaxprocs)
|
||||
|
||||
type timeseriesWork struct {
|
||||
rss *Results
|
||||
pts *packedTimeseries
|
||||
f func(rs *Result, workerID uint)
|
||||
doneCh chan error
|
||||
|
||||
rowsProcessed int
|
||||
}
|
||||
|
||||
func init() {
|
||||
for i := 0; i < gomaxprocs; i++ {
|
||||
go timeseriesWorker(uint(i))
|
||||
}
|
||||
}
|
||||
|
||||
func timeseriesWorker(workerID uint) {
|
||||
var rs Result
|
||||
var rsLastResetTime uint64
|
||||
for tsw := range timeseriesWorkCh {
|
||||
rss := tsw.rss
|
||||
if time.Until(rss.deadline.Deadline) < 0 {
|
||||
tsw.doneCh <- fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
|
||||
continue
|
||||
}
|
||||
if err := tsw.pts.Unpack(&rs, rss.tr, rss.fetchData); err != nil {
|
||||
tsw.doneCh <- fmt.Errorf("error during time series unpacking: %w", err)
|
||||
continue
|
||||
}
|
||||
if len(rs.Timestamps) > 0 || !rss.fetchData {
|
||||
tsw.f(&rs, workerID)
|
||||
}
|
||||
tsw.rowsProcessed = len(rs.Values)
|
||||
tsw.doneCh <- nil
|
||||
currentTime := fasttime.UnixTimestamp()
|
||||
if cap(rs.Values) > 1024*1024 && 4*len(rs.Values) < cap(rs.Values) && currentTime-rsLastResetTime > 10 {
|
||||
// Reset rs in order to preseve memory usage after processing big time series with millions of rows.
|
||||
rs = Result{}
|
||||
rsLastResetTime = currentTime
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RunParallel runs in parallel f for all the results from rss.
|
||||
//
|
||||
// f shouldn't hold references to rs after returning.
|
||||
@@ -81,72 +124,36 @@ func (rss *Results) mustClose() {
|
||||
func (rss *Results) RunParallel(f func(rs *Result, workerID uint)) error {
|
||||
defer rss.mustClose()
|
||||
|
||||
workersCount := 1 + len(rss.packedTimeseries)/32
|
||||
if workersCount > gomaxprocs {
|
||||
workersCount = gomaxprocs
|
||||
}
|
||||
if workersCount == 0 {
|
||||
logger.Panicf("BUG: workersCount cannot be zero")
|
||||
}
|
||||
workCh := make(chan *packedTimeseries, workersCount)
|
||||
doneCh := make(chan error)
|
||||
|
||||
// Start workers.
|
||||
rowsProcessedTotal := uint64(0)
|
||||
for i := 0; i < workersCount; i++ {
|
||||
go func(workerID uint) {
|
||||
rs := getResult()
|
||||
defer putResult(rs)
|
||||
maxWorkersCount := gomaxprocs / workersCount
|
||||
|
||||
var err error
|
||||
rowsProcessed := 0
|
||||
for pts := range workCh {
|
||||
if time.Until(rss.deadline.Deadline) < 0 {
|
||||
err = fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
|
||||
break
|
||||
}
|
||||
if err = pts.Unpack(rs, rss.tr, rss.fetchData, maxWorkersCount); err != nil {
|
||||
break
|
||||
}
|
||||
if len(rs.Timestamps) == 0 && rss.fetchData {
|
||||
// Skip empty blocks.
|
||||
continue
|
||||
}
|
||||
rowsProcessed += len(rs.Values)
|
||||
f(rs, workerID)
|
||||
}
|
||||
atomic.AddUint64(&rowsProcessedTotal, uint64(rowsProcessed))
|
||||
// Drain the remaining work
|
||||
for range workCh {
|
||||
}
|
||||
doneCh <- err
|
||||
}(uint(i))
|
||||
}
|
||||
|
||||
// Feed workers with work.
|
||||
tsws := make([]*timeseriesWork, len(rss.packedTimeseries))
|
||||
for i := range rss.packedTimeseries {
|
||||
workCh <- &rss.packedTimeseries[i]
|
||||
tsw := ×eriesWork{
|
||||
rss: rss,
|
||||
pts: &rss.packedTimeseries[i],
|
||||
f: f,
|
||||
doneCh: make(chan error, 1),
|
||||
}
|
||||
timeseriesWorkCh <- tsw
|
||||
tsws[i] = tsw
|
||||
}
|
||||
seriesProcessedTotal := len(rss.packedTimeseries)
|
||||
rss.packedTimeseries = rss.packedTimeseries[:0]
|
||||
close(workCh)
|
||||
|
||||
// Wait until workers finish.
|
||||
var errors []error
|
||||
for i := 0; i < workersCount; i++ {
|
||||
if err := <-doneCh; err != nil {
|
||||
errors = append(errors, err)
|
||||
// Wait until work is complete.
|
||||
var firstErr error
|
||||
rowsProcessedTotal := 0
|
||||
for _, tsw := range tsws {
|
||||
if err := <-tsw.doneCh; err != nil && firstErr == nil {
|
||||
// Return just the first error, since other errors
|
||||
// are likely duplicate the first error.
|
||||
firstErr = err
|
||||
}
|
||||
rowsProcessedTotal += tsw.rowsProcessed
|
||||
}
|
||||
|
||||
perQueryRowsProcessed.Update(float64(rowsProcessedTotal))
|
||||
perQuerySeriesProcessed.Update(float64(seriesProcessedTotal))
|
||||
if len(errors) > 0 {
|
||||
// Return just the first error, since other errors
|
||||
// is likely duplicate the first error.
|
||||
return errors[0]
|
||||
}
|
||||
return nil
|
||||
return firstErr
|
||||
}
|
||||
|
||||
var perQueryRowsProcessed = metrics.NewHistogram(`vm_per_query_rows_processed_count`)
|
||||
@@ -159,70 +166,74 @@ type packedTimeseries struct {
|
||||
brs []storage.BlockRef
|
||||
}
|
||||
|
||||
var unpackWorkCh = make(chan *unpackWork, gomaxprocs)
|
||||
|
||||
type unpackWork struct {
|
||||
br storage.BlockRef
|
||||
tr storage.TimeRange
|
||||
fetchData bool
|
||||
doneCh chan error
|
||||
sb *sortBlock
|
||||
}
|
||||
|
||||
func init() {
|
||||
for i := 0; i < gomaxprocs; i++ {
|
||||
go unpackWorker()
|
||||
}
|
||||
}
|
||||
|
||||
func unpackWorker() {
|
||||
for upw := range unpackWorkCh {
|
||||
sb := getSortBlock()
|
||||
if err := sb.unpackFrom(upw.br, upw.tr, upw.fetchData); err != nil {
|
||||
putSortBlock(sb)
|
||||
upw.doneCh <- fmt.Errorf("cannot unpack block: %w", err)
|
||||
continue
|
||||
}
|
||||
upw.sb = sb
|
||||
upw.doneCh <- nil
|
||||
}
|
||||
}
|
||||
|
||||
// Unpack unpacks pts to dst.
|
||||
func (pts *packedTimeseries) Unpack(dst *Result, tr storage.TimeRange, fetchData bool, maxWorkersCount int) error {
|
||||
func (pts *packedTimeseries) Unpack(dst *Result, tr storage.TimeRange, fetchData bool) error {
|
||||
dst.reset()
|
||||
|
||||
if err := dst.MetricName.Unmarshal(bytesutil.ToUnsafeBytes(pts.metricName)); err != nil {
|
||||
return fmt.Errorf("cannot unmarshal metricName %q: %s", pts.metricName, err)
|
||||
}
|
||||
|
||||
workersCount := 1 + len(pts.brs)/32
|
||||
if workersCount > maxWorkersCount {
|
||||
workersCount = maxWorkersCount
|
||||
}
|
||||
if workersCount == 0 {
|
||||
logger.Panicf("BUG: workersCount cannot be zero")
|
||||
}
|
||||
|
||||
sbs := make([]*sortBlock, 0, len(pts.brs))
|
||||
var sbsLock sync.Mutex
|
||||
|
||||
workCh := make(chan storage.BlockRef, workersCount)
|
||||
doneCh := make(chan error)
|
||||
|
||||
// Start workers
|
||||
for i := 0; i < workersCount; i++ {
|
||||
go func() {
|
||||
var err error
|
||||
for br := range workCh {
|
||||
sb := getSortBlock()
|
||||
if err = sb.unpackFrom(br, tr, fetchData); err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
sbsLock.Lock()
|
||||
sbs = append(sbs, sb)
|
||||
sbsLock.Unlock()
|
||||
}
|
||||
|
||||
// Drain the remaining work
|
||||
for range workCh {
|
||||
}
|
||||
doneCh <- err
|
||||
}()
|
||||
return fmt.Errorf("cannot unmarshal metricName %q: %w", pts.metricName, err)
|
||||
}
|
||||
|
||||
// Feed workers with work
|
||||
for _, br := range pts.brs {
|
||||
workCh <- br
|
||||
upws := make([]*unpackWork, len(pts.brs))
|
||||
for i, br := range pts.brs {
|
||||
upw := &unpackWork{
|
||||
br: br,
|
||||
tr: tr,
|
||||
fetchData: fetchData,
|
||||
doneCh: make(chan error, 1),
|
||||
}
|
||||
unpackWorkCh <- upw
|
||||
upws[i] = upw
|
||||
}
|
||||
pts.brs = pts.brs[:0]
|
||||
close(workCh)
|
||||
|
||||
// Wait until workers finish
|
||||
var errors []error
|
||||
for i := 0; i < workersCount; i++ {
|
||||
if err := <-doneCh; err != nil {
|
||||
errors = append(errors, err)
|
||||
// Wait until work is complete
|
||||
sbs := make([]*sortBlock, 0, len(pts.brs))
|
||||
var firstErr error
|
||||
for _, upw := range upws {
|
||||
if err := <-upw.doneCh; err != nil && firstErr == nil {
|
||||
// Return the first error only, since other errors are likely the same.
|
||||
firstErr = err
|
||||
}
|
||||
if firstErr == nil {
|
||||
sbs = append(sbs, upw.sb)
|
||||
} else {
|
||||
putSortBlock(upw.sb)
|
||||
}
|
||||
}
|
||||
if len(errors) > 0 {
|
||||
// Return the first error only, since other errors are likely the same.
|
||||
return errors[0]
|
||||
if firstErr != nil {
|
||||
return firstErr
|
||||
}
|
||||
|
||||
// Merge blocks
|
||||
mergeSortBlocks(dst, sbs)
|
||||
return nil
|
||||
}
|
||||
@@ -318,7 +329,7 @@ func (sb *sortBlock) unpackFrom(br storage.BlockRef, tr storage.TimeRange, fetch
|
||||
br.MustReadBlock(&sb.b, fetchData)
|
||||
if fetchData {
|
||||
if err := sb.b.UnmarshalData(); err != nil {
|
||||
return fmt.Errorf("cannot unmarshal block: %s", err)
|
||||
return fmt.Errorf("cannot unmarshal block: %w", err)
|
||||
}
|
||||
}
|
||||
timestamps := sb.b.Timestamps()
|
||||
@@ -387,7 +398,7 @@ func DeleteSeries(sq *storage.SearchQuery) (int, error) {
|
||||
func GetLabels(deadline Deadline) ([]string, error) {
|
||||
labels, err := vmstorage.SearchTagKeys(*maxTagKeysPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during labels search: %s", err)
|
||||
return nil, fmt.Errorf("error during labels search: %w", err)
|
||||
}
|
||||
|
||||
// Substitute "" with "__name__"
|
||||
@@ -413,7 +424,7 @@ func GetLabelValues(labelName string, deadline Deadline) ([]string, error) {
|
||||
// Search for tag values
|
||||
labelValues, err := vmstorage.SearchTagValues([]byte(labelName), *maxTagValuesPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during label values search for labelName=%q: %s", labelName, err)
|
||||
return nil, fmt.Errorf("error during label values search for labelName=%q: %w", labelName, err)
|
||||
}
|
||||
|
||||
// Sort labelValues like Prometheus does
|
||||
@@ -426,7 +437,7 @@ func GetLabelValues(labelName string, deadline Deadline) ([]string, error) {
|
||||
func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
|
||||
labelEntries, err := vmstorage.SearchTagEntries(*maxTagKeysPerSearch, *maxTagValuesPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during label entries request: %s", err)
|
||||
return nil, fmt.Errorf("error during label entries request: %w", err)
|
||||
}
|
||||
|
||||
// Substitute "" with "__name__"
|
||||
@@ -453,7 +464,7 @@ func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
|
||||
func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TSDBStatus, error) {
|
||||
status, err := vmstorage.GetTSDBStatusForDate(date, topN)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during tsdb status request: %s", err)
|
||||
return nil, fmt.Errorf("error during tsdb status request: %w", err)
|
||||
}
|
||||
return status, nil
|
||||
}
|
||||
@@ -462,7 +473,7 @@ func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TS
|
||||
func GetSeriesCount(deadline Deadline) (uint64, error) {
|
||||
n, err := vmstorage.GetSeriesCount()
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("error during series count request: %s", err)
|
||||
return 0, fmt.Errorf("error during series count request: %w", err)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
@@ -495,6 +506,9 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
|
||||
MinTimestamp: sq.MinTimestamp,
|
||||
MaxTimestamp: sq.MaxTimestamp,
|
||||
}
|
||||
if err := vmstorage.CheckTimeRange(tr); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
vmstorage.WG.Add(1)
|
||||
defer vmstorage.WG.Done()
|
||||
@@ -518,7 +532,7 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
|
||||
m[string(metricName)] = append(brs, *sr.MetricBlockRef.BlockRef)
|
||||
}
|
||||
if err := sr.Error(); err != nil {
|
||||
return nil, fmt.Errorf("search error after reading %d data blocks: %s", blocksRead, err)
|
||||
return nil, fmt.Errorf("search error after reading %d data blocks: %w", blocksRead, err)
|
||||
}
|
||||
|
||||
var rss Results
|
||||
@@ -537,25 +551,6 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
|
||||
return &rss, nil
|
||||
}
|
||||
|
||||
func getResult() *Result {
|
||||
v := rsPool.Get()
|
||||
if v == nil {
|
||||
return &Result{}
|
||||
}
|
||||
return v.(*Result)
|
||||
}
|
||||
|
||||
func putResult(rs *Result) {
|
||||
if len(rs.Values) > 8192 {
|
||||
// Do not pool big results, since they may occupy too much memory.
|
||||
return
|
||||
}
|
||||
rs.reset()
|
||||
rsPool.Put(rs)
|
||||
}
|
||||
|
||||
var rsPool sync.Pool
|
||||
|
||||
func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error) {
|
||||
tfss := make([]*storage.TagFilters, 0, len(tagFilterss))
|
||||
for _, tagFilters := range tagFilterss {
|
||||
@@ -563,7 +558,7 @@ func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error)
|
||||
for i := range tagFilters {
|
||||
tf := &tagFilters[i]
|
||||
if err := tfs.Add(tf.Key, tf.Value, tf.IsNegative, tf.IsRegexp); err != nil {
|
||||
return nil, fmt.Errorf("cannot parse tag filter %s: %s", tf, err)
|
||||
return nil, fmt.Errorf("cannot parse tag filter %s: %w", tf, err)
|
||||
}
|
||||
}
|
||||
tfss = append(tfss, tfs)
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -45,7 +46,7 @@ const defaultStep = 5 * 60 * 1000
|
||||
func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
|
||||
ct := currentTime()
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse request form values: %s", err)
|
||||
return fmt.Errorf("cannot parse request form values: %w", err)
|
||||
}
|
||||
matches := r.Form["match[]"]
|
||||
if len(matches) == 0 {
|
||||
@@ -81,7 +82,7 @@ func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, true, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
resultsCh := make(chan *quicktemplate.ByteBuffer)
|
||||
@@ -104,7 +105,7 @@ func FederateHandler(startTime time.Time, w http.ResponseWriter, r *http.Request
|
||||
|
||||
err = <-doneCh
|
||||
if err != nil {
|
||||
return fmt.Errorf("error during data fetching: %s", err)
|
||||
return fmt.Errorf("error during data fetching: %w", err)
|
||||
}
|
||||
federateDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -116,7 +117,7 @@ var federateDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/fe
|
||||
func ExportHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
|
||||
ct := currentTime()
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse request form values: %s", err)
|
||||
return fmt.Errorf("cannot parse request form values: %w", err)
|
||||
}
|
||||
matches := r.Form["match[]"]
|
||||
if len(matches) == 0 {
|
||||
@@ -142,7 +143,7 @@ func ExportHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
end = start + defaultStep
|
||||
}
|
||||
if err := exportHandler(w, matches, start, end, format, maxRowsPerLine, deadline); err != nil {
|
||||
return fmt.Errorf("error when exporting data for queries=%q on the time range (start=%d, end=%d): %s", matches, start, end, err)
|
||||
return fmt.Errorf("error when exporting data for queries=%q on the time range (start=%d, end=%d): %w", matches, start, end, err)
|
||||
}
|
||||
exportDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -201,7 +202,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, true, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
resultsCh := make(chan *quicktemplate.ByteBuffer, runtime.GOMAXPROCS(-1))
|
||||
@@ -226,7 +227,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
|
||||
}
|
||||
err = <-doneCh
|
||||
if err != nil {
|
||||
return fmt.Errorf("error during data fetching: %s", err)
|
||||
return fmt.Errorf("error during data fetching: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -236,7 +237,7 @@ func exportHandler(w http.ResponseWriter, matches []string, start, end int64, fo
|
||||
// See https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series
|
||||
func DeleteHandler(startTime time.Time, r *http.Request) error {
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse request form values: %s", err)
|
||||
return fmt.Errorf("cannot parse request form values: %w", err)
|
||||
}
|
||||
if r.FormValue("start") != "" || r.FormValue("end") != "" {
|
||||
return fmt.Errorf("start and end aren't supported. Remove these args from the query in order to delete all the matching metrics")
|
||||
@@ -254,7 +255,7 @@ func DeleteHandler(startTime time.Time, r *http.Request) error {
|
||||
}
|
||||
deletedCount, err := netstorage.DeleteSeries(sq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot delete time series matching %q: %s", matches, err)
|
||||
return fmt.Errorf("cannot delete time series matching %q: %w", matches, err)
|
||||
}
|
||||
if deletedCount > 0 {
|
||||
promql.ResetRollupResultCache()
|
||||
@@ -272,14 +273,14 @@ func LabelValuesHandler(startTime time.Time, labelName string, w http.ResponseWr
|
||||
deadline := getDeadlineForQuery(r)
|
||||
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
return fmt.Errorf("cannot parse form values: %w", err)
|
||||
}
|
||||
var labelValues []string
|
||||
if len(r.Form["match[]"]) == 0 && len(r.Form["start"]) == 0 && len(r.Form["end"]) == 0 {
|
||||
var err error
|
||||
labelValues, err = netstorage.GetLabelValues(labelName, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf(`cannot obtain label values for %q: %s`, labelName, err)
|
||||
return fmt.Errorf(`cannot obtain label values for %q: %w`, labelName, err)
|
||||
}
|
||||
} else {
|
||||
// Extended functionality that allows filtering by label filters and time range
|
||||
@@ -301,7 +302,7 @@ func LabelValuesHandler(startTime time.Time, labelName string, w http.ResponseWr
|
||||
}
|
||||
labelValues, err = labelValuesWithMatches(labelName, matches, start, end, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot obtain label values for %q, match[]=%q, start=%d, end=%d: %s", labelName, matches, start, end, err)
|
||||
return fmt.Errorf("cannot obtain label values for %q, match[]=%q, start=%d, end=%d: %w", labelName, matches, start, end, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -342,7 +343,7 @@ func labelValuesWithMatches(labelName string, matches []string, start, end int64
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return nil, fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
m := make(map[string]struct{})
|
||||
@@ -357,7 +358,7 @@ func labelValuesWithMatches(labelName string, matches []string, start, end int64
|
||||
mLock.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error when data fetching: %s", err)
|
||||
return nil, fmt.Errorf("error when data fetching: %w", err)
|
||||
}
|
||||
|
||||
labelValues := make([]string, 0, len(m))
|
||||
@@ -375,7 +376,7 @@ func LabelsCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ
|
||||
deadline := getDeadlineForQuery(r)
|
||||
labelEntries, err := netstorage.GetLabelEntries(deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf(`cannot obtain label entries: %s`, err)
|
||||
return fmt.Errorf(`cannot obtain label entries: %w`, err)
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
WriteLabelsCountResponse(w, labelEntries)
|
||||
@@ -393,23 +394,23 @@ const secsPerDay = 3600 * 24
|
||||
func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
|
||||
deadline := getDeadlineForQuery(r)
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
return fmt.Errorf("cannot parse form values: %w", err)
|
||||
}
|
||||
date := time.Now().Unix() / secsPerDay
|
||||
date := fasttime.UnixDate()
|
||||
dateStr := r.FormValue("date")
|
||||
if len(dateStr) > 0 {
|
||||
t, err := time.Parse("2006-01-02", dateStr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse `date` arg %q: %s", dateStr, err)
|
||||
return fmt.Errorf("cannot parse `date` arg %q: %w", dateStr, err)
|
||||
}
|
||||
date = t.Unix() / secsPerDay
|
||||
date = uint64(t.Unix()) / secsPerDay
|
||||
}
|
||||
topN := 10
|
||||
topNStr := r.FormValue("topN")
|
||||
if len(topNStr) > 0 {
|
||||
n, err := strconv.Atoi(topNStr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse `topN` arg %q: %s", topNStr, err)
|
||||
return fmt.Errorf("cannot parse `topN` arg %q: %w", topNStr, err)
|
||||
}
|
||||
if n <= 0 {
|
||||
n = 1
|
||||
@@ -419,9 +420,9 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
|
||||
}
|
||||
topN = n
|
||||
}
|
||||
status, err := netstorage.GetTSDBStatusForDate(deadline, uint64(date), topN)
|
||||
status, err := netstorage.GetTSDBStatusForDate(deadline, date, topN)
|
||||
if err != nil {
|
||||
return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %s`, date, topN, err)
|
||||
return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %w`, date, topN, err)
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
WriteTSDBStatusResponse(w, status)
|
||||
@@ -438,14 +439,14 @@ func LabelsHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
deadline := getDeadlineForQuery(r)
|
||||
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
return fmt.Errorf("cannot parse form values: %w", err)
|
||||
}
|
||||
var labels []string
|
||||
if len(r.Form["match[]"]) == 0 && len(r.Form["start"]) == 0 && len(r.Form["end"]) == 0 {
|
||||
var err error
|
||||
labels, err = netstorage.GetLabels(deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot obtain labels: %s", err)
|
||||
return fmt.Errorf("cannot obtain labels: %w", err)
|
||||
}
|
||||
} else {
|
||||
// Extended functionality that allows filtering by label filters and time range
|
||||
@@ -465,7 +466,7 @@ func LabelsHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
labels, err = labelsWithMatches(matches, start, end, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot obtain labels for match[]=%q, start=%d, end=%d: %s", matches, start, end, err)
|
||||
return fmt.Errorf("cannot obtain labels for match[]=%q, start=%d, end=%d: %w", matches, start, end, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -493,7 +494,7 @@ func labelsWithMatches(matches []string, start, end int64, deadline netstorage.D
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return nil, fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
m := make(map[string]struct{})
|
||||
@@ -509,7 +510,7 @@ func labelsWithMatches(matches []string, start, end int64, deadline netstorage.D
|
||||
mLock.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error when data fetching: %s", err)
|
||||
return nil, fmt.Errorf("error when data fetching: %w", err)
|
||||
}
|
||||
|
||||
labels := make([]string, 0, len(m))
|
||||
@@ -527,7 +528,7 @@ func SeriesCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ
|
||||
deadline := getDeadlineForQuery(r)
|
||||
n, err := netstorage.GetSeriesCount(deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot obtain series count: %s", err)
|
||||
return fmt.Errorf("cannot obtain series count: %w", err)
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
WriteSeriesCountResponse(w, n)
|
||||
@@ -544,7 +545,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
ct := currentTime()
|
||||
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
return fmt.Errorf("cannot parse form values: %w", err)
|
||||
}
|
||||
matches := r.Form["match[]"]
|
||||
if len(matches) == 0 {
|
||||
@@ -579,7 +580,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
rss, err := netstorage.ProcessSearchQuery(sq, false, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot fetch data for %q: %s", sq, err)
|
||||
return fmt.Errorf("cannot fetch data for %q: %w", sq, err)
|
||||
}
|
||||
|
||||
resultsCh := make(chan *quicktemplate.ByteBuffer)
|
||||
@@ -604,7 +605,7 @@ func SeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
err = <-doneCh
|
||||
if err != nil {
|
||||
return fmt.Errorf("error during data fetching: %s", err)
|
||||
return fmt.Errorf("error during data fetching: %w", err)
|
||||
}
|
||||
seriesDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -651,17 +652,17 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
|
||||
if childQuery, windowStr, offsetStr := promql.IsMetricSelectorWithRollup(query); childQuery != "" {
|
||||
window, err := parsePositiveDuration(windowStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse window: %s", err)
|
||||
return fmt.Errorf("cannot parse window: %w", err)
|
||||
}
|
||||
offset, err := parseDuration(offsetStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse offset: %s", err)
|
||||
return fmt.Errorf("cannot parse offset: %w", err)
|
||||
}
|
||||
start -= offset
|
||||
end := start
|
||||
start = end - window
|
||||
if err := exportHandler(w, []string{childQuery}, start, end, "promapi", 0, deadline); err != nil {
|
||||
return fmt.Errorf("error when exporting data for query=%q on the time range (start=%d, end=%d): %s", childQuery, start, end, err)
|
||||
return fmt.Errorf("error when exporting data for query=%q on the time range (start=%d, end=%d): %w", childQuery, start, end, err)
|
||||
}
|
||||
queryDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -669,24 +670,24 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
|
||||
if childQuery, windowStr, stepStr, offsetStr := promql.IsRollup(query); childQuery != "" {
|
||||
newStep, err := parsePositiveDuration(stepStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse step: %s", err)
|
||||
return fmt.Errorf("cannot parse step: %w", err)
|
||||
}
|
||||
if newStep > 0 {
|
||||
step = newStep
|
||||
}
|
||||
window, err := parsePositiveDuration(windowStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse window: %s", err)
|
||||
return fmt.Errorf("cannot parse window: %w", err)
|
||||
}
|
||||
offset, err := parseDuration(offsetStr, step)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse offset: %s", err)
|
||||
return fmt.Errorf("cannot parse offset: %w", err)
|
||||
}
|
||||
start -= offset
|
||||
end := start
|
||||
start = end - window
|
||||
if err := queryRangeHandler(w, childQuery, start, end, step, r, ct); err != nil {
|
||||
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %s", childQuery, start, end, step, err)
|
||||
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %w", childQuery, start, end, step, err)
|
||||
}
|
||||
queryDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -701,7 +702,7 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
|
||||
}
|
||||
result, err := promql.Exec(&ec, query, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error when executing query=%q for (time=%d, step=%d): %s", query, start, step, err)
|
||||
return fmt.Errorf("error when executing query=%q for (time=%d, step=%d): %w", query, start, step, err)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
@@ -749,7 +750,7 @@ func QueryRangeHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
|
||||
return err
|
||||
}
|
||||
if err := queryRangeHandler(w, query, start, end, step, r, ct); err != nil {
|
||||
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %s", query, start, end, step, err)
|
||||
return fmt.Errorf("error when executing query=%q on the time range (start=%d, end=%d, step=%d): %w", query, start, end, step, err)
|
||||
}
|
||||
queryRangeDuration.UpdateDuration(startTime)
|
||||
return nil
|
||||
@@ -787,7 +788,7 @@ func queryRangeHandler(w http.ResponseWriter, query string, start, end, step int
|
||||
}
|
||||
result, err := promql.Exec(&ec, query, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot execute query: %s", err)
|
||||
return fmt.Errorf("cannot execute query: %w", err)
|
||||
}
|
||||
queryOffset := getLatencyOffsetMilliseconds()
|
||||
if ct-end < queryOffset {
|
||||
@@ -896,7 +897,7 @@ func getTime(r *http.Request, argKey string, defaultValue int64) (int64, error)
|
||||
// Try parsing duration relative to the current time
|
||||
d, err1 := time.ParseDuration(argValue)
|
||||
if err1 != nil {
|
||||
return 0, fmt.Errorf("cannot parse %q=%q: %s", argKey, argValue, err)
|
||||
return 0, fmt.Errorf("cannot parse %q=%q: %w", argKey, argValue, err)
|
||||
}
|
||||
if d > 0 {
|
||||
d = -d
|
||||
@@ -938,7 +939,7 @@ func getDuration(r *http.Request, argKey string, defaultValue int64) (int64, err
|
||||
// Try parsing string format
|
||||
d, err := time.ParseDuration(argValue)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("cannot parse %q=%q: %s", argKey, argValue, err)
|
||||
return 0, fmt.Errorf("cannot parse %q=%q: %w", argKey, argValue, err)
|
||||
}
|
||||
secs = d.Seconds()
|
||||
}
|
||||
@@ -992,7 +993,7 @@ func getBool(r *http.Request, argKey string) bool {
|
||||
}
|
||||
|
||||
func currentTime() int64 {
|
||||
return int64(time.Now().UTC().Unix()) * 1e3
|
||||
return int64(fasttime.UnixTimestamp() * 1000)
|
||||
}
|
||||
|
||||
func getTagFilterssFromMatches(matches []string) ([][]storage.TagFilter, error) {
|
||||
@@ -1000,7 +1001,7 @@ func getTagFilterssFromMatches(matches []string) ([][]storage.TagFilter, error)
|
||||
for _, match := range matches {
|
||||
tagFilters, err := promql.ParseMetricSelector(match)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse %q: %s", match, err)
|
||||
return nil, fmt.Errorf("cannot parse %q: %w", match, err)
|
||||
}
|
||||
tagFilterss = append(tagFilterss, tagFilters)
|
||||
}
|
||||
|
||||
@@ -43,6 +43,8 @@ var aggrFuncs = map[string]aggrFunc{
|
||||
"bottomk_max": newAggrFuncRangeTopK(maxValue, true),
|
||||
"bottomk_avg": newAggrFuncRangeTopK(avgValue, true),
|
||||
"bottomk_median": newAggrFuncRangeTopK(medianValue, true),
|
||||
"any": newAggrFunc(aggrFuncAny),
|
||||
"outliersk": aggrFuncOutliersK,
|
||||
}
|
||||
|
||||
type aggrFunc func(afa *aggrFuncArg) ([]*timeseries, error)
|
||||
@@ -64,7 +66,7 @@ func newAggrFunc(afe func(tss []*timeseries) []*timeseries) aggrFunc {
|
||||
if err := expectTransformArgsNum(args, 1); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return aggrFuncExt(afe, args[0], &afa.ae.Modifier, false)
|
||||
return aggrFuncExt(afe, args[0], &afa.ae.Modifier, afa.ae.Limit, false)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,7 +82,7 @@ func removeGroupTags(metricName *storage.MetricName, modifier *metricsql.Modifie
|
||||
}
|
||||
}
|
||||
|
||||
func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeseries, modifier *metricsql.ModifierExpr, keepOriginal bool) ([]*timeseries, error) {
|
||||
func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeseries, modifier *metricsql.ModifierExpr, maxSeries int, keepOriginal bool) ([]*timeseries, error) {
|
||||
arg := copyTimeseriesMetricNames(argOrig)
|
||||
|
||||
// Perform grouping.
|
||||
@@ -92,7 +94,13 @@ func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeserie
|
||||
if keepOriginal {
|
||||
ts = argOrig[i]
|
||||
}
|
||||
m[string(bb.B)] = append(m[string(bb.B)], ts)
|
||||
tss := m[string(bb.B)]
|
||||
if tss == nil && maxSeries > 0 && len(m) >= maxSeries {
|
||||
// We already reached time series limit after grouping. Skip other time series.
|
||||
continue
|
||||
}
|
||||
tss = append(tss, ts)
|
||||
m[string(bb.B)] = tss
|
||||
}
|
||||
bbPool.Put(bb)
|
||||
|
||||
@@ -112,6 +120,10 @@ func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeserie
|
||||
return rvs, nil
|
||||
}
|
||||
|
||||
func aggrFuncAny(tss []*timeseries) []*timeseries {
|
||||
return tss[:1]
|
||||
}
|
||||
|
||||
func aggrFuncSum(tss []*timeseries) []*timeseries {
|
||||
if len(tss) == 1 {
|
||||
// Fast path - nothing to sum.
|
||||
@@ -441,7 +453,7 @@ func aggrFuncCountValues(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
return rvs
|
||||
}
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, false)
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, false)
|
||||
}
|
||||
|
||||
func newAggrFuncTopK(isReverse bool) aggrFunc {
|
||||
@@ -468,15 +480,10 @@ func newAggrFuncTopK(isReverse bool) aggrFunc {
|
||||
}
|
||||
return removeNaNs(tss)
|
||||
}
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, true)
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
|
||||
}
|
||||
}
|
||||
|
||||
type tsWithValue struct {
|
||||
ts *timeseries
|
||||
value float64
|
||||
}
|
||||
|
||||
func newAggrFuncRangeTopK(f func(values []float64) float64, isReverse bool) aggrFunc {
|
||||
return func(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
args := afa.args
|
||||
@@ -488,34 +495,42 @@ func newAggrFuncRangeTopK(f func(values []float64) float64, isReverse bool) aggr
|
||||
return nil, err
|
||||
}
|
||||
afe := func(tss []*timeseries) []*timeseries {
|
||||
maxs := make([]tsWithValue, len(tss))
|
||||
for i, ts := range tss {
|
||||
value := f(ts.Values)
|
||||
maxs[i] = tsWithValue{
|
||||
ts: ts,
|
||||
value: value,
|
||||
}
|
||||
}
|
||||
sort.Slice(maxs, func(i, j int) bool {
|
||||
a := maxs[i].value
|
||||
b := maxs[j].value
|
||||
if isReverse {
|
||||
a, b = b, a
|
||||
}
|
||||
return lessWithNaNs(a, b)
|
||||
})
|
||||
for i := range maxs {
|
||||
tss[i] = maxs[i].ts
|
||||
}
|
||||
for i, k := range ks {
|
||||
fillNaNsAtIdx(i, k, tss)
|
||||
}
|
||||
return removeNaNs(tss)
|
||||
return getRangeTopKTimeseries(tss, ks, f, isReverse)
|
||||
}
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, true)
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
|
||||
}
|
||||
}
|
||||
|
||||
func getRangeTopKTimeseries(tss []*timeseries, ks []float64, f func(values []float64) float64, isReverse bool) []*timeseries {
|
||||
type tsWithValue struct {
|
||||
ts *timeseries
|
||||
value float64
|
||||
}
|
||||
maxs := make([]tsWithValue, len(tss))
|
||||
for i, ts := range tss {
|
||||
value := f(ts.Values)
|
||||
maxs[i] = tsWithValue{
|
||||
ts: ts,
|
||||
value: value,
|
||||
}
|
||||
}
|
||||
sort.Slice(maxs, func(i, j int) bool {
|
||||
a := maxs[i].value
|
||||
b := maxs[j].value
|
||||
if isReverse {
|
||||
a, b = b, a
|
||||
}
|
||||
return lessWithNaNs(a, b)
|
||||
})
|
||||
for i := range maxs {
|
||||
tss[i] = maxs[i].ts
|
||||
}
|
||||
for i, k := range ks {
|
||||
fillNaNsAtIdx(i, k, tss)
|
||||
}
|
||||
return removeNaNs(tss)
|
||||
}
|
||||
|
||||
func fillNaNsAtIdx(idx int, k float64, tss []*timeseries) {
|
||||
if math.IsNaN(k) {
|
||||
k = 0
|
||||
@@ -577,16 +592,54 @@ func avgValue(values []float64) float64 {
|
||||
func medianValue(values []float64) float64 {
|
||||
h := histogram.GetFast()
|
||||
for _, v := range values {
|
||||
if math.IsNaN(v) {
|
||||
continue
|
||||
if !math.IsNaN(v) {
|
||||
h.Update(v)
|
||||
}
|
||||
h.Update(v)
|
||||
}
|
||||
value := h.Quantile(0.5)
|
||||
histogram.PutFast(h)
|
||||
return value
|
||||
}
|
||||
|
||||
func aggrFuncOutliersK(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
args := afa.args
|
||||
if err := expectTransformArgsNum(args, 2); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ks, err := getScalar(args[0], 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
afe := func(tss []*timeseries) []*timeseries {
|
||||
// Calculate medians for each point across tss.
|
||||
medians := make([]float64, len(ks))
|
||||
h := histogram.GetFast()
|
||||
for n := range ks {
|
||||
h.Reset()
|
||||
for j := range tss {
|
||||
v := tss[j].Values[n]
|
||||
if !math.IsNaN(v) {
|
||||
h.Update(v)
|
||||
}
|
||||
}
|
||||
medians[n] = h.Quantile(0.5)
|
||||
}
|
||||
histogram.PutFast(h)
|
||||
|
||||
// Return topK time series with the highest variance from median.
|
||||
f := func(values []float64) float64 {
|
||||
sum2 := float64(0)
|
||||
for n, v := range values {
|
||||
d := v - medians[n]
|
||||
sum2 += d * d
|
||||
}
|
||||
return sum2
|
||||
}
|
||||
return getRangeTopKTimeseries(tss, ks, f, false)
|
||||
}
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
|
||||
}
|
||||
|
||||
func aggrFuncLimitK(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
args := afa.args
|
||||
if err := expectTransformArgsNum(args, 2); err != nil {
|
||||
@@ -618,7 +671,7 @@ func aggrFuncLimitK(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
return tss
|
||||
}
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, true)
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
|
||||
}
|
||||
|
||||
func aggrFuncQuantile(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
@@ -631,7 +684,7 @@ func aggrFuncQuantile(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
return nil, err
|
||||
}
|
||||
afe := newAggrQuantileFunc(phis)
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, false)
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, false)
|
||||
}
|
||||
|
||||
func aggrFuncMedian(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
@@ -641,30 +694,24 @@ func aggrFuncMedian(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
phis := evalNumber(afa.ec, 0.5)[0].Values
|
||||
afe := newAggrQuantileFunc(phis)
|
||||
return aggrFuncExt(afe, args[0], &afa.ae.Modifier, false)
|
||||
return aggrFuncExt(afe, args[0], &afa.ae.Modifier, afa.ae.Limit, false)
|
||||
}
|
||||
|
||||
func newAggrQuantileFunc(phis []float64) func(tss []*timeseries) []*timeseries {
|
||||
return func(tss []*timeseries) []*timeseries {
|
||||
dst := tss[0]
|
||||
h := histogram.GetFast()
|
||||
defer histogram.PutFast(h)
|
||||
for n := range dst.Values {
|
||||
sort.Slice(tss, func(i, j int) bool {
|
||||
a := tss[i].Values[n]
|
||||
b := tss[j].Values[n]
|
||||
return lessWithNaNs(a, b)
|
||||
})
|
||||
h.Reset()
|
||||
for j := range tss {
|
||||
v := tss[j].Values[n]
|
||||
if !math.IsNaN(v) {
|
||||
h.Update(v)
|
||||
}
|
||||
}
|
||||
phi := phis[n]
|
||||
if math.IsNaN(phi) {
|
||||
phi = 1
|
||||
}
|
||||
if phi < 0 {
|
||||
phi = 0
|
||||
}
|
||||
if phi > 1 {
|
||||
phi = 1
|
||||
}
|
||||
idx := int(math.Round(float64(len(tss)-1) * phi))
|
||||
dst.Values[n] = tss[idx].Values[n]
|
||||
dst.Values[n] = h.Quantile(phi)
|
||||
}
|
||||
tss[0] = dst
|
||||
return tss[:1]
|
||||
|
||||
@@ -48,6 +48,11 @@ var incrementalAggrFuncCallbacksMap = map[string]*incrementalAggrFuncCallbacks{
|
||||
mergeAggrFunc: mergeAggrGeomean,
|
||||
finalizeAggrFunc: finalizeAggrGeomean,
|
||||
},
|
||||
"any": {
|
||||
updateAggrFunc: updateAggrAny,
|
||||
mergeAggrFunc: mergeAggrAny,
|
||||
finalizeAggrFunc: finalizeAggrCommon,
|
||||
},
|
||||
}
|
||||
|
||||
type incrementalAggrFuncContext struct {
|
||||
@@ -81,6 +86,10 @@ func (iafc *incrementalAggrFuncContext) updateTimeseries(ts *timeseries, workerI
|
||||
bb.B = marshalMetricNameSorted(bb.B[:0], &ts.MetricName)
|
||||
iac := m[string(bb.B)]
|
||||
if iac == nil {
|
||||
if iafc.ae.Limit > 0 && len(m) >= iafc.ae.Limit {
|
||||
// Skip this time series, since the limit on the number of output time series has been already reached.
|
||||
return
|
||||
}
|
||||
tsAggr := ×eries{
|
||||
Values: make([]float64, len(ts.Values)),
|
||||
Timestamps: ts.Timestamps,
|
||||
@@ -106,6 +115,10 @@ func (iafc *incrementalAggrFuncContext) finalizeTimeseries() []*timeseries {
|
||||
for k, iac := range m {
|
||||
iacGlobal := mGlobal[k]
|
||||
if iacGlobal == nil {
|
||||
if iafc.ae.Limit > 0 && len(mGlobal) >= iafc.ae.Limit {
|
||||
// Skip this time series, since the limit on the number of output time series has been already reached.
|
||||
continue
|
||||
}
|
||||
mGlobal[k] = iac
|
||||
continue
|
||||
}
|
||||
@@ -450,3 +463,25 @@ func finalizeAggrGeomean(iac *incrementalAggrContext) {
|
||||
dstValues[i] = math.Pow(dstValues[i], 1/v)
|
||||
}
|
||||
}
|
||||
|
||||
func updateAggrAny(iac *incrementalAggrContext, values []float64) {
|
||||
dstCounts := iac.values
|
||||
if dstCounts[0] > 0 {
|
||||
return
|
||||
}
|
||||
for i := range values {
|
||||
dstCounts[i] = 1
|
||||
}
|
||||
iac.ts.Values = append(iac.ts.Values[:0], values...)
|
||||
}
|
||||
|
||||
func mergeAggrAny(dst, src *incrementalAggrContext) {
|
||||
srcValues := src.ts.Values
|
||||
srcCounts := src.values
|
||||
dstCounts := dst.values
|
||||
if dstCounts[0] > 0 {
|
||||
return
|
||||
}
|
||||
dstCounts[0] = srcCounts[0]
|
||||
dst.ts.Values = append(dst.ts.Values[:0], srcValues...)
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ func testIncrementalParallelAggr(iafc *incrementalAggrFuncContext, tssSrc, tssEx
|
||||
wg.Wait()
|
||||
tssActual := iafc.finalizeTimeseries()
|
||||
if err := expectTimeseriesEqual(tssActual, tssExpected); err != nil {
|
||||
return fmt.Errorf("%s; tssActual=%v, tssExpected=%v", err, tssActual, tssExpected)
|
||||
return fmt.Errorf("%w; tssActual=%v, tssExpected=%v", err, tssActual, tssExpected)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -164,7 +164,7 @@ func expectTsEqual(actual, expected *timeseries) error {
|
||||
return fmt.Errorf("unexpected timestamps; got %v; want %v", actual.Timestamps, expected.Timestamps)
|
||||
}
|
||||
if err := compareValues(actual.Values, expected.Values); err != nil {
|
||||
return fmt.Errorf("%s; actual %v; expected %v", err, actual.Values, expected.Values)
|
||||
return fmt.Errorf("%w; actual %v; expected %v", err, actual.Values, expected.Values)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -206,7 +206,7 @@ func groupJoin(singleTimeseriesSide string, be *metricsql.BinaryOpExpr, rvsLeft,
|
||||
resetMetricGroupIfRequired(be, tsLeft)
|
||||
if len(tssRight) == 1 {
|
||||
// Easy case - right part contains only a single matching time series.
|
||||
tsLeft.MetricName.AddMissingTags(joinTags, &tssRight[0].MetricName)
|
||||
tsLeft.MetricName.SetTags(joinTags, &tssRight[0].MetricName)
|
||||
rvsLeft = append(rvsLeft, tsLeft)
|
||||
rvsRight = append(rvsRight, tssRight[0])
|
||||
continue
|
||||
@@ -225,7 +225,7 @@ func groupJoin(singleTimeseriesSide string, be *metricsql.BinaryOpExpr, rvsLeft,
|
||||
for _, tsRight := range tssRight {
|
||||
var tsCopy timeseries
|
||||
tsCopy.CopyFromShallowTimestamps(tsLeft)
|
||||
tsCopy.MetricName.AddMissingTags(joinTags, &tsRight.MetricName)
|
||||
tsCopy.MetricName.SetTags(joinTags, &tsRight.MetricName)
|
||||
bb.B = marshalMetricTagsSorted(bb.B[:0], &tsCopy.MetricName)
|
||||
if tsExisting := m[string(bb.B)]; tsExisting != nil {
|
||||
// Try merging tsExisting with tsRight if they don't overlap.
|
||||
@@ -310,9 +310,22 @@ func binaryOpOr(bfa *binaryOpFuncArg) ([]*timeseries, error) {
|
||||
for _, tss := range mLeft {
|
||||
rvs = append(rvs, tss...)
|
||||
}
|
||||
for k, tss := range mRight {
|
||||
if mLeft[k] == nil {
|
||||
rvs = append(rvs, tss...)
|
||||
for k, tssRight := range mRight {
|
||||
tssLeft := mLeft[k]
|
||||
if tssLeft == nil {
|
||||
rvs = append(rvs, tssRight...)
|
||||
continue
|
||||
}
|
||||
// Fill gaps in tssLeft with values from tssRight as Prometheus does.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/552
|
||||
valuesRight := tssRight[0].Values
|
||||
for _, tsLeft := range tssLeft {
|
||||
valuesLeft := tsLeft.Values
|
||||
for i, v := range valuesLeft {
|
||||
if math.IsNaN(v) {
|
||||
valuesLeft[i] = valuesRight[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return rvs, nil
|
||||
|
||||
@@ -160,14 +160,14 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, e, re, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, me.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, me.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
if re, ok := e.(*metricsql.RollupExpr); ok {
|
||||
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, e, re, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, re.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, re.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -189,7 +189,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := tf(tfa)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, fe.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, fe.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -203,7 +203,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := evalRollupFunc(ec, fe.Name, rf, e, re, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, fe.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, fe.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -240,7 +240,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := af(afa)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, ae.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, ae.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -264,7 +264,7 @@ func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
|
||||
}
|
||||
rv, err := bf(bfa)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %s`, be.AppendString(nil), err)
|
||||
return nil, fmt.Errorf(`cannot evaluate %q: %w`, be.AppendString(nil), err)
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
@@ -375,7 +375,7 @@ func evalRollupFuncArgs(ec *EvalConfig, fe *metricsql.FuncExpr) ([]interface{},
|
||||
}
|
||||
ts, err := evalExpr(ec, arg)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("cannot evaluate arg #%d for %q: %s", i+1, fe.AppendString(nil), err)
|
||||
return nil, nil, fmt.Errorf("cannot evaluate arg #%d for %q: %w", i+1, fe.AppendString(nil), err)
|
||||
}
|
||||
args[i] = ts
|
||||
}
|
||||
@@ -663,12 +663,17 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc,
|
||||
pointsPerTimeseries := 1 + (ec.End-ec.Start)/ec.Step
|
||||
timeseriesLen := rssLen
|
||||
if iafc != nil {
|
||||
// Incremental aggregates require hold only GOMAXPROCS timeseries in memory.
|
||||
// Incremental aggregates require holding only GOMAXPROCS timeseries in memory.
|
||||
timeseriesLen = runtime.GOMAXPROCS(-1)
|
||||
if iafc.ae.Modifier.Op != "" {
|
||||
// Increase the number of timeseries for non-empty group list: `aggr() by (something)`,
|
||||
// since each group can have own set of time series in memory.
|
||||
timeseriesLen *= 1000
|
||||
if iafc.ae.Limit > 0 {
|
||||
// There is an explicit limit on the number of output time series.
|
||||
timeseriesLen *= iafc.ae.Limit
|
||||
} else {
|
||||
// Increase the number of timeseries for non-empty group list: `aggr() by (something)`,
|
||||
// since each group can have own set of time series in memory.
|
||||
timeseriesLen *= 1000
|
||||
}
|
||||
}
|
||||
// The maximum number of output time series is limited by rssLen.
|
||||
if timeseriesLen > rssLen {
|
||||
|
||||
@@ -1480,7 +1480,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`label_replace(mismatch)`, func(t *testing.T) {
|
||||
t.Run(`label_replace(nonexisting_src)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `label_replace(time(), "__name__", "x${1}y", "foo", ".+")`
|
||||
r := netstorage.Result{
|
||||
@@ -1491,6 +1491,21 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`label_replace(mismatch)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `label_replace(label_set(time(), "foo", "foobar"), "__name__", "x${1}y", "foo", "bar(.+)")`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("foobar"),
|
||||
}}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`label_replace(match)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `label_replace(time(), "__name__", "x${1}y", "foo", ".*")`
|
||||
@@ -1849,6 +1864,17 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`scalar or scalar`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `time() > 1400 or 123`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{123, 123, 123, 1600, 1800, 2000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`timseries-with-tags unless 2`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `label_set(time(), "foo", "bar") unless 2`
|
||||
@@ -1988,25 +2014,37 @@ func TestExecSuccess(t *testing.T) {
|
||||
})
|
||||
t.Run(`scalar * ignoring(foo) group_right vector`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sort_desc(2 * ignoring(foo) group_right(a,foo) (label_set(time(), "foo", "bar") or label_set(10, "foo", "qwert")))`
|
||||
q := `sort_desc(label_set(2, "a", "2") * ignoring(foo,a) group_right(a) (label_set(time(), "foo", "bar", "a", "1"), label_set(10, "foo", "qwert")))`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{2000, 2400, 2800, 3200, 3600, 4000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r1.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("bar"),
|
||||
}}
|
||||
r1.MetricName.Tags = []storage.Tag{
|
||||
{
|
||||
Key: []byte("a"),
|
||||
Value: []byte("2"),
|
||||
},
|
||||
{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("bar"),
|
||||
},
|
||||
}
|
||||
r2 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{20, 20, 20, 20, 20, 20},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r2.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("qwert"),
|
||||
}}
|
||||
r2.MetricName.Tags = []storage.Tag{
|
||||
{
|
||||
Key: []byte("a"),
|
||||
Value: []byte("2"),
|
||||
},
|
||||
{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("qwert"),
|
||||
},
|
||||
}
|
||||
resultExpected := []netstorage.Result{r1, r2}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
@@ -2321,9 +2359,9 @@ func TestExecSuccess(t *testing.T) {
|
||||
t.Run(`vector + vector on group_left matching`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sort_desc(
|
||||
(label_set(time(), "t1", "v123", "t2", "v3") or label_set(10, "t2", "v3", "xxx", "yy"))
|
||||
(label_set(time(), "t1", "v123", "t2", "v3"), label_set(10, "t2", "v3", "xxx", "yy"))
|
||||
+ on (foo, t2) group_left (t1, noxxx)
|
||||
(label_set(100, "t1", "v1") or label_set(time(), "t2", "v3", "noxxx", "aa"))
|
||||
(label_set(100, "t1", "v1"), label_set(time(), "t2", "v3", "noxxx", "aa"))
|
||||
)`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
@@ -2335,10 +2373,6 @@ func TestExecSuccess(t *testing.T) {
|
||||
Key: []byte("noxxx"),
|
||||
Value: []byte("aa"),
|
||||
},
|
||||
{
|
||||
Key: []byte("t1"),
|
||||
Value: []byte("v123"),
|
||||
},
|
||||
{
|
||||
Key: []byte("t2"),
|
||||
Value: []byte("v3"),
|
||||
@@ -3496,6 +3530,21 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r1, r2}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`sum(multi-vector) by (known-tag) limit 1`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sum(label_set(10, "foo", "bar") or label_set(time()/100, "baz", "sss")) by (foo) limit 1`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{10, 10, 10, 10, 10, 10},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("bar"),
|
||||
}}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`sum(multi-vector) by (known-tags)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sum(label_set(10, "foo", "bar", "baz", "sss", "x", "y") or label_set(time()/100, "baz", "sss", "foo", "bar")) by (foo, baz, foo)`
|
||||
@@ -3562,7 +3611,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
q := `sort(histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s]))`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{14, 15, 12, 13, 15, 11},
|
||||
Values: []float64{14, 16, 12, 13, 15, 11},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r1.MetricName.Tags = []storage.Tag{
|
||||
@@ -3592,7 +3641,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
}
|
||||
r3 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{13, 11, 16, 19, 13, 16},
|
||||
Values: []float64{13, 10, 16, 19, 13, 16},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r3.MetricName.Tags = []storage.Tag{
|
||||
@@ -3613,7 +3662,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
q := `sort(sum(histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s])) by (vmrange))`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{14, 15, 12, 13, 15, 11},
|
||||
Values: []float64{14, 16, 12, 13, 15, 11},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r1.MetricName.Tags = []storage.Tag{
|
||||
@@ -3635,7 +3684,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
}
|
||||
r3 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{13, 11, 16, 19, 13, 16},
|
||||
Values: []float64{13, 10, 16, 19, 13, 16},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r3.MetricName.Tags = []storage.Tag{
|
||||
@@ -3663,7 +3712,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
q := `topk_max(1, histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s]))`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{13, 11, 16, 19, 13, 16},
|
||||
Values: []float64{13, 10, 16, 19, 13, 16},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r.MetricName.Tags = []storage.Tag{
|
||||
@@ -3768,6 +3817,17 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r1, r2}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`any()`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `any(label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{10, 10, 10, 10, 10, 10},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`topk(-1)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sort(topk(-1, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss")))`
|
||||
@@ -4170,14 +4230,63 @@ func TestExecSuccess(t *testing.T) {
|
||||
t.Run(`quantile(NaN)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `quantile(NaN, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
|
||||
resultExpected := []netstorage.Result{}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`outliersk(0)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `outliersk(0, (
|
||||
label_set(1300, "foo", "bar"),
|
||||
label_set(time(), "baz", "sss"),
|
||||
))`
|
||||
resultExpected := []netstorage.Result{}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`outliersk(1)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `outliersk(1, (
|
||||
label_set(2000, "foo", "bar"),
|
||||
label_set(time(), "baz", "sss"),
|
||||
))`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{10, 10, 10, 10.666666666666666, 12, 13.333333333333334},
|
||||
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("baz"),
|
||||
Value: []byte("sss"),
|
||||
}}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`outliersk(3)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sort_desc(outliersk(3, (
|
||||
label_set(1300, "foo", "bar"),
|
||||
label_set(time(), "baz", "sss"),
|
||||
)))`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r1.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("baz"),
|
||||
Value: []byte("sss"),
|
||||
}}
|
||||
r2 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{1300, 1300, 1300, 1300, 1300, 1300},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r2.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("bar"),
|
||||
}}
|
||||
resultExpected := []netstorage.Result{r1, r2}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`range_quantile(0.5)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `range_quantile(0.5, time())`
|
||||
@@ -5505,6 +5614,8 @@ func TestExecError(t *testing.T) {
|
||||
f(`hoeffding_bound_upper()`)
|
||||
f(`hoeffding_bound_upper(1)`)
|
||||
f(`hoeffding_bound_upper(0.99, foo, 1)`)
|
||||
f(`outliersk()`)
|
||||
f(`outliersk(1)`)
|
||||
|
||||
// Invalid argument type
|
||||
f(`median_over_time({}, 2)`)
|
||||
@@ -5544,6 +5655,7 @@ func TestExecError(t *testing.T) {
|
||||
f(`alias(1, 2)`)
|
||||
f(`aggr_over_time(1, 2)`)
|
||||
f(`aggr_over_time(("foo", "bar"), 3)`)
|
||||
f(`outliersk((label_set(1, "foo", "bar"), label_set(2, "x", "y")), 123)`)
|
||||
|
||||
// Duplicate timeseries
|
||||
f(`(label_set(1, "foo", "bar") or label_set(2, "foo", "baz"))
|
||||
|
||||
@@ -72,6 +72,8 @@ var rollupFuncs = map[string]newRollupFunc{
|
||||
"aggr_over_time": newRollupFuncTwoArgs(rollupFake),
|
||||
"hoeffding_bound_upper": newRollupHoeffdingBoundUpper,
|
||||
"hoeffding_bound_lower": newRollupHoeffdingBoundLower,
|
||||
"ascent_over_time": newRollupFuncOneArg(rollupAscentOverTime),
|
||||
"descent_over_time": newRollupFuncOneArg(rollupDescentOverTime),
|
||||
|
||||
// `timestamp` function must return timestamp for the last datapoint on the current window
|
||||
// in order to properly handle offset and timestamps unaligned to the current step.
|
||||
@@ -116,6 +118,9 @@ var rollupAggrFuncs = map[string]rollupFunc{
|
||||
"scrape_interval": rollupScrapeInterval,
|
||||
"tmin_over_time": rollupTmin,
|
||||
"tmax_over_time": rollupTmax,
|
||||
"ascent_over_time": rollupAscentOverTime,
|
||||
"descent_over_time": rollupDescentOverTime,
|
||||
"timestamp": rollupTimestamp,
|
||||
}
|
||||
|
||||
var rollupFuncsCannotAdjustWindow = map[string]bool{
|
||||
@@ -138,6 +143,8 @@ var rollupFuncsCannotAdjustWindow = map[string]bool{
|
||||
"increases_over_time": true,
|
||||
"decreases_over_time": true,
|
||||
"integrate": true,
|
||||
"ascent_over_time": true,
|
||||
"descent_over_time": true,
|
||||
}
|
||||
|
||||
var rollupFuncsRemoveCounterResets = map[string]bool{
|
||||
@@ -278,7 +285,7 @@ func getRollupConfigs(name string, rf rollupFunc, expr metricsql.Expr, start, en
|
||||
case "aggr_over_time":
|
||||
aggrFuncNames, err := getRollupAggrFuncNames(expr)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("invalid args to %s: %s", expr.AppendString(nil), err)
|
||||
return nil, nil, fmt.Errorf("invalid args to %s: %w", expr.AppendString(nil), err)
|
||||
}
|
||||
for _, aggrFuncName := range aggrFuncNames {
|
||||
if rollupFuncsRemoveCounterResets[aggrFuncName] {
|
||||
@@ -1527,6 +1534,52 @@ func rollupTimestamp(rfa *rollupFuncArg) float64 {
|
||||
return float64(timestamps[len(timestamps)-1]) / 1e3
|
||||
}
|
||||
|
||||
func rollupAscentOverTime(rfa *rollupFuncArg) float64 {
|
||||
// There is no need in handling NaNs here, since they must be cleaned up
|
||||
// before calling rollup funcs.
|
||||
values := rfa.values
|
||||
prevValue := rfa.prevValue
|
||||
if math.IsNaN(prevValue) {
|
||||
if len(values) == 0 {
|
||||
return nan
|
||||
}
|
||||
prevValue = values[0]
|
||||
values = values[1:]
|
||||
}
|
||||
s := float64(0)
|
||||
for _, v := range values {
|
||||
d := v - prevValue
|
||||
if d > 0 {
|
||||
s += d
|
||||
}
|
||||
prevValue = v
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func rollupDescentOverTime(rfa *rollupFuncArg) float64 {
|
||||
// There is no need in handling NaNs here, since they must be cleaned up
|
||||
// before calling rollup funcs.
|
||||
values := rfa.values
|
||||
prevValue := rfa.prevValue
|
||||
if math.IsNaN(prevValue) {
|
||||
if len(values) == 0 {
|
||||
return nan
|
||||
}
|
||||
prevValue = values[0]
|
||||
values = values[1:]
|
||||
}
|
||||
s := float64(0)
|
||||
for _, v := range values {
|
||||
d := prevValue - v
|
||||
if d > 0 {
|
||||
s += d
|
||||
}
|
||||
prevValue = v
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func rollupFirst(rfa *rollupFuncArg) float64 {
|
||||
// There is no need in handling NaNs here, since they must be cleaned up
|
||||
// before calling rollup funcs.
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
|
||||
@@ -64,18 +65,18 @@ func InitRollupResultCache(cachePath string) {
|
||||
|
||||
stats := &fastcache.Stats{}
|
||||
var statsLock sync.Mutex
|
||||
var statsLastUpdate time.Time
|
||||
var statsLastUpdate uint64
|
||||
fcs := func() *fastcache.Stats {
|
||||
statsLock.Lock()
|
||||
defer statsLock.Unlock()
|
||||
|
||||
if time.Since(statsLastUpdate) < time.Second {
|
||||
if fasttime.UnixTimestamp()-statsLastUpdate < 2 {
|
||||
return stats
|
||||
}
|
||||
var fcs fastcache.Stats
|
||||
c.UpdateStats(&fcs)
|
||||
stats = &fcs
|
||||
statsLastUpdate = time.Now()
|
||||
statsLastUpdate = fasttime.UnixTimestamp()
|
||||
return stats
|
||||
}
|
||||
if len(rollupResultCachePath) > 0 {
|
||||
@@ -285,7 +286,7 @@ var (
|
||||
var buf [8]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
|
||||
// do not use logger.Panicf, since it isn't initialized yet.
|
||||
panic(fmt.Errorf("FATAL: cannot read random data for rollupResultCacheKeyPrefix: %s", err))
|
||||
panic(fmt.Errorf("FATAL: cannot read random data for rollupResultCacheKeyPrefix: %w", err))
|
||||
}
|
||||
return encoding.UnmarshalUint64(buf[:])
|
||||
}()
|
||||
@@ -413,7 +414,7 @@ func (mi *rollupResultCacheMetainfo) Unmarshal(src []byte) error {
|
||||
for i := 0; i < entriesLen; i++ {
|
||||
tail, err := mi.entries[i].Unmarshal(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal entry #%d: %s", i, err)
|
||||
return fmt.Errorf("cannot unmarshal entry #%d: %w", i, err)
|
||||
}
|
||||
src = tail
|
||||
}
|
||||
|
||||
@@ -389,6 +389,9 @@ func TestRollupNewRollupFuncSuccess(t *testing.T) {
|
||||
f("ideriv", 0)
|
||||
f("decreases_over_time", 5)
|
||||
f("increases_over_time", 5)
|
||||
f("ascent_over_time", 142)
|
||||
f("descent_over_time", 231)
|
||||
f("timestamp", 0.13)
|
||||
}
|
||||
|
||||
func TestRollupNewRollupFuncError(t *testing.T) {
|
||||
|
||||
@@ -217,7 +217,7 @@ func (ts *timeseries) unmarshalFastNoTimestamps(src []byte) ([]byte, error) {
|
||||
|
||||
tail, err := unmarshalMetricNameFast(&ts.MetricName, src)
|
||||
if err != nil {
|
||||
return tail, fmt.Errorf("cannot unmarshal MetricName: %s", err)
|
||||
return tail, fmt.Errorf("cannot unmarshal MetricName: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
@@ -275,7 +275,7 @@ func unmarshalMetricNameFast(mn *storage.MetricName, src []byte) ([]byte, error)
|
||||
|
||||
tail, metricGroup, err := unmarshalBytesFast(src)
|
||||
if err != nil {
|
||||
return tail, fmt.Errorf("cannot unmarshal MetricGroup: %s", err)
|
||||
return tail, fmt.Errorf("cannot unmarshal MetricGroup: %w", err)
|
||||
}
|
||||
src = tail
|
||||
mn.MetricGroup = metricGroup[:len(metricGroup):len(metricGroup)]
|
||||
@@ -292,13 +292,13 @@ func unmarshalMetricNameFast(mn *storage.MetricName, src []byte) ([]byte, error)
|
||||
for i := range mn.Tags {
|
||||
tail, key, err := unmarshalBytesFast(src)
|
||||
if err != nil {
|
||||
return tail, fmt.Errorf("cannot unmarshal key for tag[%d]: %s", i, err)
|
||||
return tail, fmt.Errorf("cannot unmarshal key for tag[%d]: %w", i, err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
tail, value, err := unmarshalBytesFast(src)
|
||||
if err != nil {
|
||||
return tail, fmt.Errorf("cannot unmarshal value for tag[%d]: %s", i, err)
|
||||
return tail, fmt.Errorf("cannot unmarshal value for tag[%d]: %w", i, err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
|
||||
@@ -414,7 +414,7 @@ func transformHistogramShare(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
les, err := getScalar(args[0], 0)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse le: %s", err)
|
||||
return nil, fmt.Errorf("cannot parse le: %w", err)
|
||||
}
|
||||
|
||||
// Convert buckets with `vmrange` labels to buckets with `le` labels.
|
||||
@@ -425,7 +425,7 @@ func transformHistogramShare(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
if len(args) > 2 {
|
||||
s, err := getString(args[2], 2)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %s", err)
|
||||
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %w", err)
|
||||
}
|
||||
boundsLabel = s
|
||||
}
|
||||
@@ -513,7 +513,7 @@ func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
phis, err := getScalar(args[0], 0)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse phi: %s", err)
|
||||
return nil, fmt.Errorf("cannot parse phi: %w", err)
|
||||
}
|
||||
|
||||
// Convert buckets with `vmrange` labels to buckets with `le` labels.
|
||||
@@ -524,7 +524,7 @@ func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
if len(args) > 2 {
|
||||
s, err := getString(args[2], 2)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %s", err)
|
||||
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %w", err)
|
||||
}
|
||||
boundsLabel = s
|
||||
}
|
||||
@@ -1034,7 +1034,7 @@ func transformLabelMap(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
label, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read label name: %s", err)
|
||||
return nil, fmt.Errorf("cannot read label name: %w", err)
|
||||
}
|
||||
srcValues, dstValues, err := getStringPairs(args[2:])
|
||||
if err != nil {
|
||||
@@ -1179,7 +1179,7 @@ func transformLabelTransform(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
|
||||
r, err := metricsql.CompileRegexp(regex)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot compile regex %q: %s`, regex, err)
|
||||
return nil, fmt.Errorf(`cannot compile regex %q: %w`, regex, err)
|
||||
}
|
||||
return labelReplace(args[0], label, r, label, replacement)
|
||||
}
|
||||
@@ -1208,7 +1208,7 @@ func transformLabelReplace(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
|
||||
r, err := metricsql.CompileRegexpAnchored(regex)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot compile regex %q: %s`, regex, err)
|
||||
return nil, fmt.Errorf(`cannot compile regex %q: %w`, regex, err)
|
||||
}
|
||||
return labelReplace(args[0], srcLabel, r, dstLabel, replacement)
|
||||
}
|
||||
@@ -1219,6 +1219,9 @@ func labelReplace(tss []*timeseries, srcLabel string, r *regexp.Regexp, dstLabel
|
||||
mn := &ts.MetricName
|
||||
dstValue := getDstValue(mn, dstLabel)
|
||||
srcValue := mn.GetTagValue(srcLabel)
|
||||
if !r.Match(srcValue) {
|
||||
continue
|
||||
}
|
||||
b := r.ReplaceAll(srcValue, replacementBytes)
|
||||
*dstValue = append((*dstValue)[:0], b...)
|
||||
if len(b) == 0 {
|
||||
@@ -1235,7 +1238,7 @@ func transformLabelValue(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
labelName, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get label name: %s", err)
|
||||
return nil, fmt.Errorf("cannot get label name: %w", err)
|
||||
}
|
||||
rvs := args[0]
|
||||
for _, ts := range rvs {
|
||||
@@ -1262,15 +1265,15 @@ func transformLabelMatch(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
labelName, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get label name: %s", err)
|
||||
return nil, fmt.Errorf("cannot get label name: %w", err)
|
||||
}
|
||||
labelRe, err := getString(args[2], 2)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get regexp: %s", err)
|
||||
return nil, fmt.Errorf("cannot get regexp: %w", err)
|
||||
}
|
||||
r, err := metricsql.CompileRegexpAnchored(labelRe)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot compile regexp %q: %s`, labelRe, err)
|
||||
return nil, fmt.Errorf(`cannot compile regexp %q: %w`, labelRe, err)
|
||||
}
|
||||
tss := args[0]
|
||||
rvs := tss[:0]
|
||||
@@ -1290,15 +1293,15 @@ func transformLabelMismatch(tfa *transformFuncArg) ([]*timeseries, error) {
|
||||
}
|
||||
labelName, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get label name: %s", err)
|
||||
return nil, fmt.Errorf("cannot get label name: %w", err)
|
||||
}
|
||||
labelRe, err := getString(args[2], 2)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot get regexp: %s", err)
|
||||
return nil, fmt.Errorf("cannot get regexp: %w", err)
|
||||
}
|
||||
r, err := metricsql.CompileRegexpAnchored(labelRe)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(`cannot compile regexp %q: %s`, labelRe, err)
|
||||
return nil, fmt.Errorf(`cannot compile regexp %q: %w`, labelRe, err)
|
||||
}
|
||||
tss := args[0]
|
||||
rvs := tss[:0]
|
||||
@@ -1398,7 +1401,7 @@ func newTransformFuncSortByLabel(isDesc bool) transformFunc {
|
||||
}
|
||||
label, err := getString(args[1], 1)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse label name for sorting: %s", err)
|
||||
return nil, fmt.Errorf("cannot parse label name for sorting: %w", err)
|
||||
}
|
||||
rvs := args[0]
|
||||
sort.SliceStable(rvs, func(i, j int) bool {
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
@@ -28,8 +29,27 @@ var (
|
||||
|
||||
bigMergeConcurrency = flag.Int("bigMergeConcurrency", 0, "The maximum number of CPU cores to use for big merges. Default value is used if set to 0")
|
||||
smallMergeConcurrency = flag.Int("smallMergeConcurrency", 0, "The maximum number of CPU cores to use for small merges. Default value is used if set to 0")
|
||||
|
||||
denyQueriesOutsideRetention = flag.Bool("denyQueriesOutsideRetention", false, "Whether to deny queries outside of the configured -retentionPeriod. "+
|
||||
"When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. "+
|
||||
"This may be useful when multiple data sources with distinct retentions are hidden behind query-tee")
|
||||
)
|
||||
|
||||
// CheckTimeRange returns true if the given tr is denied for querying.
|
||||
func CheckTimeRange(tr storage.TimeRange) error {
|
||||
if !*denyQueriesOutsideRetention {
|
||||
return nil
|
||||
}
|
||||
minAllowedTimestamp := (int64(fasttime.UnixTimestamp()) - int64(*retentionPeriod)*3600*24*30) * 1000
|
||||
if tr.MinTimestamp > minAllowedTimestamp {
|
||||
return nil
|
||||
}
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf("the given time range %s is outside the allowed retention of %d months according to -denyQueriesOutsideRetention", &tr, *retentionPeriod),
|
||||
StatusCode: http.StatusServiceUnavailable,
|
||||
}
|
||||
}
|
||||
|
||||
// Init initializes vmstorage.
|
||||
func Init() {
|
||||
InitWithoutMetrics()
|
||||
@@ -171,7 +191,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
snapshotPath, err := Storage.CreateSnapshot()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("cannot create snapshot: %s", err)
|
||||
err = fmt.Errorf("cannot create snapshot: %w", err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
@@ -185,7 +205,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
snapshots, err := Storage.ListSnapshots()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("cannot list snapshots: %s", err)
|
||||
err = fmt.Errorf("cannot list snapshots: %w", err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
@@ -202,7 +222,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
snapshotName := r.FormValue("snapshot")
|
||||
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
|
||||
err = fmt.Errorf("cannot delete snapshot %q: %s", snapshotName, err)
|
||||
err = fmt.Errorf("cannot delete snapshot %q: %w", snapshotName, err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
@@ -212,13 +232,13 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
snapshots, err := Storage.ListSnapshots()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("cannot list snapshots: %s", err)
|
||||
err = fmt.Errorf("cannot list snapshots: %w", err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
for _, snapshotName := range snapshots {
|
||||
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
|
||||
err = fmt.Errorf("cannot delete snapshot %q: %s", snapshotName, err)
|
||||
err = fmt.Errorf("cannot delete snapshot %q: %w", snapshotName, err)
|
||||
jsonResponseError(w, err)
|
||||
return true
|
||||
}
|
||||
@@ -409,6 +429,16 @@ func registerStorageMetrics() {
|
||||
return float64(m().AddRowsConcurrencyCurrent)
|
||||
})
|
||||
|
||||
metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
|
||||
return float64(m().SlowRowInserts)
|
||||
})
|
||||
metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
|
||||
return float64(m().SlowPerDayIndexInserts)
|
||||
})
|
||||
metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
|
||||
return float64(m().SlowMetricNameLoads)
|
||||
})
|
||||
|
||||
metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
|
||||
return float64(tm().BigRowsCount)
|
||||
})
|
||||
@@ -452,6 +482,9 @@ func registerStorageMetrics() {
|
||||
metrics.NewGauge(`vm_cache_entries{type="storage/hour_metric_ids"}`, func() float64 {
|
||||
return float64(m().HourMetricIDCacheSize)
|
||||
})
|
||||
metrics.NewGauge(`vm_cache_entries{type="storage/next_day_metric_ids"}`, func() float64 {
|
||||
return float64(m().NextDayMetricIDCacheSize)
|
||||
})
|
||||
metrics.NewGauge(`vm_cache_entries{type="storage/bigIndexBlocks"}`, func() float64 {
|
||||
return float64(tm().BigIndexBlocksCacheSize)
|
||||
})
|
||||
@@ -492,6 +525,9 @@ func registerStorageMetrics() {
|
||||
metrics.NewGauge(`vm_cache_size_bytes{type="storage/hour_metric_ids"}`, func() float64 {
|
||||
return float64(m().HourMetricIDCacheSizeBytes)
|
||||
})
|
||||
metrics.NewGauge(`vm_cache_size_bytes{type="storage/next_day_metric_ids"}`, func() float64 {
|
||||
return float64(m().NextDayMetricIDCacheSizeBytes)
|
||||
})
|
||||
metrics.NewGauge(`vm_cache_size_bytes{type="indexdb/tagFilters"}`, func() float64 {
|
||||
return float64(idbm().TagCacheSizeBytes)
|
||||
})
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,9 +2,9 @@
|
||||
|
||||
DOCKER_NAMESPACE := victoriametrics
|
||||
|
||||
ROOT_IMAGE ?= scratch
|
||||
CERTS_IMAGE := alpine:3.11
|
||||
GO_BUILDER_IMAGE := golang:1.14.2
|
||||
ROOT_IMAGE ?= alpine:3.12
|
||||
CERTS_IMAGE := alpine:3.12
|
||||
GO_BUILDER_IMAGE := golang:1.14.4
|
||||
BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr : _)
|
||||
BASE_IMAGE := local/base:1.1.1-$(shell echo $(ROOT_IMAGE) | tr : _)-$(shell echo $(CERTS_IMAGE) | tr : _)
|
||||
|
||||
@@ -32,7 +32,9 @@ app-via-docker: package-base package-builder
|
||||
--env GO111MODULE=on \
|
||||
$(DOCKER_OPTS) \
|
||||
$(BUILDER_IMAGE) \
|
||||
go build $(RACE) -mod=vendor -trimpath -ldflags "-s -w -extldflags '-static' $(GO_BUILDINFO)" -tags 'netgo osusergo' \
|
||||
go build $(RACE) -mod=vendor -trimpath \
|
||||
-ldflags "-extldflags '-static' $(GO_BUILDINFO)" \
|
||||
-tags 'netgo osusergo nethttpomithttp2' \
|
||||
-o bin/$(APP_NAME)$(APP_SUFFIX)-prod $(PKG_PREFIX)/app/$(APP_NAME)
|
||||
|
||||
package-via-docker:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
#### Docker compose
|
||||
|
||||
To spin-up setup of VictoriaMetrics, Prometheus and Grafana run following command:
|
||||
To spin-up setup of VictoriaMetrics, vmagent and Grafana run following command:
|
||||
|
||||
`docker-compose up`
|
||||
|
||||
@@ -13,11 +13,11 @@ VictoriaMetrics opens following ports:
|
||||
* `--opentsdbListenAddr=:4242`
|
||||
* `--httpListenAddr=:8428`
|
||||
|
||||
##### Prometheus
|
||||
##### vmagent
|
||||
|
||||
To access service open following [link](http://localhost:9090).
|
||||
|
||||
Prometheus is already configured to use VictoriaMetrics as remote storage.
|
||||
vmagent is used for scraping and pushing timeseries to
|
||||
VictoriaMetrics instance. It accepts Prometheus-compatible
|
||||
configuration `prometheus.yml` with listed targets for scraping.
|
||||
|
||||
##### Grafana
|
||||
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
version: '3.5'
|
||||
services:
|
||||
prometheus:
|
||||
container_name: prometheus
|
||||
image: prom/prometheus:v2.17.2
|
||||
vmagent:
|
||||
container_name: vmagent
|
||||
image: victoriametrics/vmagent
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
- 9090:9090
|
||||
- 8429:8429
|
||||
volumes:
|
||||
- promdata:/prometheus
|
||||
- vmagentdata:/vmagentdata
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--promscrape.config=/etc/prometheus/prometheus.yml'
|
||||
- '--remoteWrite.url=http://victoriametrics:8428/api/v1/write'
|
||||
networks:
|
||||
- vm_net
|
||||
restart: always
|
||||
@@ -35,13 +35,7 @@ services:
|
||||
restart: always
|
||||
grafana:
|
||||
container_name: grafana
|
||||
image: grafana/grafana:6.7.2
|
||||
entrypoint: >
|
||||
/bin/sh -c "
|
||||
cd /var/lib/grafana &&
|
||||
mkdir -p dashboards &&
|
||||
sed 's/$${DS_PROMETHEUS}/Prometheus/g' vm.json > dashboards/vm.json &&
|
||||
/run.sh"
|
||||
image: grafana/grafana:7.0.3
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
@@ -49,12 +43,12 @@ services:
|
||||
volumes:
|
||||
- grafanadata:/var/lib/grafana
|
||||
- ./provisioning/:/etc/grafana/provisioning/
|
||||
- ./../../dashboards/victoriametrics.json:/var/lib/grafana/vm.json
|
||||
- ./../../dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json
|
||||
networks:
|
||||
- vm_net
|
||||
restart: always
|
||||
volumes:
|
||||
promdata: {}
|
||||
vmagentdata: {}
|
||||
vmdata: {}
|
||||
grafanadata: {}
|
||||
networks:
|
||||
|
||||
@@ -1,16 +1,10 @@
|
||||
global:
|
||||
scrape_interval: 10s
|
||||
evaluation_interval: 10s
|
||||
|
||||
remote_write:
|
||||
- url: "http://victoriametrics:8428/api/v1/write"
|
||||
queue_config:
|
||||
max_samples_per_send: 10000
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
- job_name: 'vmagent'
|
||||
static_configs:
|
||||
- targets: ['prometheus:9090']
|
||||
- targets: ['vmagent:8429']
|
||||
- job_name: 'victoriametrics'
|
||||
static_configs:
|
||||
- targets: ['victoriametrics:8428']
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user