mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-07 10:56:50 +03:00
Compare commits
119 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69a647b0d2 | ||
|
|
f5dd2a71a6 | ||
|
|
4b98e436ef | ||
|
|
4e8d6b80e0 | ||
|
|
d120197676 | ||
|
|
4cb3af1a36 | ||
|
|
0d92abfbf6 | ||
|
|
ff1a725a56 | ||
|
|
05ae1472e3 | ||
|
|
4fd3f6f991 | ||
|
|
6281549f31 | ||
|
|
9f4e86ac2f | ||
|
|
af49c5bdf6 | ||
|
|
a47a05dfd2 | ||
|
|
3d4008263f | ||
|
|
72ff05255f | ||
|
|
a99d606220 | ||
|
|
f8692a1d43 | ||
|
|
78b28a03b6 | ||
|
|
854f40acf2 | ||
|
|
6d059b28bf | ||
|
|
f26b94cfb6 | ||
|
|
937338abdf | ||
|
|
f2b04f2efe | ||
|
|
ff624c9125 | ||
|
|
13b1358c07 | ||
|
|
f7ff809c1e | ||
|
|
ab6e994bab | ||
|
|
d2f30e8d79 | ||
|
|
270552fde4 | ||
|
|
32652485e3 | ||
|
|
d988f89415 | ||
|
|
3d5f48ec74 | ||
|
|
0ec43cb8b0 | ||
|
|
213c3903c9 | ||
|
|
a7797dae09 | ||
|
|
d186472081 | ||
|
|
7e2669f733 | ||
|
|
ff6d093e1b | ||
|
|
8311193293 | ||
|
|
80609fdf35 | ||
|
|
d7291487be | ||
|
|
f5a4731412 | ||
|
|
947009f459 | ||
|
|
4cf7238b73 | ||
|
|
c602284a99 | ||
|
|
2f35cf13c6 | ||
|
|
b4103e055a | ||
|
|
dde29c3c18 | ||
|
|
b3fcd726e3 | ||
|
|
5b6a9675d8 | ||
|
|
aa647637bf | ||
|
|
84860167d0 | ||
|
|
0101a2d7ca | ||
|
|
d2cab369b2 | ||
|
|
8905bc2a40 | ||
|
|
f9847352b4 | ||
|
|
d1a9d8aa1c | ||
|
|
b93b01bc6d | ||
|
|
dbd8beccfa | ||
|
|
6b23df2bec | ||
|
|
619d4959c7 | ||
|
|
70ea4e28a7 | ||
|
|
74a2943030 | ||
|
|
b3ec0fb5e2 | ||
|
|
9a3afea123 | ||
|
|
7705c19720 | ||
|
|
80c18b7275 | ||
|
|
cf87b810b7 | ||
|
|
538fdfe133 | ||
|
|
f52769f6ee | ||
|
|
e6a782498a | ||
|
|
a441cdd1d9 | ||
|
|
d0f08b4a58 | ||
|
|
aae8064285 | ||
|
|
7e173655ba | ||
|
|
92212f04da | ||
|
|
de60ad0cd6 | ||
|
|
7a8ef517ae | ||
|
|
d61bac9fd9 | ||
|
|
eac3da478e | ||
|
|
0890c780c2 | ||
|
|
fd1a6ce9ae | ||
|
|
9b90c841c6 | ||
|
|
93c87d28f6 | ||
|
|
23c55181ef | ||
|
|
b19ca3eb5f | ||
|
|
4e850cd6a7 | ||
|
|
8cb35974af | ||
|
|
a0380a0a91 | ||
|
|
3a68c47de0 | ||
|
|
697b6af10f | ||
|
|
93267f143f | ||
|
|
3412d5d138 | ||
|
|
27f7cca7ff | ||
|
|
cf38c3c62f | ||
|
|
c56b66210f | ||
|
|
82ffbcb9a6 | ||
|
|
82ccdfaa91 | ||
|
|
ab8f5545bc | ||
|
|
0eacea1de1 | ||
|
|
737d641920 | ||
|
|
4fc33163c4 | ||
|
|
f9f3afb6af | ||
|
|
8b32e7c3a0 | ||
|
|
1573ececb2 | ||
|
|
a249cd9d22 | ||
|
|
ef0e37cb9e | ||
|
|
0afd48d2ee | ||
|
|
42866fa754 | ||
|
|
827a3a7866 | ||
|
|
606585f7be | ||
|
|
21598ac417 | ||
|
|
894e5d2b9b | ||
|
|
415b1ddfb5 | ||
|
|
db7dd96346 | ||
|
|
ba48438b06 | ||
|
|
7882a0dbbf | ||
|
|
4fe67504f9 |
2
.github/workflows/main.yml
vendored
2
.github/workflows/main.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
run: |
|
||||
go get -u golang.org/x/lint/golint
|
||||
go get -u github.com/kisielk/errcheck
|
||||
go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
|
||||
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.27.0
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@master
|
||||
- name: Build
|
||||
|
||||
8
Makefile
8
Makefile
@@ -145,3 +145,11 @@ golangci-lint: install-golangci-lint
|
||||
|
||||
install-golangci-lint:
|
||||
which golangci-lint || GO111MODULE=off go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
|
||||
|
||||
docs-sync:
|
||||
cp app/vmagent/README.md docs/vmagent.md
|
||||
cp app/vmalert/README.md docs/vmalert.md
|
||||
cp app/vmauth/README.md docs/vmauth.md
|
||||
cp app/vmbackup/README.md docs/vmbackup.md
|
||||
cp app/vmrestore/README.md docs/vmrestore.md
|
||||
cp README.md docs/Single-server-VictoriaMetrics.md
|
||||
|
||||
63
README.md
63
README.md
@@ -10,18 +10,25 @@
|
||||
|
||||
## VictoriaMetrics
|
||||
|
||||
VictoriaMetrics is fast, cost-effective and scalable time-series database. It can be used as long-term remote storage for Prometheus.
|
||||
VictoriaMetrics is fast, cost-effective and scalable time-series database.
|
||||
|
||||
It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and
|
||||
in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics). Just download VictoriaMetrics and see [how to start it](#how-to-start-victoriametrics).
|
||||
|
||||
Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
See our [Wiki](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki) for additional documentation.
|
||||
|
||||
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
|
||||
|
||||
|
||||
## Case studies and talks
|
||||
|
||||
* [Adidas](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#adidas)
|
||||
* [CERN](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#cern)
|
||||
* [COLOPL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#colopl)
|
||||
* [Zerodha](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#zerodha)
|
||||
* [Wix.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wixcom)
|
||||
* [Wedos.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wedoscom)
|
||||
* [Synthesio](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#synthesio)
|
||||
@@ -34,6 +41,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
|
||||
## Prominent features
|
||||
|
||||
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md).
|
||||
See [these docs](#prometheus-setup) for details.
|
||||
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
|
||||
VictoriaMetrics implements [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL) query language, which is inspired by PromQL.
|
||||
* Supports global query view. Multiple Prometheus instances may write data into VictoriaMetrics. Later this data may be used in a single query.
|
||||
@@ -116,6 +125,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
* [Monitoring](#monitoring)
|
||||
* [Troubleshooting](#troubleshooting)
|
||||
* [Backfilling](#backfilling)
|
||||
* [Replication](#replication)
|
||||
* [Backups](#backups)
|
||||
* [Profiling](#profiling)
|
||||
* [Integrations](#integrations)
|
||||
* [Third-party contributions](#third-party-contributions)
|
||||
@@ -571,11 +582,11 @@ Run `make package-victoria-metrics`. It builds `victoriametrics/victoria-metrics
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-victoria-metrics`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of `alpine` image for improved debuggability. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `scratch` image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-victoria-metrics
|
||||
ROOT_IMAGE=scratch make package-victoria-metrics
|
||||
```
|
||||
|
||||
### Start with docker-compose
|
||||
@@ -763,7 +774,13 @@ The required resources for query path:
|
||||
### High availability
|
||||
|
||||
1) Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
|
||||
2) Add addresses of these instances to `remote_write` section in Prometheus config:
|
||||
2) Pass addresses of these instances to [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) via `-remoteWrite.url` command-line flag:
|
||||
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
|
||||
```
|
||||
|
||||
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
|
||||
|
||||
```yml
|
||||
remote_write:
|
||||
@@ -782,6 +799,8 @@ remote_write:
|
||||
kill -HUP `pidof prometheus`
|
||||
```
|
||||
|
||||
It is recommended to use [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) instead of Prometheus for highly loaded setups.
|
||||
|
||||
4) Now Prometheus should write data into all the configured `remote_write` urls in parallel.
|
||||
5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
|
||||
6) Set up Prometheus datasource in Grafana that points to Promxy.
|
||||
@@ -792,6 +811,7 @@ to write data to `victoriametrics-addr-1`, while each `r2` should write data to
|
||||
Another option is to write data simultaneously from Prometheus HA pair to a pair of VictoriaMetrics instances
|
||||
with the enabled de-duplication. See [this section](#deduplication) for details.
|
||||
|
||||
|
||||
### Deduplication
|
||||
|
||||
VictoriaMetrics de-duplicates data points if `-dedup.minScrapeInterval` command-line flag
|
||||
@@ -809,6 +829,8 @@ Data is split in per-month subdirectories inside `<-storageDataPath>/data/small`
|
||||
Directories for months outside the configured retention are deleted on the first day of new month.
|
||||
In order to keep data according to `-retentionPeriod` max disk space usage is going to be `-retentionPeriod` + 1 month.
|
||||
For example if `-retentionPeriod` is set to 1, data for January is deleted on March 1st.
|
||||
It is safe to extend `-retentionPeriod` on existing data. If `-retentionPeriod` is set to lower
|
||||
value than before then data outside the configured period will be eventually deleted.
|
||||
|
||||
### Multiple retentions
|
||||
|
||||
@@ -874,6 +896,10 @@ Consider setting the following command-line flags:
|
||||
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
|
||||
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`.
|
||||
|
||||
Prefer authorizing all the incoming requests from untrusted networks with [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md)
|
||||
or similar auth proxy.
|
||||
|
||||
|
||||
### Tuning
|
||||
|
||||
* There is no need for VictoriaMetrics tuning since it uses reasonable defaults for command-line flags,
|
||||
@@ -910,6 +936,12 @@ The most interesting metrics are:
|
||||
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
|
||||
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
|
||||
* `sum(vm_data_size_bytes)` - the total size of data on disk.
|
||||
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
|
||||
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
|
||||
of the current number of active time series.
|
||||
* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
|
||||
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
|
||||
of the current number of active time series.
|
||||
|
||||
|
||||
### Troubleshooting
|
||||
@@ -922,8 +954,9 @@ The most interesting metrics are:
|
||||
|
||||
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
|
||||
then it is likely you have too many active time series for the current amount of RAM.
|
||||
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
|
||||
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
|
||||
ingestion performance.
|
||||
ingestion and query performance in this case.
|
||||
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
|
||||
option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
|
||||
|
||||
@@ -970,6 +1003,24 @@ the query cache, which could contain incomplete data cached during the backfilli
|
||||
Yet another solution is to increase `-search.cacheTimestampOffset` flag value in order to disable caching
|
||||
for data with timestamps close to the current time.
|
||||
|
||||
|
||||
### Replication
|
||||
|
||||
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
|
||||
|
||||
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
|
||||
|
||||
See also [high availability docs](#high-availability) and [backup docs](#backups).
|
||||
|
||||
|
||||
### Backups
|
||||
|
||||
VictoriaMetrics supports backups via [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md)
|
||||
and [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md) tools.
|
||||
We also provide provide `vmbackuper` tool for paid enterprise subscribers - see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for details.
|
||||
|
||||
|
||||
### Profiling
|
||||
|
||||
VictoriaMetrics provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
|
||||
@@ -25,6 +26,8 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
|
||||
@@ -52,8 +52,9 @@ publish-vmagent:
|
||||
APP_NAME=vmagent $(MAKE) publish-via-docker
|
||||
|
||||
run-vmagent:
|
||||
mkdir -p vmagent-data
|
||||
DOCKER_OPTS='-v $(shell pwd)/vmagent-data:/vmagent-data' \
|
||||
mkdir -p vmagent-remotewrite-data
|
||||
DOCKER_OPTS='-v $(shell pwd)/vmagent-remotewrite-data:/vmagent-remotewrite-data' \
|
||||
ARGS='-remoteWrite.url=http://localhost:8428/api/v1/write' \
|
||||
APP_NAME=vmagent \
|
||||
$(MAKE) run-via-docker
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
## vmagent
|
||||
|
||||
`vmagent` is a tiny but brave agent, which helps you collecting metrics from various sources
|
||||
and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage system that supports `remote_write` protocol.
|
||||
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
|
||||
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
|
||||
|
||||
<img alt="vmagent" src="vmagent.png">
|
||||
|
||||
@@ -11,7 +11,7 @@ or any other Prometheus-compatible storage system that supports `remote_write` p
|
||||
|
||||
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
|
||||
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
|
||||
Also, we found that users’ infrastructure is like snowflakes - never alike, and we decided to add more flexibility
|
||||
Also, we found that users’ infrastructure are snowflakes - no two are alike, and we decided to add more flexibility
|
||||
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
|
||||
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
|
||||
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
|
||||
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth comparing to Prometheus.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
|
||||
|
||||
|
||||
### Quick Start
|
||||
@@ -40,8 +40,7 @@ Just download `vmutils-*` archive from [releases page](https://github.com/Victor
|
||||
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
|
||||
|
||||
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. Multiple `-remoteWrite.url` args can be set in parallel
|
||||
in order to replicate data concurrently to multiple remote storage systems.
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary amount of remote storage systems.
|
||||
|
||||
Example command line:
|
||||
|
||||
@@ -49,7 +48,7 @@ Example command line:
|
||||
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
```
|
||||
|
||||
If you need collecting only Influx data, then the following command line would be enough:
|
||||
If you only need to collect Influx data, then the following is sufficient:
|
||||
|
||||
```
|
||||
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
@@ -79,14 +78,14 @@ See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/Victor
|
||||
#### Drop-in replacement for Prometheus
|
||||
|
||||
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such setup.
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
|
||||
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
|
||||
|
||||
|
||||
#### Replication and high availability
|
||||
|
||||
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
|
||||
If a single remote storage instance temporarily goes out of service, then the collected data remains available in another remote storage instances.
|
||||
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instances.
|
||||
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
|
||||
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
|
||||
|
||||
@@ -94,13 +93,13 @@ Then it sends the buffered data to the remote storage in order to prevent data g
|
||||
#### Relabeling and filtering
|
||||
|
||||
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
|
||||
it can remove unneeded samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
See [these docs](#relabeling) for details.
|
||||
|
||||
|
||||
#### Splitting data streams among multiple systems
|
||||
|
||||
`vmagent` supports splitting of the collected data among muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
`vmagent` supports splitting the collected data between muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
|
||||
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
|
||||
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
|
||||
@@ -148,6 +147,10 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
|
||||
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
|
||||
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
|
||||
|
||||
Note that `vmagent` doesn't support `refresh_interval` option these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
|
||||
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
|
||||
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
|
||||
|
||||
|
||||
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
|
||||
|
||||
@@ -200,12 +203,12 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
|
||||
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
|
||||
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`.
|
||||
|
||||
* It is recommended increasing `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
|
||||
* It is recommended to increase `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
|
||||
and `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
|
||||
|
||||
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
|
||||
The directory can grow big when remote storage is unavailable during extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
|
||||
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
|
||||
|
||||
|
||||
### How to build from sources
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -39,6 +40,8 @@ var (
|
||||
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
|
||||
"Usually :4242 must be set. Doesn't work if empty")
|
||||
opentsdbHTTPListenAddr = flag.String("opentsdbHTTPListenAddr", "", "TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty")
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmagent. The following files are checked: "+
|
||||
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . See also -promscrape.config.dryRun")
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -49,9 +52,27 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
|
||||
if *dryRun {
|
||||
if err := flag.Set("promscrape.config.strictParse", "true"); err != nil {
|
||||
logger.Panicf("BUG: cannot set promscrape.config.strictParse=true: %s", err)
|
||||
}
|
||||
if err := remotewrite.CheckRelabelConfigs(); err != nil {
|
||||
logger.Fatalf("error when checking relabel configs: %s", err)
|
||||
}
|
||||
if err := promscrape.CheckConfig(); err != nil {
|
||||
logger.Fatalf("error when checking Prometheus config: %s", err)
|
||||
}
|
||||
logger.Infof("all the configs are ok; exitting with 0 status code")
|
||||
return
|
||||
}
|
||||
|
||||
logger.Infof("starting vmagent at %q...", *httpListenAddr)
|
||||
startTime := time.Now()
|
||||
remotewrite.Init()
|
||||
@@ -185,3 +206,15 @@ var (
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
|
||||
)
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
@@ -69,7 +69,7 @@ func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentq
|
||||
token := bearerToken.GetOptionalArg(argIdx)
|
||||
if len(token) > 0 {
|
||||
if authHeader != "" {
|
||||
logger.Panicf("FATAL: `-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", token)
|
||||
logger.Fatalf("`-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", token)
|
||||
}
|
||||
authHeader = "Bearer " + token
|
||||
}
|
||||
@@ -85,11 +85,11 @@ func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentq
|
||||
switch scheme {
|
||||
case "http", "https":
|
||||
default:
|
||||
logger.Panicf("FATAL: unsupported scheme in -remoteWrite.url=%q: %q. It must be http or https", remoteWriteURL, scheme)
|
||||
logger.Fatalf("unsupported scheme in -remoteWrite.url=%q: %q. It must be http or https", remoteWriteURL, scheme)
|
||||
}
|
||||
host := string(u.Host())
|
||||
if len(host) == 0 {
|
||||
logger.Panicf("FATAL: invalid -remoteWrite.url=%q: host cannot be empty. Make sure the url looks like `http://host:port/path`", remoteWriteURL)
|
||||
logger.Fatalf("invalid -remoteWrite.url=%q: host cannot be empty. Make sure the url looks like `http://host:port/path`", remoteWriteURL)
|
||||
}
|
||||
requestURI := string(u.RequestURI())
|
||||
isTLS := scheme == "https"
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -15,7 +15,8 @@ import (
|
||||
|
||||
var (
|
||||
flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
|
||||
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage")
|
||||
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage. "+
|
||||
"Minimum supported interval is 1 second")
|
||||
maxUnpackedBlockSize = flag.Int("remoteWrite.maxBlockSize", 32*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
|
||||
"It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics")
|
||||
)
|
||||
@@ -55,6 +56,10 @@ func (ps *pendingSeries) Push(tss []prompbmarshal.TimeSeries) {
|
||||
}
|
||||
|
||||
func (ps *pendingSeries) periodicFlusher() {
|
||||
flushSeconds := int64(flushInterval.Seconds())
|
||||
if flushSeconds <= 0 {
|
||||
flushSeconds = 1
|
||||
}
|
||||
ticker := time.NewTicker(*flushInterval)
|
||||
defer ticker.Stop()
|
||||
mustStop := false
|
||||
@@ -63,7 +68,7 @@ func (ps *pendingSeries) periodicFlusher() {
|
||||
case <-ps.stopCh:
|
||||
mustStop = true
|
||||
case <-ticker.C:
|
||||
if time.Since(ps.wr.lastFlushTime) < *flushInterval/2 {
|
||||
if fasttime.UnixTimestamp()-ps.wr.lastFlushTime < uint64(flushSeconds) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
@@ -76,7 +81,7 @@ func (ps *pendingSeries) periodicFlusher() {
|
||||
type writeRequest struct {
|
||||
wr prompbmarshal.WriteRequest
|
||||
pushBlock func(block []byte)
|
||||
lastFlushTime time.Time
|
||||
lastFlushTime uint64
|
||||
|
||||
tss []prompbmarshal.TimeSeries
|
||||
|
||||
@@ -108,7 +113,7 @@ func (wr *writeRequest) reset() {
|
||||
|
||||
func (wr *writeRequest) flush() {
|
||||
wr.wr.Timeseries = wr.tss
|
||||
wr.lastFlushTime = time.Now()
|
||||
wr.lastFlushTime = fasttime.UnixTimestamp()
|
||||
pushWriteRequest(&wr.wr, wr.pushBlock)
|
||||
wr.reset()
|
||||
}
|
||||
@@ -144,13 +149,8 @@ func (wr *writeRequest) copyTimeSeries(dst, src *prompbmarshal.TimeSeries) {
|
||||
}
|
||||
dst.Labels = labelsDst[labelsLen:]
|
||||
|
||||
samplesDst = append(samplesDst, prompbmarshal.Sample{})
|
||||
dstSample := &samplesDst[len(samplesDst)-1]
|
||||
if len(src.Samples) != 1 {
|
||||
logger.Panicf("BUG: unexpected number of samples in time series; got %d; want 1", len(src.Samples))
|
||||
}
|
||||
*dstSample = src.Samples[0]
|
||||
dst.Samples = samplesDst[len(samplesDst)-1:]
|
||||
samplesDst = append(samplesDst, src.Samples...)
|
||||
dst.Samples = samplesDst[len(samplesDst)-len(src.Samples):]
|
||||
|
||||
wr.samples = samplesDst
|
||||
wr.labels = labelsDst
|
||||
|
||||
@@ -2,6 +2,7 @@ package remotewrite
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
@@ -16,35 +17,60 @@ var (
|
||||
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
|
||||
relabelConfigPathGlobal = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
|
||||
"before sending them to -remoteWrite.url. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config for details")
|
||||
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
|
||||
)
|
||||
|
||||
var labelsGlobal []prompbmarshal.Label
|
||||
var prcsGlobal []promrelabel.ParsedRelabelConfig
|
||||
|
||||
// initRelabelGlobal must be called after parsing command-line flags.
|
||||
func initRelabelGlobal() {
|
||||
// CheckRelabelConfigs checks -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig.
|
||||
func CheckRelabelConfigs() error {
|
||||
_, err := loadRelabelConfigs()
|
||||
return err
|
||||
}
|
||||
|
||||
func loadRelabelConfigs() (*relabelConfigs, error) {
|
||||
var rcs relabelConfigs
|
||||
if *relabelConfigPathGlobal != "" {
|
||||
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
rcs.global = global
|
||||
}
|
||||
if len(*relabelConfigPaths) > len(*remoteWriteURLs) {
|
||||
return nil, fmt.Errorf("too many -remoteWrite.urlRelabelConfig args: %d; it mustn't exceed the number of -remoteWrite.url args: %d",
|
||||
len(*relabelConfigPaths), len(*remoteWriteURLs))
|
||||
}
|
||||
rcs.perURL = make([][]promrelabel.ParsedRelabelConfig, len(*remoteWriteURLs))
|
||||
for i, path := range *relabelConfigPaths {
|
||||
prc, err := promrelabel.LoadRelabelConfigs(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", path, err)
|
||||
}
|
||||
rcs.perURL[i] = prc
|
||||
}
|
||||
return &rcs, nil
|
||||
}
|
||||
|
||||
type relabelConfigs struct {
|
||||
global []promrelabel.ParsedRelabelConfig
|
||||
perURL [][]promrelabel.ParsedRelabelConfig
|
||||
}
|
||||
|
||||
// initLabelsGlobal must be called after parsing command-line flags.
|
||||
func initLabelsGlobal() {
|
||||
// Init labelsGlobal
|
||||
labelsGlobal = nil
|
||||
for _, s := range *unparsedLabelsGlobal {
|
||||
n := strings.IndexByte(s, '=')
|
||||
if n < 0 {
|
||||
logger.Panicf("FATAL: missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
|
||||
logger.Fatalf("missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
|
||||
}
|
||||
labelsGlobal = append(labelsGlobal, prompbmarshal.Label{
|
||||
Name: s[:n],
|
||||
Value: s[n+1:],
|
||||
})
|
||||
}
|
||||
|
||||
// Init prcsGlobal
|
||||
prcsGlobal = nil
|
||||
if len(*relabelConfigPathGlobal) > 0 {
|
||||
var err error
|
||||
prcsGlobal, err = promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label, prcs []promrelabel.ParsedRelabelConfig) []prompbmarshal.TimeSeries {
|
||||
|
||||
@@ -3,6 +3,7 @@ package remotewrite
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
@@ -10,8 +11,8 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
xxhash "github.com/cespare/xxhash/v2"
|
||||
)
|
||||
@@ -20,9 +21,8 @@ var (
|
||||
remoteWriteURLs = flagutil.NewArray("remoteWrite.url", "Remote storage URL to write data to. It must support Prometheus remote_write API. "+
|
||||
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
|
||||
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
|
||||
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
|
||||
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
|
||||
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
|
||||
"isn't enough for sending high volume of collected data to remote storage")
|
||||
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
|
||||
"It is hidden by default, since it can contain sensistive auth info")
|
||||
@@ -34,6 +34,9 @@ var (
|
||||
|
||||
var rwctxs []*remoteWriteCtx
|
||||
|
||||
// Contains the current relabelConfigs.
|
||||
var allRelabelConfigs atomic.Value
|
||||
|
||||
// Init initializes remotewrite.
|
||||
//
|
||||
// It must be called after flag.Parse().
|
||||
@@ -41,14 +44,19 @@ var rwctxs []*remoteWriteCtx
|
||||
// Stop must be called for graceful shutdown.
|
||||
func Init() {
|
||||
if len(*remoteWriteURLs) == 0 {
|
||||
logger.Panicf("FATAL: at least one `-remoteWrite.url` must be set")
|
||||
logger.Fatalf("at least one `-remoteWrite.url` must be set")
|
||||
}
|
||||
|
||||
if !*showRemoteWriteURL {
|
||||
// remoteWrite.url can contain authentication codes, so hide it at `/metrics` output.
|
||||
httpserver.RegisterSecretFlag("remoteWrite.url")
|
||||
}
|
||||
initRelabelGlobal()
|
||||
initLabelsGlobal()
|
||||
rcs, err := loadRelabelConfigs()
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot load relabel configs: %s", err)
|
||||
}
|
||||
allRelabelConfigs.Store(rcs)
|
||||
|
||||
maxInmemoryBlocks := memory.Allowed() / len(*remoteWriteURLs) / maxRowsPerBlock / 100
|
||||
if maxInmemoryBlocks > 200 {
|
||||
@@ -61,23 +69,47 @@ func Init() {
|
||||
maxInmemoryBlocks = 2
|
||||
}
|
||||
for i, remoteWriteURL := range *remoteWriteURLs {
|
||||
relabelConfigPath := ""
|
||||
if i < len(*relabelConfigPaths) {
|
||||
relabelConfigPath = (*relabelConfigPaths)[i]
|
||||
}
|
||||
urlLabelValue := fmt.Sprintf("secret-url-%d", i+1)
|
||||
if *showRemoteWriteURL {
|
||||
urlLabelValue = remoteWriteURL
|
||||
}
|
||||
rwctx := newRemoteWriteCtx(i, remoteWriteURL, relabelConfigPath, maxInmemoryBlocks, urlLabelValue)
|
||||
rwctx := newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, urlLabelValue)
|
||||
rwctxs = append(rwctxs, rwctx)
|
||||
}
|
||||
|
||||
// Start config reloader.
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
configReloaderWG.Add(1)
|
||||
go func() {
|
||||
defer configReloaderWG.Done()
|
||||
for {
|
||||
select {
|
||||
case <-sighupCh:
|
||||
case <-stopCh:
|
||||
return
|
||||
}
|
||||
logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
|
||||
rcs, err := loadRelabelConfigs()
|
||||
if err != nil {
|
||||
logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err)
|
||||
continue
|
||||
}
|
||||
allRelabelConfigs.Store(rcs)
|
||||
logger.Infof("Successfully reloaded relabel configs")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var stopCh = make(chan struct{})
|
||||
var configReloaderWG sync.WaitGroup
|
||||
|
||||
// Stop stops remotewrite.
|
||||
//
|
||||
// It is expected that nobody calls Push during and after the call to this func.
|
||||
func Stop() {
|
||||
close(stopCh)
|
||||
configReloaderWG.Wait()
|
||||
|
||||
for _, rwctx := range rwctxs {
|
||||
rwctx.MustStop()
|
||||
}
|
||||
@@ -86,11 +118,11 @@ func Stop() {
|
||||
|
||||
// Push sends wr to remote storage systems set via `-remoteWrite.url`.
|
||||
//
|
||||
// Each timeseries in wr.Timeseries must contain one sample.
|
||||
//
|
||||
// Note that wr may be modified by Push due to relabeling.
|
||||
func Push(wr *prompbmarshal.WriteRequest) {
|
||||
var rctx *relabelCtx
|
||||
rcs := allRelabelConfigs.Load().(*relabelConfigs)
|
||||
prcsGlobal := rcs.global
|
||||
if len(prcsGlobal) > 0 || len(labelsGlobal) > 0 {
|
||||
rctx = getRelabelCtx()
|
||||
}
|
||||
@@ -124,9 +156,9 @@ func Push(wr *prompbmarshal.WriteRequest) {
|
||||
var globalRelabelMetricsDropped = metrics.NewCounter("vmagent_remotewrite_global_relabel_metrics_dropped_total")
|
||||
|
||||
type remoteWriteCtx struct {
|
||||
idx int
|
||||
fq *persistentqueue.FastQueue
|
||||
c *client
|
||||
prcs []promrelabel.ParsedRelabelConfig
|
||||
pss []*pendingSeries
|
||||
pssNextIdx uint64
|
||||
|
||||
@@ -135,7 +167,7 @@ type remoteWriteCtx struct {
|
||||
relabelMetricsDropped *metrics.Counter
|
||||
}
|
||||
|
||||
func newRemoteWriteCtx(argIdx int, remoteWriteURL, relabelConfigPath string, maxInmemoryBlocks int, urlLabelValue string) *remoteWriteCtx {
|
||||
func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int, urlLabelValue string) *remoteWriteCtx {
|
||||
h := xxhash.Sum64([]byte(remoteWriteURL))
|
||||
path := fmt.Sprintf("%s/persistent-queue/%016X", *tmpDataPath, h)
|
||||
fq := persistentqueue.MustOpenFastQueue(path, remoteWriteURL, maxInmemoryBlocks, *maxPendingBytesPerURL)
|
||||
@@ -146,23 +178,15 @@ func newRemoteWriteCtx(argIdx int, remoteWriteURL, relabelConfigPath string, max
|
||||
return float64(fq.GetInmemoryQueueLen())
|
||||
})
|
||||
c := newClient(argIdx, remoteWriteURL, urlLabelValue, fq, *queues)
|
||||
var prcs []promrelabel.ParsedRelabelConfig
|
||||
if len(relabelConfigPath) > 0 {
|
||||
var err error
|
||||
prcs, err = promrelabel.LoadRelabelConfigs(relabelConfigPath)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", relabelConfigPath, err)
|
||||
}
|
||||
}
|
||||
pss := make([]*pendingSeries, *queues)
|
||||
for i := range pss {
|
||||
pss[i] = newPendingSeries(fq.MustWriteBlock)
|
||||
}
|
||||
return &remoteWriteCtx{
|
||||
fq: fq,
|
||||
c: c,
|
||||
prcs: prcs,
|
||||
pss: pss,
|
||||
idx: argIdx,
|
||||
fq: fq,
|
||||
c: c,
|
||||
pss: pss,
|
||||
|
||||
relabelMetricsDropped: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q, url=%q}`, path, urlLabelValue)),
|
||||
}
|
||||
@@ -172,10 +196,10 @@ func (rwctx *remoteWriteCtx) MustStop() {
|
||||
for _, ps := range rwctx.pss {
|
||||
ps.MustStop()
|
||||
}
|
||||
rwctx.idx = 0
|
||||
rwctx.pss = nil
|
||||
rwctx.fq.MustClose()
|
||||
rwctx.fq = nil
|
||||
rwctx.prcs = nil
|
||||
rwctx.c.MustStop()
|
||||
rwctx.c = nil
|
||||
|
||||
@@ -184,7 +208,9 @@ func (rwctx *remoteWriteCtx) MustStop() {
|
||||
|
||||
func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
|
||||
var rctx *relabelCtx
|
||||
if len(rwctx.prcs) > 0 {
|
||||
rcs := allRelabelConfigs.Load().(*relabelConfigs)
|
||||
prcs := rcs.perURL[rwctx.idx]
|
||||
if len(prcs) > 0 {
|
||||
// Make a copy of tss before applying relabeling in order to prevent
|
||||
// from affecting time series for other remoteWrite.url configs.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467 for details.
|
||||
@@ -192,7 +218,7 @@ func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
|
||||
tss = rwctx.tss
|
||||
rctx = getRelabelCtx()
|
||||
tssLen := len(tss)
|
||||
tss = rctx.applyRelabeling(tss, nil, rwctx.prcs)
|
||||
tss = rctx.applyRelabeling(tss, nil, prcs)
|
||||
rwctx.relabelMetricsDropped.Add(tssLen - len(tss))
|
||||
}
|
||||
pss := rwctx.pss
|
||||
|
||||
@@ -52,14 +52,17 @@ publish-vmalert:
|
||||
APP_NAME=vmalert $(MAKE) publish-via-docker
|
||||
|
||||
test-vmalert:
|
||||
go test -race -cover ./app/vmalert
|
||||
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
|
||||
go test -v -race -cover ./app/vmalert/datasource
|
||||
go test -v -race -cover ./app/vmalert/notifier
|
||||
go test -v -race -cover ./app/vmalert/config
|
||||
|
||||
run-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/testdata/rules0-good.rules \
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-notifier.url=http://localhost:9093 \
|
||||
-remotewrite.url=http://localhost:8428 \
|
||||
-remoteread.url=http://localhost:8428 \
|
||||
-remoteWrite.url=http://localhost:8428 \
|
||||
-remoteRead.url=http://localhost:8428 \
|
||||
-evaluationInterval=3s
|
||||
|
||||
vmalert-amd64:
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
## VM Alert
|
||||
## vmalert
|
||||
|
||||
`vmalert` executes a list of given MetricsQL expressions (rules) and
|
||||
sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
|
||||
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
rules against configured address.
|
||||
|
||||
### Features:
|
||||
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
|
||||
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
expressions validation;
|
||||
support and expressions validation;
|
||||
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
|
||||
support;
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
|
||||
* Lightweight without extra dependencies.
|
||||
|
||||
### TODO:
|
||||
* Configuration hot reload.
|
||||
|
||||
### QuickStart
|
||||
|
||||
To build `vmalert` from sources:
|
||||
@@ -26,9 +24,9 @@ make vmalert
|
||||
The build binary will be placed to `VictoriaMetrics/bin` folder.
|
||||
|
||||
To start using `vmalert` you will need the following things:
|
||||
* list of alert rules - PromQL/MetricsQL expressions to execute;
|
||||
* list of rules - PromQL/MetricsQL expressions to execute;
|
||||
* datasource address - reachable VictoriaMetrics instance for rules execution;
|
||||
* notifier address - reachable Alertmanager instance for processing,
|
||||
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
aggregating alerts and sending notifications.
|
||||
|
||||
Then configure `vmalert` accordingly:
|
||||
@@ -38,23 +36,28 @@ Then configure `vmalert` accordingly:
|
||||
-notifier.url=http://localhost:9093
|
||||
```
|
||||
|
||||
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata/rules0-good.rules)
|
||||
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata).
|
||||
|
||||
`vmalert` may be configured with `-remoteWrite` flag to write recording rules and
|
||||
alerts state in form of timeseries via remote write protocol. Alerts state will be written
|
||||
as `ALERTS` timeseries. These timeseries may be used to recover alerts state on `vmalert`
|
||||
restarts if `-remoteRead` is configured.
|
||||
|
||||
`vmalert` runs evaluation for every group in a separate goroutine.
|
||||
Rules in group evaluated one-by-one sequentially.
|
||||
|
||||
**Important:** while recording rules execution is sequential, writing of timeseries results to remote
|
||||
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
|
||||
recording rule is reused in next one.
|
||||
|
||||
`vmalert` also runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
|
||||
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
|
||||
Used as alert source in AlertManager.
|
||||
* `http://<vmalert-addr>/metrics` - application metrics.
|
||||
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
|
||||
|
||||
`vmalert` may be configured with `-remotewrite` flag to write alerts state in form of timeseries
|
||||
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
|
||||
may be used to recover alerts state on `vmalert` restarts if `-remoteread` is configured.
|
||||
|
||||
|
||||
### Configuration
|
||||
|
||||
The shortlist of configuration flags is the following:
|
||||
@@ -66,35 +69,35 @@ Usage of vmalert:
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.url string
|
||||
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules. Default 1m (default 1m0s)
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
Address to listen for http connections (default ":8880")
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-notifier.url string
|
||||
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
|
||||
-remoteread.basicAuth.password string
|
||||
Optional basic auth password for -remoteread.url
|
||||
-remoteread.basicAuth.username string
|
||||
Optional basic auth username for -remoteread.url
|
||||
-remoteread.lookback duration
|
||||
-remoteRead.basicAuth.password string
|
||||
Optional basic auth password for -remoteRead.url
|
||||
-remoteRead.basicAuth.username string
|
||||
Optional basic auth username for -remoteRead.url
|
||||
-remoteRead.lookback duration
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
-remoteread.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remotewrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remotewrite.basicAuth.password string
|
||||
Optional basic auth password for -remotewrite.url
|
||||
-remotewrite.basicAuth.username string
|
||||
Optional basic auth username for -remotewrite.url
|
||||
-remotewrite.url string
|
||||
-remoteRead.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remoteWrite.basicAuth.password string
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.username string
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of readers that concurrently write into remote storage (default 1)
|
||||
-remoteWrite.maxBatchSize int
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
-remoteWrite.maxQueueSize int
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
-remoteWrite.url string
|
||||
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
|
||||
-rule value
|
||||
Path to the file with alert rules.
|
||||
@@ -118,3 +121,22 @@ or send GET request to `/-/reload` endpoint.
|
||||
`vmalert` is mostly designed and built by VictoriaMetrics community.
|
||||
Feel free to share your experience and ideas for improving this
|
||||
software. Please keep simplicity as the main priority.
|
||||
|
||||
### How to build from sources
|
||||
|
||||
It is recommended using
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
- `vmalert` is located in `vmutils-*` archives there.
|
||||
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmalert` from the root folder of the repository.
|
||||
It builds `vmalert` binary and puts it into the `bin` folder.
|
||||
|
||||
#### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmalert-prod` from the root folder of the repository.
|
||||
It builds `vmalert-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
376
app/vmalert/alerting.go
Normal file
376
app/vmalert/alerting.go
Normal file
@@ -0,0 +1,376 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// AlertingRule is basic alert entity
|
||||
type AlertingRule struct {
|
||||
Name string
|
||||
Expr string
|
||||
For time.Duration
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
GroupID uint64
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
func newAlertingRule(gID uint64, cfg config.Rule) *AlertingRule {
|
||||
return &AlertingRule{
|
||||
Name: cfg.Alert,
|
||||
Expr: cfg.Expr,
|
||||
For: cfg.For,
|
||||
Labels: cfg.Labels,
|
||||
Annotations: cfg.Annotations,
|
||||
GroupID: gID,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
}
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
func (ar *AlertingRule) String() string {
|
||||
return ar.Name
|
||||
}
|
||||
|
||||
// ID returns unique Rule ID
|
||||
// within the parent Group.
|
||||
func (ar *AlertingRule) ID() uint64 {
|
||||
hash := fnv.New64a()
|
||||
hash.Write([]byte("alerting"))
|
||||
hash.Write([]byte("\xff"))
|
||||
hash.Write([]byte(ar.Name))
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
// Exec executes AlertingRule expression via the given Querier.
|
||||
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
||||
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := q.Query(ctx, ar.Expr)
|
||||
ar.mu.Lock()
|
||||
defer ar.mu.Unlock()
|
||||
|
||||
ar.lastExecError = err
|
||||
ar.lastExecTime = time.Now()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %s", ar.Expr, err)
|
||||
}
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
if a.State == notifier.StateInactive {
|
||||
delete(ar.alerts, h)
|
||||
}
|
||||
}
|
||||
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
h := hash(m)
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
if a.Value != m.Value {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Value
|
||||
// and re-exec template since Value can be used
|
||||
// in templates
|
||||
err = ar.template(a)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := ar.newAlert(m, ar.lastExecTime)
|
||||
if err != nil {
|
||||
ar.lastExecError = err
|
||||
return nil, fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
ar.alerts[h] = a
|
||||
}
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// if alert wasn't updated in this iteration
|
||||
// means it is resolved already
|
||||
if _, ok := updated[h]; !ok {
|
||||
if a.State == notifier.StatePending {
|
||||
// alert was in Pending state - it is not
|
||||
// active anymore
|
||||
delete(ar.alerts, h)
|
||||
continue
|
||||
}
|
||||
a.State = notifier.StateInactive
|
||||
continue
|
||||
}
|
||||
if a.State == notifier.StatePending && time.Since(a.Start) >= ar.For {
|
||||
a.State = notifier.StateFiring
|
||||
alertsFired.Inc()
|
||||
}
|
||||
}
|
||||
if series {
|
||||
return ar.toTimeSeries(ar.lastExecTime), nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
ts := ar.alertToTimeSeries(a, timestamp)
|
||||
tss = append(tss, ts...)
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (ar *AlertingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
||||
}
|
||||
ar.Expr = nr.Expr
|
||||
ar.For = nr.For
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(m datasource.Metric) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := m.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifier.Alert, error) {
|
||||
a := ¬ifier.Alert{
|
||||
GroupID: ar.GroupID,
|
||||
Name: ar.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Value,
|
||||
Start: start,
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
a.Labels[l.Name] = l.Value
|
||||
}
|
||||
return a, ar.template(a)
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) template(a *notifier.Alert) error {
|
||||
// 1. template rule labels with data labels
|
||||
rLabels, err := a.ExecTemplate(ar.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 2. merge data labels and rule labels
|
||||
// metric labels may be overridden by
|
||||
// rule labels
|
||||
for k, v := range rLabels {
|
||||
a.Labels[k] = v
|
||||
}
|
||||
|
||||
// 3. template merged labels
|
||||
a.Labels, err = a.ExecTemplate(a.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
a.Annotations, err = a.ExecTemplate(ar.Annotations)
|
||||
return err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIAlertingRule
|
||||
func (ar *AlertingRule) RuleAPI() APIAlertingRule {
|
||||
var lastErr string
|
||||
if ar.lastExecError != nil {
|
||||
lastErr = ar.lastExecError.Error()
|
||||
}
|
||||
return APIAlertingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
Name: ar.Name,
|
||||
Expression: ar.Expr,
|
||||
For: ar.For.String(),
|
||||
LastError: lastErr,
|
||||
LastExec: ar.lastExecTime,
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
}
|
||||
}
|
||||
|
||||
// AlertsAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.mu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.mu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
return &APIAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: ar.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.Start,
|
||||
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
// AlertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
// AlertForStateMetricName is the metric name for 'for' state of alert.
|
||||
alertForStateMetricName = "ALERTS_FOR_STATE"
|
||||
|
||||
// AlertNameLabel is the label name indicating the name of an alert.
|
||||
alertNameLabel = "alertname"
|
||||
// AlertStateLabel is the label name indicating the state of an alert.
|
||||
alertStateLabel = "alertstate"
|
||||
)
|
||||
|
||||
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(ar.Name, a, timestamp))
|
||||
if ar.For > 0 {
|
||||
tss = append(tss, alertForToTimeSeries(ar.Name, a, timestamp))
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertMetricName
|
||||
labels[alertNameLabel] = name
|
||||
labels[alertStateLabel] = a.State.String()
|
||||
return newTimeSeries(1, labels, timestamp)
|
||||
}
|
||||
|
||||
// alertForToTimeSeries returns a timeseries that represents
|
||||
// state of active alerts, where value is time when alert become active
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
labels[alertNameLabel] = name
|
||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||
// to be updated on next Exec, as well as Value field.
|
||||
// Only rules with For > 0 will be restored.
|
||||
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
if q == nil {
|
||||
return fmt.Errorf("querier is nil")
|
||||
}
|
||||
// Get the last datapoint in range via MetricsQL `last_over_time`.
|
||||
// We don't use plain PromQL since Prometheus doesn't support
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
|
||||
alertForStateMetricName, ar.Name, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
labels := m.Labels
|
||||
m.Labels = make([]datasource.Label, 0)
|
||||
// drop all extra labels, so hash key will
|
||||
// be identical to timeseries received in Exec
|
||||
for _, l := range labels {
|
||||
if l.Name == alertNameLabel {
|
||||
continue
|
||||
}
|
||||
// drop all overridden labels
|
||||
if _, ok := ar.Labels[l.Name]; ok {
|
||||
continue
|
||||
}
|
||||
m.Labels = append(m.Labels, l)
|
||||
}
|
||||
|
||||
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
a.ID = hash(m)
|
||||
a.State = notifier.StatePending
|
||||
ar.alerts[a.ID] = a
|
||||
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -2,7 +2,6 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -11,30 +10,15 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestRule_Validate(t *testing.T) {
|
||||
if err := (&Rule{}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty name error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert"}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty expr error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert", Expr: "test{"}).Validate(); err == nil {
|
||||
t.Errorf("exptected invalid expr error")
|
||||
}
|
||||
if err := (&Rule{Name: "alert", Expr: "test>0"}).Validate(); err != nil {
|
||||
t.Errorf("exptected valid rule got %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
alert *notifier.Alert
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
newTestRule("instant", 0),
|
||||
newTestAlertingRule("instant", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -45,7 +29,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("instant extra labels", 0),
|
||||
newTestAlertingRule("instant extra labels", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
||||
"job": "foo",
|
||||
"instance": "bar",
|
||||
@@ -61,7 +45,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("instant labels override", 0),
|
||||
newTestAlertingRule("instant labels override", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
||||
alertStateLabel: "foo",
|
||||
"__name__": "bar",
|
||||
@@ -75,7 +59,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for", time.Second),
|
||||
newTestAlertingRule("for", time.Second),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -90,7 +74,7 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for pending", 10*time.Second),
|
||||
newTestAlertingRule("for pending", 10*time.Second),
|
||||
¬ifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
@@ -107,58 +91,28 @@ func TestRule_AlertToTimeSeries(t *testing.T) {
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
tss := tc.rule.AlertToTimeSeries(tc.alert, timestamp)
|
||||
if len(tc.expTS) != len(tss) {
|
||||
t.Fatalf("expected number of timeseries %d; got %d", len(tc.expTS), len(tss))
|
||||
}
|
||||
for i := range tc.expTS {
|
||||
expTS, gotTS := tc.expTS[i], tss[i]
|
||||
if len(expTS.Samples) != len(gotTS.Samples) {
|
||||
t.Fatalf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
|
||||
}
|
||||
for i, exp := range expTS.Samples {
|
||||
got := gotTS.Samples[i]
|
||||
if got.Value != exp.Value {
|
||||
t.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
|
||||
}
|
||||
if got.Timestamp != exp.Timestamp {
|
||||
t.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
|
||||
}
|
||||
}
|
||||
if len(expTS.Labels) != len(gotTS.Labels) {
|
||||
t.Fatalf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
|
||||
}
|
||||
for i, exp := range expTS.Labels {
|
||||
got := gotTS.Labels[i]
|
||||
if got.Name != exp.Name {
|
||||
t.Errorf("expected label name %q; got %q", exp.Name, got.Name)
|
||||
}
|
||||
if got.Value != exp.Value {
|
||||
t.Errorf("expected label value %q; got %q", exp.Value, got.Value)
|
||||
}
|
||||
}
|
||||
tc.rule.alerts[tc.alert.ID] = tc.alert
|
||||
tss := tc.rule.toTimeSeries(timestamp)
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func newTestRule(name string, waitFor time.Duration) *Rule {
|
||||
return &Rule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
|
||||
}
|
||||
|
||||
func TestRule_Exec(t *testing.T) {
|
||||
func TestAlertingRule_Exec(t *testing.T) {
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
steps [][]datasource.Metric
|
||||
expAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
{
|
||||
newTestRule("empty", 0),
|
||||
newTestAlertingRule("empty", 0),
|
||||
[][]datasource.Metric{},
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestRule("empty labels", 0),
|
||||
newTestAlertingRule("empty labels", 0),
|
||||
[][]datasource.Metric{
|
||||
{datasource.Metric{}},
|
||||
},
|
||||
@@ -167,7 +121,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing", 0),
|
||||
newTestAlertingRule("single-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
@@ -176,7 +130,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive", 0),
|
||||
newTestAlertingRule("single-firing=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -186,7 +140,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -197,7 +151,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -209,7 +163,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -220,7 +174,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -234,7 +188,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("multiple-firing", 0),
|
||||
newTestAlertingRule("multiple-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{
|
||||
metricWithLabels(t, "name", "foo"),
|
||||
@@ -249,7 +203,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("multiple-steps-firing", 0),
|
||||
newTestAlertingRule("multiple-steps-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo1")},
|
||||
@@ -264,7 +218,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("duplicate", 0),
|
||||
newTestAlertingRule("duplicate", 0),
|
||||
[][]datasource.Metric{
|
||||
{
|
||||
// metrics with the same labelset should result in one alert
|
||||
@@ -277,7 +231,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending", time.Minute),
|
||||
newTestAlertingRule("for-pending", time.Minute),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
@@ -286,7 +240,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-fired", time.Millisecond),
|
||||
newTestAlertingRule("for-fired", time.Millisecond),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -296,7 +250,17 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>inactive", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>empty", time.Second),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset and delete pending alerts
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending=>firing=>inactive", time.Millisecond),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -308,22 +272,10 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>firing=>inactive=>pending", time.Millisecond),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset pending alerts
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive=>pending", time.Millisecond),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
//{metricWithLabels(t, "name", "foo")},
|
||||
//{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset pending alerts
|
||||
{},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -333,7 +285,7 @@ func TestRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRule("for-pending=>firing=>inactive=>pending=>firing", time.Millisecond),
|
||||
newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", time.Millisecond),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
@@ -351,11 +303,11 @@ func TestRule_Exec(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.group = fakeGroup
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
for _, step := range tc.steps {
|
||||
fq.reset()
|
||||
fq.add(step...)
|
||||
if err := tc.rule.Exec(context.TODO(), fq); err != nil {
|
||||
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
// artificial delay between applying steps
|
||||
@@ -377,49 +329,9 @@ func TestRule_Exec(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
type fakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []datasource.Metric
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) reset() {
|
||||
fq.Lock()
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
|
||||
fq.Lock()
|
||||
cpy := make([]datasource.Metric, len(fq.metrics))
|
||||
copy(cpy, fq.metrics)
|
||||
fq.Unlock()
|
||||
return cpy, nil
|
||||
}
|
||||
|
||||
func TestRule_Restore(t *testing.T) {
|
||||
func TestAlertingRule_Restore(t *testing.T) {
|
||||
testCases := []struct {
|
||||
rule *Rule
|
||||
rule *AlertingRule
|
||||
metrics []datasource.Metric
|
||||
expAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
@@ -504,7 +416,7 @@ func TestRule_Restore(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.group = fakeGroup
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
fq.add(tc.metrics...)
|
||||
if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
@@ -528,8 +440,8 @@ func TestRule_Restore(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func newTestRuleWithLabels(name string, labels ...string) *Rule {
|
||||
r := newTestRule(name, 0)
|
||||
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
|
||||
r := newTestAlertingRule(name, 0)
|
||||
r.Labels = make(map[string]string)
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
r.Labels[labels[i]] = labels[i+1]
|
||||
@@ -537,9 +449,6 @@ func newTestRuleWithLabels(name string, labels ...string) *Rule {
|
||||
return r
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Value = value
|
||||
return m
|
||||
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
||||
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
|
||||
}
|
||||
@@ -1,73 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gopkg.in/yaml.v2"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
// Parse parses rule configs from given file patterns
|
||||
func Parse(pathPatterns []string, validateAnnotations bool) ([]Group, error) {
|
||||
var fp []string
|
||||
for _, pattern := range pathPatterns {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading file patther %s:%v", pattern, err)
|
||||
}
|
||||
fp = append(fp, matches...)
|
||||
}
|
||||
var groups []Group
|
||||
for _, file := range fp {
|
||||
groupsNames := map[string]struct{}{}
|
||||
gr, err := parseFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("file %s: %w", file, err)
|
||||
}
|
||||
for _, g := range gr {
|
||||
if _, ok := groupsNames[g.Name]; ok {
|
||||
return nil, fmt.Errorf("one file can not contain groups with the same name %s, filepath:%s", g.Name, file)
|
||||
}
|
||||
g.File = file
|
||||
g.done = make(chan struct{})
|
||||
g.finished = make(chan struct{})
|
||||
|
||||
groupsNames[g.Name] = struct{}{}
|
||||
for _, rule := range g.Rules {
|
||||
if err = rule.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("invalid rule filepath: %s, group %s: %w", file, g.Name, err)
|
||||
}
|
||||
if validateAnnotations {
|
||||
if err = notifier.ValidateTemplates(rule.Annotations); err != nil {
|
||||
return nil, fmt.Errorf("invalid annotations filepath: %s, group %s: %w", file, g.Name, err)
|
||||
}
|
||||
if err = notifier.ValidateTemplates(rule.Labels); err != nil {
|
||||
return nil, fmt.Errorf("invalid labels filepath: %s, group %s: %w", file, g.Name, err)
|
||||
}
|
||||
}
|
||||
rule.group = g
|
||||
rule.alerts = make(map[uint64]*notifier.Alert)
|
||||
}
|
||||
groups = append(groups, g)
|
||||
}
|
||||
}
|
||||
if len(groups) < 1 {
|
||||
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
|
||||
}
|
||||
return groups, nil
|
||||
}
|
||||
|
||||
func parseFile(path string) ([]Group, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading alert rule file: %w", err)
|
||||
}
|
||||
g := struct {
|
||||
Groups []Group `yaml:"groups"`
|
||||
}{}
|
||||
err = yaml.Unmarshal(data, &g)
|
||||
return g.Groups, err
|
||||
}
|
||||
147
app/vmalert/config/config.go
Normal file
147
app/vmalert/config/config.go
Normal file
@@ -0,0 +1,147 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gopkg.in/yaml.v2"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// Group contains list of Rules grouped into
|
||||
// entity with one name and evaluation interval
|
||||
type Group struct {
|
||||
File string
|
||||
Name string `yaml:"name"`
|
||||
Interval time.Duration `yaml:"interval,omitempty"`
|
||||
Rules []Rule `yaml:"rules"`
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// Validate check for internal Group or Rule configuration errors
|
||||
func (g *Group) Validate(validateAnnotations bool) error {
|
||||
if g.Name == "" {
|
||||
return fmt.Errorf("group name must be set")
|
||||
}
|
||||
if len(g.Rules) == 0 {
|
||||
return fmt.Errorf("group %q can't contain no rules", g.Name)
|
||||
}
|
||||
uniqueRules := map[string]struct{}{}
|
||||
for _, r := range g.Rules {
|
||||
ruleName := r.Record
|
||||
if r.Alert != "" {
|
||||
ruleName = r.Alert
|
||||
}
|
||||
if _, ok := uniqueRules[ruleName]; ok {
|
||||
return fmt.Errorf("rule name %q duplicate", ruleName)
|
||||
}
|
||||
uniqueRules[ruleName] = struct{}{}
|
||||
if err := r.Validate(); err != nil {
|
||||
return fmt.Errorf("invalid rule %q.%q: %s", g.Name, ruleName, err)
|
||||
}
|
||||
if !validateAnnotations {
|
||||
continue
|
||||
}
|
||||
if err := notifier.ValidateTemplates(r.Annotations); err != nil {
|
||||
return fmt.Errorf("invalid annotations for rule %q.%q: %s", g.Name, ruleName, err)
|
||||
}
|
||||
if err := notifier.ValidateTemplates(r.Labels); err != nil {
|
||||
return fmt.Errorf("invalid labels for rule %q.%q: %s", g.Name, ruleName, err)
|
||||
}
|
||||
}
|
||||
return checkOverflow(g.XXX, fmt.Sprintf("group %q", g.Name))
|
||||
}
|
||||
|
||||
// Rule describes entity that represent either
|
||||
// recording rule or alerting rule.
|
||||
type Rule struct {
|
||||
Record string `yaml:"record,omitempty"`
|
||||
Alert string `yaml:"alert,omitempty"`
|
||||
Expr string `yaml:"expr"`
|
||||
For time.Duration `yaml:"for,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
}
|
||||
|
||||
// Validate check for Rule configuration errors
|
||||
func (r *Rule) Validate() error {
|
||||
if (r.Record == "" && r.Alert == "") || (r.Record != "" && r.Alert != "") {
|
||||
return fmt.Errorf("either `record` or `alert` must be set")
|
||||
}
|
||||
if r.Expr == "" {
|
||||
return fmt.Errorf("expression can't be empty")
|
||||
}
|
||||
if _, err := metricsql.Parse(r.Expr); err != nil {
|
||||
return fmt.Errorf("invalid expression: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Parse parses rule configs from given file patterns
|
||||
func Parse(pathPatterns []string, validateAnnotations bool) ([]Group, error) {
|
||||
var fp []string
|
||||
for _, pattern := range pathPatterns {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading file pattern %s: %v", pattern, err)
|
||||
}
|
||||
fp = append(fp, matches...)
|
||||
}
|
||||
var groups []Group
|
||||
for _, file := range fp {
|
||||
uniqueGroups := map[string]struct{}{}
|
||||
gr, err := parseFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse file %q: %w", file, err)
|
||||
}
|
||||
for _, g := range gr {
|
||||
if err := g.Validate(validateAnnotations); err != nil {
|
||||
return nil, fmt.Errorf("invalid group %q in file %q: %s", g.Name, file, err)
|
||||
}
|
||||
if _, ok := uniqueGroups[g.Name]; ok {
|
||||
return nil, fmt.Errorf("group name %q duplicate in file %q", g.Name, file)
|
||||
}
|
||||
uniqueGroups[g.Name] = struct{}{}
|
||||
g.File = file
|
||||
groups = append(groups, g)
|
||||
}
|
||||
}
|
||||
if len(groups) < 1 {
|
||||
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
|
||||
}
|
||||
return groups, nil
|
||||
}
|
||||
|
||||
func parseFile(path string) ([]Group, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading alert rule file: %w", err)
|
||||
}
|
||||
g := struct {
|
||||
Groups []Group `yaml:"groups"`
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}{}
|
||||
err = yaml.Unmarshal(data, &g)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return g.Groups, checkOverflow(g.XXX, "config")
|
||||
}
|
||||
|
||||
func checkOverflow(m map[string]interface{}, ctx string) error {
|
||||
if len(m) > 0 {
|
||||
var keys []string
|
||||
for k := range m {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
return fmt.Errorf("unknown fields in %s: %s", ctx, strings.Join(keys, ", "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
83
app/vmalert/config/config_test.go
Normal file
83
app/vmalert/config/config_test.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestParseGood(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true); err != nil {
|
||||
t.Errorf("error parsing files %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBad(t *testing.T) {
|
||||
testCases := []struct {
|
||||
path []string
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
[]string{"testdata/rules0-bad.rules"},
|
||||
"unexpected token",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules0-bad.rules"},
|
||||
"error parsing annotation",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules1-bad.rules"},
|
||||
"duplicate in file",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules2-bad.rules"},
|
||||
"function \"value\" not defined",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules3-bad.rules"},
|
||||
"either `record` or `alert` must be set",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules4-bad.rules"},
|
||||
"either `record` or `alert` must be set",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/*.yaml"},
|
||||
"no groups found",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
_, err := Parse(tc.path, true)
|
||||
if err == nil {
|
||||
t.Errorf("expected to get error")
|
||||
return
|
||||
}
|
||||
if !strings.Contains(err.Error(), tc.expErr) {
|
||||
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRule_Validate(t *testing.T) {
|
||||
if err := (&Rule{}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty name error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert"}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty expr error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert", Expr: "test{"}).Validate(); err == nil {
|
||||
t.Errorf("exptected invalid expr error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert", Expr: "test>0"}).Validate(); err != nil {
|
||||
t.Errorf("exptected valid rule got %s", err)
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ groups:
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
expr: "{{ $expr|queryEscape }}"
|
||||
annotations:
|
||||
summary: "{{ $value|humanize }}"
|
||||
description: "{{$labels}}"
|
||||
@@ -9,5 +9,3 @@ groups:
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
|
||||
|
||||
5
app/vmalert/config/testdata/dir/rules3-bad.rules
vendored
Normal file
5
app/vmalert/config/testdata/dir/rules3-bad.rules
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- for: 5m
|
||||
expr: vm_rows > 0
|
||||
7
app/vmalert/config/testdata/dir/rules4-bad.rules
vendored
Normal file
7
app/vmalert/config/testdata/dir/rules4-bad.rules
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: rows
|
||||
record: record
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
7
app/vmalert/config/testdata/dir/rules5-bad.rules
vendored
Normal file
7
app/vmalert/config/testdata/dir/rules5-bad.rules
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: rows
|
||||
expr: vm_rows > 0
|
||||
- record: rows
|
||||
expr: sum(vm_rows)
|
||||
28
app/vmalert/config/testdata/rules2-good.rules
vendored
Normal file
28
app/vmalert/config/testdata/rules2-good.rules
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
groups:
|
||||
- name: TestGroup
|
||||
interval: 2s
|
||||
rules:
|
||||
- alert: Conns
|
||||
expr: sum(vm_tcplistener_conns) by(instance) > 1
|
||||
for: 3m
|
||||
annotations:
|
||||
summary: "Too high connection number for {{$labels.instance}}"
|
||||
description: "It is {{ $value }} connections for {{$labels.instance}}"
|
||||
- alert: ExampleAlertAlwaysFiring
|
||||
expr: sum by(job)
|
||||
(up == 1)
|
||||
- record: handler:requests:rate5m
|
||||
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
|
||||
labels:
|
||||
recording: true
|
||||
- record: code:requests:rate5m
|
||||
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
|
||||
labels:
|
||||
recording: true
|
||||
- record: successful_requests:ratio_rate5m
|
||||
labels:
|
||||
recording: true
|
||||
expr: |2
|
||||
sum(code:requests:rate5m{code="200"})
|
||||
/
|
||||
sum(code:requests:rate5m)
|
||||
@@ -1,39 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestParseGood(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true); err != nil {
|
||||
t.Errorf("error parsing files %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBad(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/rules0-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected syntaxt error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules0-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected template annotation error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules1-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected same group error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/dir/rules2-bad.rules"}, true); err == nil {
|
||||
t.Errorf("expected template label error")
|
||||
}
|
||||
if _, err := Parse([]string{"testdata/*.yaml"}, true); err == nil {
|
||||
t.Errorf("expected empty group")
|
||||
}
|
||||
}
|
||||
@@ -4,8 +4,10 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
@@ -15,17 +17,49 @@ import (
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
type Group struct {
|
||||
Name string
|
||||
File string
|
||||
Rules []*Rule
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Interval time.Duration
|
||||
|
||||
done chan struct{}
|
||||
finished chan struct{}
|
||||
doneCh chan struct{}
|
||||
finishedCh chan struct{}
|
||||
// channel accepts new Group obj
|
||||
// which supposed to update current group
|
||||
updateCh chan *Group
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
|
||||
g := &Group{
|
||||
Name: cfg.Name,
|
||||
File: cfg.File,
|
||||
Interval: cfg.Interval,
|
||||
doneCh: make(chan struct{}),
|
||||
finishedCh: make(chan struct{}),
|
||||
updateCh: make(chan *Group),
|
||||
}
|
||||
if g.Interval == 0 {
|
||||
g.Interval = defaultInterval
|
||||
}
|
||||
rules := make([]Rule, len(cfg.Rules))
|
||||
for i, r := range cfg.Rules {
|
||||
rules[i] = g.newRule(r)
|
||||
}
|
||||
g.Rules = rules
|
||||
return g
|
||||
}
|
||||
|
||||
func (g *Group) newRule(rule config.Rule) Rule {
|
||||
if rule.Alert != "" {
|
||||
return newAlertingRule(g.ID(), rule)
|
||||
}
|
||||
return newRecordingRule(g.ID(), rule)
|
||||
}
|
||||
|
||||
// ID return unique group ID that consists of
|
||||
// rules file and group name
|
||||
func (g Group) ID() uint64 {
|
||||
func (g *Group) ID() uint64 {
|
||||
hash := fnv.New64a()
|
||||
hash.Write([]byte(g.File))
|
||||
hash.Write([]byte("\xff"))
|
||||
@@ -33,50 +67,62 @@ func (g Group) ID() uint64 {
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
// Restore restores alerts state for all group rules with For > 0
|
||||
// Restore restores alerts state for group rules
|
||||
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
for _, rule := range g.Rules {
|
||||
if rule.For == 0 {
|
||||
return nil
|
||||
rr, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if err := rule.Restore(ctx, q, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %s", rule.Name, err)
|
||||
if rr.For < 1 {
|
||||
continue
|
||||
}
|
||||
if err := rr.Restore(ctx, q, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %s", rule, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateWith updates existing group with
|
||||
// passed group object.
|
||||
func (g *Group) updateWith(newGroup Group) {
|
||||
rulesRegistry := make(map[string]*Rule)
|
||||
// passed group object. This function ignores group
|
||||
// evaluation interval change. It supposed to be updated
|
||||
// in group.start function.
|
||||
// Not thread-safe.
|
||||
func (g *Group) updateWith(newGroup *Group) error {
|
||||
rulesRegistry := make(map[uint64]Rule)
|
||||
for _, nr := range newGroup.Rules {
|
||||
rulesRegistry[nr.id()] = nr
|
||||
rulesRegistry[nr.ID()] = nr
|
||||
}
|
||||
|
||||
for i, or := range g.Rules {
|
||||
nr, ok := rulesRegistry[or.id()]
|
||||
nr, ok := rulesRegistry[or.ID()]
|
||||
if !ok {
|
||||
// old rule is not present in the new list
|
||||
// and must be removed
|
||||
or = nil
|
||||
g.Rules = append(g.Rules[:i], g.Rules[i+1:]...)
|
||||
// so we mark it for removing
|
||||
g.Rules[i] = nil
|
||||
continue
|
||||
}
|
||||
|
||||
// copy all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Evals
|
||||
or.For = nr.For
|
||||
or.Expr = nr.Expr
|
||||
or.Labels = nr.Labels
|
||||
or.Annotations = nr.Annotations
|
||||
delete(rulesRegistry, nr.id())
|
||||
if err := or.UpdateWith(nr); err != nil {
|
||||
return err
|
||||
}
|
||||
delete(rulesRegistry, nr.ID())
|
||||
}
|
||||
|
||||
var newRules []Rule
|
||||
for _, r := range g.Rules {
|
||||
if r == nil {
|
||||
// skip nil rules
|
||||
continue
|
||||
}
|
||||
newRules = append(newRules, r)
|
||||
}
|
||||
// add the rest of rules from registry
|
||||
for _, nr := range rulesRegistry {
|
||||
g.Rules = append(g.Rules, nr)
|
||||
newRules = append(newRules, nr)
|
||||
}
|
||||
g.Rules = newRules
|
||||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
@@ -96,28 +142,48 @@ var (
|
||||
)
|
||||
|
||||
func (g *Group) close() {
|
||||
if g.done == nil {
|
||||
if g.doneCh == nil {
|
||||
return
|
||||
}
|
||||
close(g.done)
|
||||
<-g.finished
|
||||
close(g.doneCh)
|
||||
<-g.finishedCh
|
||||
}
|
||||
|
||||
func (g *Group) start(ctx context.Context, interval time.Duration,
|
||||
querier datasource.Querier, nr notifier.Notifier, rw *remotewrite.Client) {
|
||||
logger.Infof("group %q started", g.Name)
|
||||
t := time.NewTicker(interval)
|
||||
func (g *Group) start(ctx context.Context, querier datasource.Querier, nr notifier.Notifier, rw *remotewrite.Client) {
|
||||
logger.Infof("group %q started with interval %v", g.Name, g.Interval)
|
||||
|
||||
var returnSeries bool
|
||||
if rw != nil {
|
||||
returnSeries = true
|
||||
}
|
||||
|
||||
t := time.NewTicker(g.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Infof("group %q: context cancelled", g.Name)
|
||||
close(g.finished)
|
||||
close(g.finishedCh)
|
||||
return
|
||||
case <-g.done:
|
||||
case <-g.doneCh:
|
||||
logger.Infof("group %q: received stop signal", g.Name)
|
||||
close(g.finished)
|
||||
close(g.finishedCh)
|
||||
return
|
||||
case ng := <-g.updateCh:
|
||||
g.mu.Lock()
|
||||
err := g.updateWith(ng)
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: failed to update: %s", g.Name, err)
|
||||
g.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
if g.Interval != ng.Interval {
|
||||
g.Interval = ng.Interval
|
||||
t.Stop()
|
||||
t = time.NewTicker(g.Interval)
|
||||
logger.Infof("group %q: changed evaluation interval to %v", g.Name, g.Interval)
|
||||
}
|
||||
g.mu.Unlock()
|
||||
case <-t.C:
|
||||
iterationTotal.Inc()
|
||||
iterationStart := time.Now()
|
||||
@@ -125,39 +191,53 @@ func (g *Group) start(ctx context.Context, interval time.Duration,
|
||||
execTotal.Inc()
|
||||
|
||||
execStart := time.Now()
|
||||
err := rule.Exec(ctx, querier)
|
||||
tss, err := rule.Exec(ctx, querier, returnSeries)
|
||||
execDuration.UpdateDuration(execStart)
|
||||
|
||||
if err != nil {
|
||||
execErrors.Inc()
|
||||
logger.Errorf("failed to execute rule %q.%q: %s", g.Name, rule.Name, err)
|
||||
logger.Errorf("failed to execute rule %q.%q: %s", g.Name, rule, err)
|
||||
continue
|
||||
}
|
||||
|
||||
var alertsToSend []notifier.Alert
|
||||
for _, a := range rule.alerts {
|
||||
if a.State != notifier.StatePending {
|
||||
alertsToSend = append(alertsToSend, *a)
|
||||
}
|
||||
if a.State == notifier.StateInactive || rw == nil {
|
||||
continue
|
||||
}
|
||||
tss := rule.AlertToTimeSeries(a, execStart)
|
||||
if len(tss) > 0 {
|
||||
remoteWriteSent.Add(len(tss))
|
||||
for _, ts := range tss {
|
||||
remoteWriteSent.Inc()
|
||||
if err := rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
|
||||
logger.Errorf("failed to remote write for rule %q.%q: %s", g.Name, rule, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(alertsToSend) > 0 {
|
||||
alertsSent.Add(len(alertsToSend))
|
||||
if err := nr.Send(ctx, alertsToSend); err != nil {
|
||||
alertsSendErrors.Inc()
|
||||
logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
|
||||
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
var alerts []notifier.Alert
|
||||
for _, a := range ar.alerts {
|
||||
switch a.State {
|
||||
case notifier.StateFiring:
|
||||
// set End to execStart + 3 intervals
|
||||
// so notifier can resolve it automatically if `vmalert`
|
||||
// won't be able to send resolve for some reason
|
||||
a.End = execStart.Add(3 * g.Interval)
|
||||
alerts = append(alerts, *a)
|
||||
case notifier.StateInactive:
|
||||
// set End to execStart to notify
|
||||
// that it was just resolved
|
||||
a.End = execStart
|
||||
alerts = append(alerts, *a)
|
||||
}
|
||||
}
|
||||
if len(alerts) < 1 {
|
||||
continue
|
||||
}
|
||||
alertsSent.Add(len(alerts))
|
||||
if err := nr.Send(ctx, alerts); err != nil {
|
||||
alertsSendErrors.Inc()
|
||||
logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule, err)
|
||||
}
|
||||
}
|
||||
iterationDuration.UpdateDuration(iterationStart)
|
||||
}
|
||||
|
||||
@@ -4,27 +4,28 @@ import (
|
||||
"context"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestUpdateWith(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
currentRules []*Rule
|
||||
newRules []*Rule
|
||||
currentRules []Rule
|
||||
// rules must be sorted by ID
|
||||
newRules []Rule
|
||||
}{
|
||||
{
|
||||
"new rule",
|
||||
[]*Rule{},
|
||||
[]*Rule{{Name: "bar"}},
|
||||
[]Rule{},
|
||||
[]Rule{&AlertingRule{Name: "bar"}},
|
||||
},
|
||||
{
|
||||
"update rule",
|
||||
[]*Rule{{
|
||||
"update alerting rule",
|
||||
[]Rule{&AlertingRule{
|
||||
Name: "foo",
|
||||
Expr: "up > 0",
|
||||
For: time.Second,
|
||||
@@ -36,8 +37,8 @@ func TestUpdateWith(t *testing.T) {
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}},
|
||||
[]*Rule{{
|
||||
Name: "bar",
|
||||
[]Rule{&AlertingRule{
|
||||
Name: "foo",
|
||||
Expr: "up > 10",
|
||||
For: time.Second,
|
||||
Labels: map[string]string{
|
||||
@@ -48,43 +49,82 @@ func TestUpdateWith(t *testing.T) {
|
||||
},
|
||||
}},
|
||||
},
|
||||
{
|
||||
"update recording rule",
|
||||
[]Rule{&RecordingRule{
|
||||
Name: "foo",
|
||||
Expr: "max(up)",
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
}},
|
||||
[]Rule{&RecordingRule{
|
||||
Name: "foo",
|
||||
Expr: "min(up)",
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
}},
|
||||
},
|
||||
{
|
||||
"empty rule",
|
||||
[]*Rule{{Name: "foo"}},
|
||||
[]*Rule{},
|
||||
[]Rule{&AlertingRule{Name: "foo"}, &RecordingRule{Name: "bar"}},
|
||||
[]Rule{},
|
||||
},
|
||||
{
|
||||
"multiple rules",
|
||||
[]*Rule{{Name: "foo"}, {Name: "bar"}, {Name: "baz"}},
|
||||
[]*Rule{{Name: "foo"}, {Name: "baz"}},
|
||||
[]Rule{
|
||||
&AlertingRule{Name: "bar"},
|
||||
&AlertingRule{Name: "baz"},
|
||||
&RecordingRule{Name: "foo"},
|
||||
},
|
||||
[]Rule{
|
||||
&AlertingRule{Name: "baz"},
|
||||
&RecordingRule{Name: "foo"},
|
||||
},
|
||||
},
|
||||
{
|
||||
"replace rule",
|
||||
[]Rule{&AlertingRule{Name: "foo1"}},
|
||||
[]Rule{&AlertingRule{Name: "foo2"}},
|
||||
},
|
||||
{
|
||||
"replace multiple rules",
|
||||
[]Rule{
|
||||
&AlertingRule{Name: "foo1"},
|
||||
&RecordingRule{Name: "foo2"},
|
||||
&AlertingRule{Name: "foo3"},
|
||||
},
|
||||
[]Rule{
|
||||
&AlertingRule{Name: "foo3"},
|
||||
&AlertingRule{Name: "foo4"},
|
||||
&RecordingRule{Name: "foo5"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
g := &Group{Rules: tc.currentRules}
|
||||
g.updateWith(Group{Rules: tc.newRules})
|
||||
err := g.updateWith(&Group{Rules: tc.newRules})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(g.Rules) != len(tc.newRules) {
|
||||
t.Fatalf("expected to have %d rules; got: %d",
|
||||
len(g.Rules), len(tc.newRules))
|
||||
}
|
||||
sort.Slice(g.Rules, func(i, j int) bool {
|
||||
return g.Rules[i].ID() < g.Rules[j].ID()
|
||||
})
|
||||
for i, r := range g.Rules {
|
||||
got, want := r, tc.newRules[i]
|
||||
if got.Name != want.Name {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.Name, got.Name)
|
||||
if got.ID() != want.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want, got)
|
||||
}
|
||||
if got.Expr != want.Expr {
|
||||
t.Fatalf("expected to have expression %q; got %q", want.Expr, got.Expr)
|
||||
}
|
||||
if got.For != want.For {
|
||||
t.Fatalf("expected to have for %q; got %q", want.For, got.For)
|
||||
}
|
||||
if !reflect.DeepEqual(got.Annotations, want.Annotations) {
|
||||
t.Fatalf("expected to have annotations %#v; got %#v", want.Annotations, got.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(got.Labels, want.Labels) {
|
||||
t.Fatalf("expected to have labels %#v; got %#v", want.Labels, got.Labels)
|
||||
if err := compareRules(t, got, want); err != nil {
|
||||
t.Fatalf("comparsion error: %s", err)
|
||||
}
|
||||
}
|
||||
})
|
||||
@@ -93,11 +133,12 @@ func TestUpdateWith(t *testing.T) {
|
||||
|
||||
func TestGroupStart(t *testing.T) {
|
||||
// TODO: make parsing from string instead of file
|
||||
groups, err := Parse([]string{"testdata/rules1-good.rules"}, true)
|
||||
groups, err := config.Parse([]string{"config/testdata/rules1-good.rules"}, true)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
g := groups[0]
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], evalInterval)
|
||||
|
||||
fn := &fakeNotifier{}
|
||||
fs := &fakeQuerier{}
|
||||
@@ -106,27 +147,26 @@ func TestGroupStart(t *testing.T) {
|
||||
m1 := metricWithLabels(t, "instance", inst1, "job", job)
|
||||
m2 := metricWithLabels(t, "instance", inst2, "job", job)
|
||||
|
||||
r := g.Rules[0]
|
||||
alert1, err := r.newAlert(m1)
|
||||
r := g.Rules[0].(*AlertingRule)
|
||||
alert1, err := r.newAlert(m1, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
alert1.State = notifier.StateFiring
|
||||
alert1.ID = hash(m1)
|
||||
|
||||
alert2, err := r.newAlert(m2)
|
||||
alert2, err := r.newAlert(m2, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
alert2.State = notifier.StateFiring
|
||||
alert2.ID = hash(m2)
|
||||
|
||||
const evalInterval = time.Millisecond
|
||||
finished := make(chan struct{})
|
||||
fs.add(m1)
|
||||
fs.add(m2)
|
||||
go func() {
|
||||
g.start(context.Background(), evalInterval, fs, fn, nil)
|
||||
g.start(context.Background(), fs, fn, nil)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
@@ -183,21 +223,3 @@ func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type fakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []notifier.Alert
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
200
app/vmalert/helpers_test.go
Normal file
200
app/vmalert/helpers_test.go
Normal file
@@ -0,0 +1,200 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
type fakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []datasource.Metric
|
||||
err error
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) setErr(err error) {
|
||||
fq.Lock()
|
||||
fq.err = err
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) reset() {
|
||||
fq.Lock()
|
||||
fq.err = nil
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
return nil, fq.err
|
||||
}
|
||||
cp := make([]datasource.Metric, len(fq.metrics))
|
||||
copy(cp, fq.metrics)
|
||||
return cp, nil
|
||||
}
|
||||
|
||||
type fakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []notifier.Alert
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Value = value
|
||||
return m
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func compareGroups(t *testing.T, a, b *Group) {
|
||||
t.Helper()
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.File != b.File {
|
||||
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
|
||||
}
|
||||
if a.Interval != b.Interval {
|
||||
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
|
||||
}
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if a.ID() != b.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
|
||||
}
|
||||
if err := compareRules(t, want, got); err != nil {
|
||||
t.Fatalf("comparsion error: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func compareRules(t *testing.T, a, b Rule) error {
|
||||
t.Helper()
|
||||
switch v := a.(type) {
|
||||
case *AlertingRule:
|
||||
br, ok := b.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("rule %q supposed to be of type AlertingRule", b.ID())
|
||||
}
|
||||
return compareAlertingRules(t, v, br)
|
||||
case *RecordingRule:
|
||||
br, ok := b.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("rule %q supposed to be of type RecordingRule", b.ID())
|
||||
}
|
||||
return compareRecordingRules(t, v, br)
|
||||
default:
|
||||
return fmt.Errorf("unexpected rule type received %T", a)
|
||||
}
|
||||
}
|
||||
|
||||
func compareRecordingRules(t *testing.T, a, b *RecordingRule) error {
|
||||
t.Helper()
|
||||
if a.Expr != b.Expr {
|
||||
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
|
||||
t.Helper()
|
||||
if a.Expr != b.Expr {
|
||||
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
|
||||
}
|
||||
if a.For != b.For {
|
||||
return fmt.Errorf("expected to have for %q; got %q", a.For, b.For)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
|
||||
return fmt.Errorf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
|
||||
t.Helper()
|
||||
if len(a) != len(b) {
|
||||
return fmt.Errorf("expected number of timeseries %d; got %d", len(a), len(b))
|
||||
}
|
||||
for i := range a {
|
||||
expTS, gotTS := a[i], b[i]
|
||||
if len(expTS.Samples) != len(gotTS.Samples) {
|
||||
return fmt.Errorf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
|
||||
}
|
||||
for i, exp := range expTS.Samples {
|
||||
got := gotTS.Samples[i]
|
||||
if got.Value != exp.Value {
|
||||
return fmt.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
|
||||
}
|
||||
// timestamp validation isn't always correct for now.
|
||||
// this must be improved with time mock.
|
||||
/*if got.Timestamp != exp.Timestamp {
|
||||
return fmt.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
|
||||
}*/
|
||||
}
|
||||
if len(expTS.Labels) != len(gotTS.Labels) {
|
||||
return fmt.Errorf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
|
||||
}
|
||||
for i, exp := range expTS.Labels {
|
||||
got := gotTS.Labels[i]
|
||||
if got.Name != exp.Name {
|
||||
return fmt.Errorf("expected label name %q; got %q", exp.Name, got.Name)
|
||||
}
|
||||
if got.Value != exp.Value {
|
||||
return fmt.Errorf("expected label value %q; got %q", exp.Value, got.Value)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
@@ -37,25 +38,31 @@ absolute path to all .yaml files in root.`)
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
|
||||
|
||||
remoteWriteURL = flag.String("remotewrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
|
||||
remoteWriteURL = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
|
||||
" in form of timeseries. E.g. http://127.0.0.1:8428")
|
||||
remoteWriteUsername = flag.String("remotewrite.basicAuth.username", "", "Optional basic auth username for -remotewrite.url")
|
||||
remoteWritePassword = flag.String("remotewrite.basicAuth.password", "", "Optional basic auth password for -remotewrite.url")
|
||||
remoteWriteUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
|
||||
remoteWritePassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
|
||||
remoteWriteMaxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
|
||||
remoteWriteMaxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines defines max number of timeseries to be flushed at once")
|
||||
remoteWriteConcurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of readers that concurrently write into remote storage")
|
||||
|
||||
remoteReadURL = flag.String("remoteread.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
|
||||
" state. This configuration makes sense only if `vmalert` was configured with `remotewrite.url` before and has been successfully persisted its state."+
|
||||
remoteReadURL = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
|
||||
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
remoteReadUsername = flag.String("remoteread.basicAuth.username", "", "Optional basic auth username for -remoteread.url")
|
||||
remoteReadPassword = flag.String("remoteread.basicAuth.password", "", "Optional basic auth password for -remoteread.url")
|
||||
remoteReadLookBack = flag.Duration("remoteread.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
remoteReadUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
|
||||
remoteReadPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules. Default 1m")
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
|
||||
notifierURL = flag.String("notifier.url", "", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
@@ -77,6 +84,9 @@ func main() {
|
||||
if *remoteWriteURL != "" {
|
||||
c, err := remotewrite.NewClient(ctx, remotewrite.Config{
|
||||
Addr: *remoteWriteURL,
|
||||
Concurrency: *remoteWriteConcurrency,
|
||||
MaxQueueSize: *remoteWriteMaxQueueSize,
|
||||
MaxBatchSize: *remoteWriteMaxBatchSize,
|
||||
FlushInterval: *evaluationInterval,
|
||||
BasicAuthUser: *remoteWriteUsername,
|
||||
BasicAuthPass: *remoteWritePassword,
|
||||
@@ -97,7 +107,7 @@ func main() {
|
||||
go func() {
|
||||
// init reload metrics with positive values to improve alerting conditions
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(uint64(time.Now().UnixNano()) / 1e9)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
sigHup := procutil.NewSighupChan()
|
||||
for {
|
||||
<-sigHup
|
||||
@@ -110,13 +120,13 @@ func main() {
|
||||
continue
|
||||
}
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(uint64(time.Now().UnixNano()) / 1e9)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
||||
}
|
||||
}()
|
||||
|
||||
rh := &requestHandler{m: manager}
|
||||
go httpserver.Serve(*httpListenAddr, (rh).handler)
|
||||
go httpserver.Serve(*httpListenAddr, rh.handler)
|
||||
|
||||
sig := procutil.WaitForSigterm()
|
||||
logger.Infof("service received signal %s", sig)
|
||||
@@ -163,3 +173,15 @@ func checkFlags() {
|
||||
logger.Fatalf("datasource.url is empty")
|
||||
}
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmalert processes alerts and recording rules.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
@@ -6,12 +6,14 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// manager controls group states
|
||||
type manager struct {
|
||||
storage datasource.Querier
|
||||
notifier notifier.Notifier
|
||||
@@ -25,7 +27,7 @@ type manager struct {
|
||||
groups map[uint64]*Group
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
||||
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
@@ -35,11 +37,15 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
return nil, fmt.Errorf("can't find group with id %q", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
if apiAlert := rule.AlertAPI(aID); apiAlert != nil {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
|
||||
return apiAlert, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("can't func alert with id %q in group %q", aID, g.Name)
|
||||
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
|
||||
}
|
||||
|
||||
func (m *manager) start(ctx context.Context, path []string, validate bool) error {
|
||||
@@ -56,8 +62,8 @@ func (m *manager) close() {
|
||||
m.wg.Wait()
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, group Group, restore bool) {
|
||||
if restore {
|
||||
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
|
||||
if restore && m.rr != nil {
|
||||
err := group.Restore(ctx, m.rr, *remoteReadLookBack)
|
||||
if err != nil {
|
||||
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
||||
@@ -67,28 +73,28 @@ func (m *manager) startGroup(ctx context.Context, group Group, restore bool) {
|
||||
m.wg.Add(1)
|
||||
id := group.ID()
|
||||
go func() {
|
||||
group.start(ctx, *evaluationInterval, m.storage, m.notifier, m.rw)
|
||||
group.start(ctx, m.storage, m.notifier, m.rw)
|
||||
m.wg.Done()
|
||||
}()
|
||||
m.groups[id] = &group
|
||||
m.groups[id] = group
|
||||
}
|
||||
|
||||
func (m *manager) update(ctx context.Context, path []string, validate, restore bool) error {
|
||||
logger.Infof("reading alert rules configuration file from %q", strings.Join(path, ";"))
|
||||
newGroups, err := Parse(path, validate)
|
||||
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
|
||||
groupsCfg, err := config.Parse(path, validate)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse configuration file: %s", err)
|
||||
}
|
||||
|
||||
groupsRegistry := make(map[uint64]Group)
|
||||
for _, ng := range newGroups {
|
||||
groupsRegistry := make(map[uint64]*Group)
|
||||
for _, cfg := range groupsCfg {
|
||||
ng := newGroup(cfg, *evaluationInterval)
|
||||
groupsRegistry[ng.ID()] = ng
|
||||
}
|
||||
|
||||
m.groupsMu.Lock()
|
||||
for _, og := range m.groups {
|
||||
id := og.ID()
|
||||
ng, ok := groupsRegistry[id]
|
||||
ng, ok := groupsRegistry[og.ID()]
|
||||
if !ok {
|
||||
// old group is not present in new list
|
||||
// and must be stopped and deleted
|
||||
@@ -97,7 +103,7 @@ func (m *manager) update(ctx context.Context, path []string, validate, restore b
|
||||
og = nil
|
||||
continue
|
||||
}
|
||||
og.updateWith(ng)
|
||||
og.updateCh <- ng
|
||||
delete(groupsRegistry, ng.ID())
|
||||
}
|
||||
|
||||
@@ -107,3 +113,22 @@ func (m *manager) update(ctx context.Context, path []string, validate, restore b
|
||||
m.groupsMu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) toAPI() APIGroup {
|
||||
ag := APIGroup{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
Name: g.Name,
|
||||
File: g.File,
|
||||
Interval: g.Interval.String(),
|
||||
}
|
||||
for _, r := range g.Rules {
|
||||
switch v := r.(type) {
|
||||
case *AlertingRule:
|
||||
ag.AlertingRules = append(ag.AlertingRules, v.RuleAPI())
|
||||
case *RecordingRule:
|
||||
ag.RecordingRules = append(ag.RecordingRules, v.RuleAPI())
|
||||
}
|
||||
}
|
||||
return ag
|
||||
}
|
||||
|
||||
@@ -3,12 +3,22 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"math/rand"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestManagerUpdateError(t *testing.T) {
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
path := []string{"foo/bar"}
|
||||
@@ -26,24 +36,36 @@ func TestManagerUpdateError(t *testing.T) {
|
||||
// execution of configuration update.
|
||||
// Should be executed with -race flag
|
||||
func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
storage: &fakeQuerier{},
|
||||
notifier: &fakeNotifier{},
|
||||
}
|
||||
paths := []string{
|
||||
"testdata/dir/rules0-good.rules",
|
||||
"testdata/dir/rules1-good.rules",
|
||||
"testdata/rules0-good.rules",
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
"config/testdata/dir/rules0-bad.rules",
|
||||
"config/testdata/dir/rules1-good.rules",
|
||||
"config/testdata/dir/rules1-bad.rules",
|
||||
"config/testdata/rules0-good.rules",
|
||||
"config/testdata/rules1-good.rules",
|
||||
"config/testdata/rules2-good.rules",
|
||||
}
|
||||
*evaluationInterval = time.Millisecond
|
||||
if err := m.start(context.Background(), []string{paths[0]}, true); err != nil {
|
||||
t.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
const n = 500
|
||||
const workers = 500
|
||||
const iterations = 10
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(n)
|
||||
for i := 0; i < n; i++ {
|
||||
wg.Add(workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
rnd := rand.Intn(len(paths))
|
||||
path := []string{paths[rnd]}
|
||||
err := m.update(context.Background(), path, true, false)
|
||||
if err != nil {
|
||||
t.Errorf("update error: %s", err)
|
||||
for i := 0; i < iterations; i++ {
|
||||
rnd := rand.Intn(len(paths))
|
||||
path := []string{paths[rnd]}
|
||||
_ = m.update(context.Background(), path, true, false)
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -53,6 +75,41 @@ func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
// TestManagerUpdate tests sequential configuration
|
||||
// updates.
|
||||
func TestManagerUpdate(t *testing.T) {
|
||||
const defaultEvalInterval = time.Second * 30
|
||||
currentEvalInterval := *evaluationInterval
|
||||
*evaluationInterval = defaultEvalInterval
|
||||
defer func() {
|
||||
*evaluationInterval = currentEvalInterval
|
||||
}()
|
||||
|
||||
var (
|
||||
VMRows = &AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 10 * time.Second,
|
||||
Labels: map[string]string{
|
||||
"label": "bar",
|
||||
"host": "{{ $labels.instance }}",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value|humanize }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}
|
||||
Conns = &AlertingRule{
|
||||
Name: "Conns",
|
||||
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
|
||||
Annotations: map[string]string{
|
||||
"summary": "Too high connection number for {{$labels.instance}}",
|
||||
"description": "It is {{ $value }} connections for {{$labels.instance}}",
|
||||
},
|
||||
}
|
||||
ExampleAlertAlwaysFiring = &AlertingRule{
|
||||
Name: "ExampleAlertAlwaysFiring",
|
||||
Expr: "sum by(job) (up == 1)",
|
||||
}
|
||||
)
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
initPath string
|
||||
@@ -61,49 +118,65 @@ func TestManagerUpdate(t *testing.T) {
|
||||
}{
|
||||
{
|
||||
name: "update good rules",
|
||||
initPath: "testdata/rules0-good.rules",
|
||||
updatePath: "testdata/dir/rules1-good.rules",
|
||||
initPath: "config/testdata/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules1-good.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "testdata/dir/rules1-good.rules",
|
||||
Name: "duplicatedGroupDiffFiles",
|
||||
Rules: []*Rule{newTestRule("VMRows", time.Second*10)},
|
||||
File: "config/testdata/dir/rules1-good.rules",
|
||||
Name: "duplicatedGroupDiffFiles",
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{
|
||||
&AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 5 * time.Minute,
|
||||
Labels: map[string]string{"label": "bar"},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update good rules from 1 to 2 groups",
|
||||
initPath: "testdata/dir/rules1-good.rules",
|
||||
updatePath: "testdata/rules0-good.rules",
|
||||
initPath: "config/testdata/dir/rules1-good.rules",
|
||||
updatePath: "config/testdata/rules0-good.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert", Rules: []*Rule{
|
||||
newTestRule("VMRows", time.Second*10),
|
||||
}},
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Rules: []Rule{VMRows},
|
||||
Interval: defaultEvalInterval,
|
||||
},
|
||||
{
|
||||
File: "testdata/rules0-good.rules",
|
||||
Name: "TestGroup", Rules: []*Rule{
|
||||
newTestRule("Conns", time.Duration(0)),
|
||||
newTestRule("ExampleAlertAlwaysFiring", time.Duration(0)),
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update with one bad rule file",
|
||||
initPath: "testdata/rules0-good.rules",
|
||||
updatePath: "testdata/dir/rules2-bad.rules",
|
||||
initPath: "config/testdata/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules2-bad.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert", Rules: []*Rule{
|
||||
newTestRule("VMRows", time.Second*10),
|
||||
}},
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "testdata/rules0-good.rules",
|
||||
Name: "TestGroup", Rules: []*Rule{
|
||||
newTestRule("Conns", time.Duration(0)),
|
||||
newTestRule("ExampleAlertAlwaysFiring", time.Duration(0)),
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
@@ -111,7 +184,7 @@ func TestManagerUpdate(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.TODO())
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m := &manager{groups: make(map[uint64]*Group), storage: &fakeQuerier{}}
|
||||
path := []string{tc.initPath}
|
||||
if err := m.update(ctx, path, true, false); err != nil {
|
||||
t.Fatalf("failed to complete initial rules update: %s", err)
|
||||
@@ -128,7 +201,7 @@ func TestManagerUpdate(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatalf("expected to have group %q", wantG.Name)
|
||||
}
|
||||
compareGroups(t, gotG, wantG)
|
||||
compareGroups(t, wantG, gotG)
|
||||
}
|
||||
|
||||
cancel()
|
||||
@@ -136,17 +209,3 @@ func TestManagerUpdate(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func compareGroups(t *testing.T, a, b *Group) {
|
||||
t.Helper()
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if got.Name != want.Name {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.Name, got.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@ type Alert struct {
|
||||
Annotations map[string]string
|
||||
State AlertState
|
||||
|
||||
Expr string
|
||||
Start time.Time
|
||||
End time.Time
|
||||
Value float64
|
||||
@@ -52,14 +53,15 @@ func (as AlertState) String() string {
|
||||
type alertTplData struct {
|
||||
Labels map[string]string
|
||||
Value float64
|
||||
Expr string
|
||||
}
|
||||
|
||||
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}`
|
||||
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}{{ $expr := .Expr }}`
|
||||
|
||||
// ExecTemplate executes the Alert template for give
|
||||
// map of annotations.
|
||||
func (a *Alert) ExecTemplate(annotations map[string]string) (map[string]string, error) {
|
||||
tplData := alertTplData{Value: a.Value, Labels: a.Labels}
|
||||
tplData := alertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
|
||||
return templateAnnotations(annotations, tplHeader, tplData)
|
||||
}
|
||||
|
||||
@@ -85,7 +87,7 @@ func templateAnnotations(annotations map[string]string, header string, data aler
|
||||
builder.WriteString(header)
|
||||
builder.WriteString(text)
|
||||
if err := templateAnnotation(&buf, builder.String(), data); err != nil {
|
||||
eg.errs = append(eg.errs, fmt.Sprintf("key %s, template %s:%s", key, text, err))
|
||||
eg.errs = append(eg.errs, fmt.Sprintf("key %q, template %q: %s", key, text, err))
|
||||
continue
|
||||
}
|
||||
r[key] = buf.String()
|
||||
@@ -96,10 +98,10 @@ func templateAnnotations(annotations map[string]string, header string, data aler
|
||||
func templateAnnotation(dst io.Writer, text string, data alertTplData) error {
|
||||
tpl, err := template.New("").Funcs(tmplFunc).Option("missingkey=zero").Parse(text)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing annotation:%w", err)
|
||||
return fmt.Errorf("error parsing annotation: %w", err)
|
||||
}
|
||||
if err = tpl.Execute(dst, data); err != nil {
|
||||
return fmt.Errorf("error evaluating annotation template:%w", err)
|
||||
return fmt.Errorf("error evaluating annotation template: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,22 +1,27 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestAlert_ExecTemplate(t *testing.T) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
InitTemplateFunc(u)
|
||||
testCases := []struct {
|
||||
name string
|
||||
alert *Alert
|
||||
annotations map[string]string
|
||||
expTpl map[string]string
|
||||
}{
|
||||
{
|
||||
name: "empty-alert",
|
||||
alert: &Alert{},
|
||||
annotations: map[string]string{},
|
||||
expTpl: map[string]string{},
|
||||
},
|
||||
{
|
||||
name: "no-template",
|
||||
alert: &Alert{
|
||||
Value: 1e4,
|
||||
Labels: map[string]string{
|
||||
@@ -27,6 +32,7 @@ func TestAlert_ExecTemplate(t *testing.T) {
|
||||
expTpl: map[string]string{},
|
||||
},
|
||||
{
|
||||
name: "label-template",
|
||||
alert: &Alert{
|
||||
Value: 1e4,
|
||||
Labels: map[string]string{
|
||||
@@ -43,10 +49,24 @@ func TestAlert_ExecTemplate(t *testing.T) {
|
||||
"description": "It is 10000 connections for localhost",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "expression-template",
|
||||
alert: &Alert{
|
||||
Expr: `vm_rows{"label"="bar"}>0`,
|
||||
},
|
||||
annotations: map[string]string{
|
||||
"exprEscapedQuery": "{{ $expr|quotesEscape|queryEscape }}",
|
||||
"exprEscapedPath": "{{ $expr|quotesEscape|pathEscape }}",
|
||||
},
|
||||
expTpl: map[string]string{
|
||||
"exprEscapedQuery": "vm_rows%7B%5C%22label%5C%22%3D%5C%22bar%5C%22%7D%3E0",
|
||||
"exprEscapedPath": "vm_rows%7B%5C%22label%5C%22=%5C%22bar%5C%22%7D%3E0",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range testCases {
|
||||
t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
tpl, err := tc.alert.ExecTemplate(tc.annotations)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
|
||||
@@ -142,6 +142,15 @@ func InitTemplateFunc(externalURL *url.URL) {
|
||||
"externalURL": func() string {
|
||||
return externalURL.String()
|
||||
},
|
||||
"pathEscape": func(u string) string {
|
||||
return url.PathEscape(u)
|
||||
},
|
||||
"queryEscape": func(q string) string {
|
||||
return url.QueryEscape(q)
|
||||
},
|
||||
"quotesEscape": func(q string) string {
|
||||
return strings.Replace(q, `"`, `\"`, -1)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -17,5 +17,5 @@ func (eg *errGroup) err() error {
|
||||
}
|
||||
|
||||
func (eg *errGroup) Error() string {
|
||||
return fmt.Sprintf("errors:%s", strings.Join(eg.errs, "\n"))
|
||||
return fmt.Sprintf("errors: %s", strings.Join(eg.errs, "\n"))
|
||||
}
|
||||
|
||||
151
app/vmalert/recording.go
Normal file
151
app/vmalert/recording.go
Normal file
@@ -0,0 +1,151 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// RecordingRule is a Rule that supposed
|
||||
// to evaluate configured Expression and
|
||||
// return TimeSeries as result.
|
||||
type RecordingRule struct {
|
||||
Name string
|
||||
Expr string
|
||||
Labels map[string]string
|
||||
GroupID uint64
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
func (rr *RecordingRule) String() string {
|
||||
return rr.Name
|
||||
}
|
||||
|
||||
// ID returns unique Rule ID
|
||||
// within the parent Group.
|
||||
func (rr *RecordingRule) ID() uint64 {
|
||||
hash := fnv.New64a()
|
||||
hash.Write([]byte("alerting"))
|
||||
hash.Write([]byte("\xff"))
|
||||
hash.Write([]byte(rr.Name))
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func newRecordingRule(gID uint64, cfg config.Rule) *RecordingRule {
|
||||
return &RecordingRule{
|
||||
Name: cfg.Record,
|
||||
Expr: cfg.Expr,
|
||||
Labels: cfg.Labels,
|
||||
GroupID: gID,
|
||||
}
|
||||
}
|
||||
|
||||
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")
|
||||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
if !series {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
qMetrics, err := q.Query(ctx, rr.Expr)
|
||||
|
||||
rr.mu.Lock()
|
||||
defer rr.mu.Unlock()
|
||||
|
||||
rr.lastExecTime = time.Now()
|
||||
rr.lastExecError = err
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %s", rr.Expr, err)
|
||||
}
|
||||
|
||||
duplicates := make(map[uint64]prompbmarshal.TimeSeries, len(qMetrics))
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, r := range qMetrics {
|
||||
ts := rr.toTimeSeries(r, rr.lastExecTime)
|
||||
h := hashTimeSeries(ts)
|
||||
if _, ok := duplicates[h]; ok {
|
||||
rr.lastExecError = errDuplicate
|
||||
return nil, errDuplicate
|
||||
}
|
||||
duplicates[h] = ts
|
||||
tss = append(tss, ts)
|
||||
}
|
||||
return tss, nil
|
||||
}
|
||||
|
||||
func hashTimeSeries(ts prompbmarshal.TimeSeries) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := ts.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for _, l := range m.Labels {
|
||||
labels[l.Name] = l.Value
|
||||
}
|
||||
labels["__name__"] = rr.Name
|
||||
// override existing labels with configured ones
|
||||
for k, v := range rr.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
return newTimeSeries(m.Value, labels, timestamp)
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
|
||||
}
|
||||
rr.Expr = nr.Expr
|
||||
rr.Labels = nr.Labels
|
||||
return nil
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIRecordingRule
|
||||
func (rr *RecordingRule) RuleAPI() APIRecordingRule {
|
||||
var lastErr string
|
||||
if rr.lastExecError != nil {
|
||||
lastErr = rr.lastExecError.Error()
|
||||
}
|
||||
return APIRecordingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
Name: rr.Name,
|
||||
Expression: rr.Expr,
|
||||
LastError: lastErr,
|
||||
LastExec: rr.lastExecTime,
|
||||
Labels: rr.Labels,
|
||||
}
|
||||
}
|
||||
121
app/vmalert/recording_test.go
Normal file
121
app/vmalert/recording_test.go
Normal file
@@ -0,0 +1,121 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *RecordingRule
|
||||
metrics []datasource.Metric
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
&RecordingRule{Name: "foo"},
|
||||
[]datasource.Metric{metricWithValueAndLabels(t, 10,
|
||||
"__name__", "bar",
|
||||
)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(10, map[string]string{
|
||||
"__name__": "foo",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "foobarbaz"},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
|
||||
metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "foo",
|
||||
}, timestamp),
|
||||
newTimeSeries(2, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "bar",
|
||||
}, timestamp),
|
||||
newTimeSeries(3, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "baz",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(2, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "foo",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "bar",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
tss, err := tc.rule.Exec(context.TODO(), fq, true)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
|
||||
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"job": "test",
|
||||
}}
|
||||
|
||||
fq := &fakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
|
||||
_, err := rr.Exec(context.TODO(), fq, true)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), expErr) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
|
||||
}
|
||||
|
||||
fq.reset()
|
||||
|
||||
// add metrics which differs only by `job` label
|
||||
// which will be overridden by rule
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.Exec(context.TODO(), fq, true)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), errDuplicate.Error()) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
|
||||
}
|
||||
}
|
||||
@@ -38,11 +38,15 @@ type Config struct {
|
||||
BasicAuthUser string
|
||||
BasicAuthPass string
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
@@ -52,9 +56,10 @@ type Config struct {
|
||||
}
|
||||
|
||||
const (
|
||||
defaultConcurrency = 4
|
||||
defaultMaxBatchSize = 1e3
|
||||
defaultMaxQueueSize = 100
|
||||
defaultFlushInterval = 5 * time.Second
|
||||
defaultMaxQueueSize = 1e5
|
||||
defaultFlushInterval = time.Second
|
||||
defaultWriteTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
@@ -90,7 +95,13 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
c.run(ctx)
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
@@ -103,7 +114,8 @@ func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries)",
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
@@ -127,7 +139,10 @@ func (c *Client) run(ctx context.Context) {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), time.Second*10)
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
|
||||
@@ -2,334 +2,23 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// Rule is basic alert entity
|
||||
type Rule struct {
|
||||
Name string `yaml:"alert"`
|
||||
Expr string `yaml:"expr"`
|
||||
For time.Duration `yaml:"for"`
|
||||
Labels map[string]string `yaml:"labels"`
|
||||
Annotations map[string]string `yaml:"annotations"`
|
||||
|
||||
group Group
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
func (r *Rule) id() string {
|
||||
return r.Name
|
||||
}
|
||||
|
||||
// Validate validates rule
|
||||
func (r *Rule) Validate() error {
|
||||
if r.Name == "" {
|
||||
return errors.New("rule name can not be empty")
|
||||
}
|
||||
if r.Expr == "" {
|
||||
return fmt.Errorf("expression for rule %q can't be empty", r.Name)
|
||||
}
|
||||
if _, err := metricsql.Parse(r.Expr); err != nil {
|
||||
return fmt.Errorf("invalid expression for rule %q: %w", r.Name, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Exec executes Rule expression via the given Querier.
|
||||
// Based on the Querier results Rule maintains notifier.Alerts
|
||||
func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
|
||||
qMetrics, err := q.Query(ctx, r.Expr)
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
r.lastExecError = err
|
||||
r.lastExecTime = time.Now()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to execute query %q: %s", r.Expr, err)
|
||||
}
|
||||
|
||||
for h, a := range r.alerts {
|
||||
// cleanup inactive alerts from previous Eval
|
||||
if a.State == notifier.StateInactive {
|
||||
delete(r.alerts, h)
|
||||
}
|
||||
}
|
||||
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
h := hash(m)
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := r.alerts[h]; ok {
|
||||
if a.Value != m.Value {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Value
|
||||
// and re-exec template since Value can be used
|
||||
// in templates
|
||||
err = r.template(a)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := r.newAlert(m)
|
||||
if err != nil {
|
||||
r.lastExecError = err
|
||||
return fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
r.alerts[h] = a
|
||||
}
|
||||
|
||||
for h, a := range r.alerts {
|
||||
// if alert wasn't updated in this iteration
|
||||
// means it is resolved already
|
||||
if _, ok := updated[h]; !ok {
|
||||
a.State = notifier.StateInactive
|
||||
// set endTime to last execution time
|
||||
// so it can be sent by notifier on next step
|
||||
a.End = r.lastExecTime
|
||||
continue
|
||||
}
|
||||
if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
|
||||
a.State = notifier.StateFiring
|
||||
alertsFired.Inc()
|
||||
}
|
||||
if a.State == notifier.StateFiring {
|
||||
a.End = r.lastExecTime.Add(3 * *evaluationInterval)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(m datasource.Metric) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := m.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
|
||||
a := ¬ifier.Alert{
|
||||
GroupID: r.group.ID(),
|
||||
Name: r.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Value,
|
||||
Start: time.Now(),
|
||||
// TODO: support End time
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
a.Labels[l.Name] = l.Value
|
||||
}
|
||||
return a, r.template(a)
|
||||
}
|
||||
|
||||
func (r *Rule) template(a *notifier.Alert) error {
|
||||
// 1. template rule labels with data labels
|
||||
rLabels, err := a.ExecTemplate(r.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 2. merge data labels and rule labels
|
||||
// metric labels may be overridden by
|
||||
// rule labels
|
||||
for k, v := range rLabels {
|
||||
a.Labels[k] = v
|
||||
}
|
||||
|
||||
// 3. template merged labels
|
||||
a.Labels, err = a.ExecTemplate(a.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
a.Annotations, err = a.ExecTemplate(r.Annotations)
|
||||
return err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (r *Rule) AlertAPI(id uint64) *APIAlert {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
a, ok := r.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return r.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// AlertsAPI generates list of APIAlert objects from existing alerts
|
||||
func (r *Rule) AlertsAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
r.mu.RLock()
|
||||
for _, a := range r.alerts {
|
||||
alerts = append(alerts, r.newAlertAPI(*a))
|
||||
}
|
||||
r.mu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (r *Rule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
return &APIAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: r.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.Start,
|
||||
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
// AlertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
// AlertForStateMetricName is the metric name for 'for' state of alert.
|
||||
alertForStateMetricName = "ALERTS_FOR_STATE"
|
||||
|
||||
// AlertNameLabel is the label name indicating the name of an alert.
|
||||
alertNameLabel = "alertname"
|
||||
// AlertStateLabel is the label name indicating the state of an alert.
|
||||
alertStateLabel = "alertstate"
|
||||
)
|
||||
|
||||
// AlertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
func (r *Rule) AlertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(r.Name, a, timestamp))
|
||||
if r.For > 0 {
|
||||
tss = append(tss, alertForToTimeSeries(r.Name, a, timestamp))
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertMetricName
|
||||
labels[alertNameLabel] = name
|
||||
labels[alertStateLabel] = a.State.String()
|
||||
return newTimeSeries(1, labels, timestamp)
|
||||
}
|
||||
|
||||
// alertForToTimeSeries returns a timeseries that represents
|
||||
// state of active alerts, where value is time when alert become active
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
labels[alertNameLabel] = name
|
||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
}
|
||||
|
||||
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{}
|
||||
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
||||
Value: value,
|
||||
Timestamp: timestamp.UnixNano() / 1e6,
|
||||
})
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
||||
Name: key,
|
||||
Value: labels[key],
|
||||
})
|
||||
}
|
||||
return ts
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||
// to be updated on next Eval, as well as Value field.
|
||||
func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
// Get the last datapoint in range via MetricsQL `last_over_time`.
|
||||
// We don't use plain PromQL since Prometheus doesn't support
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
|
||||
alertForStateMetricName, r.Name, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
labels := m.Labels
|
||||
m.Labels = make([]datasource.Label, 0)
|
||||
// drop all extra labels, so hash key will
|
||||
// be identical to timeseries received in Eval
|
||||
for _, l := range labels {
|
||||
if l.Name == alertNameLabel {
|
||||
continue
|
||||
}
|
||||
// drop all overridden labels
|
||||
if _, ok := r.Labels[l.Name]; ok {
|
||||
continue
|
||||
}
|
||||
m.Labels = append(m.Labels, l)
|
||||
}
|
||||
|
||||
a, err := r.newAlert(m)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
a.ID = hash(m)
|
||||
a.State = notifier.StatePending
|
||||
a.Start = time.Unix(int64(m.Value), 0)
|
||||
r.alerts[a.ID] = a
|
||||
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
|
||||
}
|
||||
return nil
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// Returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// Exec executes the rule with given context
|
||||
// and Querier. If returnSeries is true, Exec
|
||||
// may return TimeSeries as result of execution
|
||||
Exec(ctx context.Context, q datasource.Querier, returnSeries bool) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
}
|
||||
|
||||
27
app/vmalert/utils.go
Normal file
27
app/vmalert/utils.go
Normal file
@@ -0,0 +1,27 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"sort"
|
||||
"time"
|
||||
)
|
||||
|
||||
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{}
|
||||
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
||||
Value: value,
|
||||
Timestamp: timestamp.UnixNano() / 1e6,
|
||||
})
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
||||
Name: key,
|
||||
Value: labels[key],
|
||||
})
|
||||
}
|
||||
return ts
|
||||
}
|
||||
@@ -7,32 +7,18 @@ import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
)
|
||||
|
||||
// APIAlert represents an notifier.Alert state
|
||||
// for WEB view
|
||||
type APIAlert struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
State string `json:"state"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
ActiveAt time.Time `json:"activeAt"`
|
||||
}
|
||||
|
||||
type requestHandler struct {
|
||||
m *manager
|
||||
}
|
||||
|
||||
var pathList = [][]string{
|
||||
{"/api/v1/groups", "list all loaded groups and rules"},
|
||||
{"/api/v1/alerts", "list all active alerts"},
|
||||
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
|
||||
// /metrics is served by httpserver by default
|
||||
@@ -49,8 +35,11 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
|
||||
}
|
||||
return true
|
||||
case "/api/v1/groups":
|
||||
resph.handle(rh.listGroups())
|
||||
return true
|
||||
case "/api/v1/alerts":
|
||||
resph.handle(rh.list())
|
||||
resph.handle(rh.listAlerts())
|
||||
return true
|
||||
case "/-/reload":
|
||||
logger.Infof("api config reload was called, sending sighup")
|
||||
@@ -67,6 +56,37 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
}
|
||||
|
||||
type listGroupsResponse struct {
|
||||
Data struct {
|
||||
Groups []APIGroup `json:"groups"`
|
||||
} `json:"data"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) listGroups() ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listGroupsResponse{Status: "success"}
|
||||
for _, g := range rh.m.groups {
|
||||
lr.Data.Groups = append(lr.Data.Groups, g.toAPI())
|
||||
}
|
||||
|
||||
// sort list of alerts for deterministic output
|
||||
sort.Slice(lr.Data.Groups, func(i, j int) bool {
|
||||
return lr.Data.Groups[i].Name < lr.Data.Groups[j].Name
|
||||
})
|
||||
|
||||
b, err := json.Marshal(lr)
|
||||
if err != nil {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %s`, err),
|
||||
StatusCode: http.StatusInternalServerError,
|
||||
}
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
type listAlertsResponse struct {
|
||||
Data struct {
|
||||
Alerts []*APIAlert `json:"alerts"`
|
||||
@@ -74,13 +94,18 @@ type listAlertsResponse struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) list() ([]byte, error) {
|
||||
func (rh *requestHandler) listAlerts() ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listAlertsResponse{Status: "success"}
|
||||
for _, g := range rh.m.groups {
|
||||
for _, r := range g.Rules {
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, r.AlertsAPI()...)
|
||||
a, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsAPI()...)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
)
|
||||
|
||||
func TestHandler(t *testing.T) {
|
||||
rule := &Rule{
|
||||
ar := &AlertingRule{
|
||||
Name: "alert",
|
||||
alerts: map[uint64]*notifier.Alert{
|
||||
0: {},
|
||||
@@ -19,7 +19,7 @@ func TestHandler(t *testing.T) {
|
||||
}
|
||||
g := &Group{
|
||||
Name: "group",
|
||||
Rules: []*Rule{rule},
|
||||
Rules: []Rule{ar},
|
||||
}
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m.groups[0] = g
|
||||
@@ -54,10 +54,17 @@ func TestHandler(t *testing.T) {
|
||||
t.Errorf("expected 1 alert got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/groups", func(t *testing.T) {
|
||||
lr := listGroupsResponse{}
|
||||
getResp(ts.URL+"/api/v1/groups", &lr, 200)
|
||||
if length := len(lr.Data.Groups); length != 1 {
|
||||
t.Errorf("expected 1 group got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/0/0/status", func(t *testing.T) {
|
||||
alert := &APIAlert{}
|
||||
getResp(ts.URL+"/api/v1/0/0/status", alert, 200)
|
||||
expAlert := rule.newAlertAPI(*rule.alerts[0])
|
||||
expAlert := ar.newAlertAPI(*ar.alerts[0])
|
||||
if !reflect.DeepEqual(alert, expAlert) {
|
||||
t.Errorf("expected %v is equal to %v", alert, expAlert)
|
||||
}
|
||||
|
||||
53
app/vmalert/web_types.go
Normal file
53
app/vmalert/web_types.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// APIAlert represents an notifier.AlertingRule state
|
||||
// for WEB view
|
||||
type APIAlert struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
State string `json:"state"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
ActiveAt time.Time `json:"activeAt"`
|
||||
}
|
||||
|
||||
// APIGroup represents Group for WEB view
|
||||
type APIGroup struct {
|
||||
Name string `json:"name"`
|
||||
ID string `json:"id"`
|
||||
File string `json:"file"`
|
||||
Interval string `json:"interval"`
|
||||
AlertingRules []APIAlertingRule `json:"alerting_rules"`
|
||||
RecordingRules []APIRecordingRule `json:"recording_rules"`
|
||||
}
|
||||
|
||||
// APIAlertingRule represents AlertingRule for WEB view
|
||||
type APIAlertingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
For string `json:"for"`
|
||||
LastError string `json:"last_error"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
}
|
||||
|
||||
// APIRecordingRule represents RecordingRule for WEB view
|
||||
type APIRecordingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
LastError string `json:"last_error"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
}
|
||||
@@ -53,6 +53,8 @@ publish-vmauth:
|
||||
|
||||
run-vmauth:
|
||||
APP_NAME=vmauth \
|
||||
DOCKER_OPTS='-v $(shell pwd)/app/vmauth/:/app/vmauth' \
|
||||
ARGS='-auth.config=app/vmauth/example_config.yml' \
|
||||
$(MAKE) run-via-docker
|
||||
|
||||
vmauth-amd64:
|
||||
|
||||
@@ -36,11 +36,11 @@ type UserInfo struct {
|
||||
|
||||
func initAuthConfig() {
|
||||
if len(*authConfigPath) == 0 {
|
||||
logger.Panicf("FATAL: missing required `-auth.config` command-line flag")
|
||||
logger.Fatalf("missing required `-auth.config` command-line flag")
|
||||
}
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
|
||||
logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
|
||||
}
|
||||
authConfig.Store(m)
|
||||
stopCh = make(chan struct{})
|
||||
@@ -63,12 +63,14 @@ func authConfigReloader() {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-sighupCh:
|
||||
logger.Infof("SIGHUP received; loading -auth.config=%q", *authConfigPath)
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Errorf("failed to load auth config; using the last successfully loaded config; error: %s", err)
|
||||
logger.Errorf("failed to load -auth.config=%q; using the last successfully loaded config; error: %s", *authConfigPath, err)
|
||||
continue
|
||||
}
|
||||
authConfig.Store(m)
|
||||
logger.Infof("Successfully reloaded -auth.config=%q", *authConfigPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
31
app/vmauth/example_config.yml
Normal file
31
app/vmauth/example_config.yml
Normal file
@@ -0,0 +1,31 @@
|
||||
# Arbitrary number of usernames may be put here.
|
||||
# Usernames must be unique.
|
||||
|
||||
users:
|
||||
|
||||
# The user for querying local single-node VictoriaMetrics.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vmselect:8481/select/123/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
|
||||
- username: "cluster-select-account-123"
|
||||
password: "***"
|
||||
url_prefix: "http://vmselect:8481/select/123/prometheus"
|
||||
|
||||
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the reuqests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vminsert:8480/insert/42/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "cluster-insert-account-42"
|
||||
password: "***"
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
|
||||
@@ -2,9 +2,11 @@ package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"net/url"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
@@ -19,6 +21,9 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
@@ -74,6 +79,26 @@ var reverseProxy = &httputil.ReverseProxy{
|
||||
}
|
||||
r.URL = target
|
||||
},
|
||||
Transport: func() *http.Transport {
|
||||
tr := http.DefaultTransport.(*http.Transport).Clone()
|
||||
// Automatic compression must be disabled in order to fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
|
||||
tr.DisableCompression = true
|
||||
// Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this).
|
||||
tr.ForceAttemptHTTP2 = false
|
||||
return tr
|
||||
}(),
|
||||
FlushInterval: time.Second,
|
||||
ErrorLog: logger.StdErrorLogger(),
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
@@ -19,6 +19,9 @@ Backed up data can be restored with [vmrestore](https://github.com/VictoriaMetri
|
||||
|
||||
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
|
||||
|
||||
See also [vmbackuper](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) tool built on top of `vmbackup`. This tool simplifies
|
||||
creation of hourly, daily, weekly and monthly backups.
|
||||
|
||||
|
||||
### Use cases
|
||||
|
||||
|
||||
@@ -31,6 +31,8 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
|
||||
@@ -21,7 +21,8 @@ vmrestore -src=gcs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/r
|
||||
* `<local/path/to/restore>` is the path to folder where data will be restored. This folder must be passed
|
||||
to VictoriaMetrics in `-storageDataPath` command-line flag after the restore process is complete.
|
||||
|
||||
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup.
|
||||
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup,
|
||||
i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/questions/476041/how-do-i-make-rsync-delete-files-that-have-been-deleted-from-the-source-folder).
|
||||
|
||||
|
||||
### Troubleshooting
|
||||
@@ -52,7 +53,7 @@ Run `vmrestore -help` in order to see all the available options:
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot data files bigger than 2^32 bytes in memory
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
@@ -68,8 +69,8 @@ Run `vmrestore -help` in order to see all the available options:
|
||||
-src string
|
||||
Source path with backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
-storageDataPath string
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup (default "victoria-metrics-data")
|
||||
-version
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir is synchronized with -src contents, i.e. it works like 'rsync --delete' (default "victoria-metrics-data")
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
|
||||
@@ -16,13 +17,16 @@ var (
|
||||
src = flag.String("src", "", "Source path with backup on the remote storage. "+
|
||||
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir")
|
||||
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Destination path where backup must be restored. "+
|
||||
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup")
|
||||
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir "+
|
||||
"is synchronized with -src contents, i.e. it works like 'rsync --delete'")
|
||||
concurrency = flag.Int("concurrency", 10, "The number of concurrent workers. Higher concurrency may reduce restore duration")
|
||||
maxBytesPerSecond = flag.Int("maxBytesPerSecond", 0, "The maximum download speed. There is no limit if it is set to 0")
|
||||
skipBackupCompleteCheck = flag.Bool("skipBackupCompleteCheck", false, "Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file")
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -395,14 +396,14 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
|
||||
if err := r.ParseForm(); err != nil {
|
||||
return fmt.Errorf("cannot parse form values: %s", err)
|
||||
}
|
||||
date := time.Now().Unix() / secsPerDay
|
||||
date := fasttime.UnixDate()
|
||||
dateStr := r.FormValue("date")
|
||||
if len(dateStr) > 0 {
|
||||
t, err := time.Parse("2006-01-02", dateStr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse `date` arg %q: %s", dateStr, err)
|
||||
}
|
||||
date = t.Unix() / secsPerDay
|
||||
date = uint64(t.Unix()) / secsPerDay
|
||||
}
|
||||
topN := 10
|
||||
topNStr := r.FormValue("topN")
|
||||
@@ -419,7 +420,7 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
|
||||
}
|
||||
topN = n
|
||||
}
|
||||
status, err := netstorage.GetTSDBStatusForDate(deadline, uint64(date), topN)
|
||||
status, err := netstorage.GetTSDBStatusForDate(deadline, date, topN)
|
||||
if err != nil {
|
||||
return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %s`, date, topN, err)
|
||||
}
|
||||
@@ -992,7 +993,7 @@ func getBool(r *http.Request, argKey string) bool {
|
||||
}
|
||||
|
||||
func currentTime() int64 {
|
||||
return int64(time.Now().UTC().Unix()) * 1e3
|
||||
return int64(fasttime.UnixTimestamp() * 1000)
|
||||
}
|
||||
|
||||
func getTagFilterssFromMatches(matches []string) ([][]storage.TagFilter, error) {
|
||||
|
||||
@@ -44,6 +44,7 @@ var aggrFuncs = map[string]aggrFunc{
|
||||
"bottomk_avg": newAggrFuncRangeTopK(avgValue, true),
|
||||
"bottomk_median": newAggrFuncRangeTopK(medianValue, true),
|
||||
"any": newAggrFunc(aggrFuncAny),
|
||||
"outliersk": aggrFuncOutliersK,
|
||||
}
|
||||
|
||||
type aggrFunc func(afa *aggrFuncArg) ([]*timeseries, error)
|
||||
@@ -483,11 +484,6 @@ func newAggrFuncTopK(isReverse bool) aggrFunc {
|
||||
}
|
||||
}
|
||||
|
||||
type tsWithValue struct {
|
||||
ts *timeseries
|
||||
value float64
|
||||
}
|
||||
|
||||
func newAggrFuncRangeTopK(f func(values []float64) float64, isReverse bool) aggrFunc {
|
||||
return func(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
args := afa.args
|
||||
@@ -499,34 +495,42 @@ func newAggrFuncRangeTopK(f func(values []float64) float64, isReverse bool) aggr
|
||||
return nil, err
|
||||
}
|
||||
afe := func(tss []*timeseries) []*timeseries {
|
||||
maxs := make([]tsWithValue, len(tss))
|
||||
for i, ts := range tss {
|
||||
value := f(ts.Values)
|
||||
maxs[i] = tsWithValue{
|
||||
ts: ts,
|
||||
value: value,
|
||||
}
|
||||
}
|
||||
sort.Slice(maxs, func(i, j int) bool {
|
||||
a := maxs[i].value
|
||||
b := maxs[j].value
|
||||
if isReverse {
|
||||
a, b = b, a
|
||||
}
|
||||
return lessWithNaNs(a, b)
|
||||
})
|
||||
for i := range maxs {
|
||||
tss[i] = maxs[i].ts
|
||||
}
|
||||
for i, k := range ks {
|
||||
fillNaNsAtIdx(i, k, tss)
|
||||
}
|
||||
return removeNaNs(tss)
|
||||
return getRangeTopKTimeseries(tss, ks, f, isReverse)
|
||||
}
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
|
||||
}
|
||||
}
|
||||
|
||||
func getRangeTopKTimeseries(tss []*timeseries, ks []float64, f func(values []float64) float64, isReverse bool) []*timeseries {
|
||||
type tsWithValue struct {
|
||||
ts *timeseries
|
||||
value float64
|
||||
}
|
||||
maxs := make([]tsWithValue, len(tss))
|
||||
for i, ts := range tss {
|
||||
value := f(ts.Values)
|
||||
maxs[i] = tsWithValue{
|
||||
ts: ts,
|
||||
value: value,
|
||||
}
|
||||
}
|
||||
sort.Slice(maxs, func(i, j int) bool {
|
||||
a := maxs[i].value
|
||||
b := maxs[j].value
|
||||
if isReverse {
|
||||
a, b = b, a
|
||||
}
|
||||
return lessWithNaNs(a, b)
|
||||
})
|
||||
for i := range maxs {
|
||||
tss[i] = maxs[i].ts
|
||||
}
|
||||
for i, k := range ks {
|
||||
fillNaNsAtIdx(i, k, tss)
|
||||
}
|
||||
return removeNaNs(tss)
|
||||
}
|
||||
|
||||
func fillNaNsAtIdx(idx int, k float64, tss []*timeseries) {
|
||||
if math.IsNaN(k) {
|
||||
k = 0
|
||||
@@ -588,16 +592,54 @@ func avgValue(values []float64) float64 {
|
||||
func medianValue(values []float64) float64 {
|
||||
h := histogram.GetFast()
|
||||
for _, v := range values {
|
||||
if math.IsNaN(v) {
|
||||
continue
|
||||
if !math.IsNaN(v) {
|
||||
h.Update(v)
|
||||
}
|
||||
h.Update(v)
|
||||
}
|
||||
value := h.Quantile(0.5)
|
||||
histogram.PutFast(h)
|
||||
return value
|
||||
}
|
||||
|
||||
func aggrFuncOutliersK(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
args := afa.args
|
||||
if err := expectTransformArgsNum(args, 2); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ks, err := getScalar(args[0], 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
afe := func(tss []*timeseries) []*timeseries {
|
||||
// Calculate medians for each point across tss.
|
||||
medians := make([]float64, len(ks))
|
||||
h := histogram.GetFast()
|
||||
for n := range ks {
|
||||
h.Reset()
|
||||
for j := range tss {
|
||||
v := tss[j].Values[n]
|
||||
if !math.IsNaN(v) {
|
||||
h.Update(v)
|
||||
}
|
||||
}
|
||||
medians[n] = h.Quantile(0.5)
|
||||
}
|
||||
histogram.PutFast(h)
|
||||
|
||||
// Return topK time series with the highest variance from median.
|
||||
f := func(values []float64) float64 {
|
||||
sum2 := float64(0)
|
||||
for n, v := range values {
|
||||
d := v - medians[n]
|
||||
sum2 += d * d
|
||||
}
|
||||
return sum2
|
||||
}
|
||||
return getRangeTopKTimeseries(tss, ks, f, false)
|
||||
}
|
||||
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
|
||||
}
|
||||
|
||||
func aggrFuncLimitK(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
args := afa.args
|
||||
if err := expectTransformArgsNum(args, 2); err != nil {
|
||||
@@ -658,24 +700,18 @@ func aggrFuncMedian(afa *aggrFuncArg) ([]*timeseries, error) {
|
||||
func newAggrQuantileFunc(phis []float64) func(tss []*timeseries) []*timeseries {
|
||||
return func(tss []*timeseries) []*timeseries {
|
||||
dst := tss[0]
|
||||
h := histogram.GetFast()
|
||||
defer histogram.PutFast(h)
|
||||
for n := range dst.Values {
|
||||
sort.Slice(tss, func(i, j int) bool {
|
||||
a := tss[i].Values[n]
|
||||
b := tss[j].Values[n]
|
||||
return lessWithNaNs(a, b)
|
||||
})
|
||||
h.Reset()
|
||||
for j := range tss {
|
||||
v := tss[j].Values[n]
|
||||
if !math.IsNaN(v) {
|
||||
h.Update(v)
|
||||
}
|
||||
}
|
||||
phi := phis[n]
|
||||
if math.IsNaN(phi) {
|
||||
phi = 1
|
||||
}
|
||||
if phi < 0 {
|
||||
phi = 0
|
||||
}
|
||||
if phi > 1 {
|
||||
phi = 1
|
||||
}
|
||||
idx := int(math.Round(float64(len(tss)-1) * phi))
|
||||
dst.Values[n] = tss[idx].Values[n]
|
||||
dst.Values[n] = h.Quantile(phi)
|
||||
}
|
||||
tss[0] = dst
|
||||
return tss[:1]
|
||||
|
||||
@@ -465,7 +465,6 @@ func finalizeAggrGeomean(iac *incrementalAggrContext) {
|
||||
}
|
||||
|
||||
func updateAggrAny(iac *incrementalAggrContext, values []float64) {
|
||||
dstValues := iac.ts.Values
|
||||
dstCounts := iac.values
|
||||
if dstCounts[0] > 0 {
|
||||
return
|
||||
@@ -473,17 +472,16 @@ func updateAggrAny(iac *incrementalAggrContext, values []float64) {
|
||||
for i := range values {
|
||||
dstCounts[i] = 1
|
||||
}
|
||||
dstValues = append(dstValues[:0], values...)
|
||||
iac.ts.Values = append(iac.ts.Values[:0], values...)
|
||||
}
|
||||
|
||||
func mergeAggrAny(dst, src *incrementalAggrContext) {
|
||||
srcValues := src.ts.Values
|
||||
dstValues := dst.ts.Values
|
||||
srcCounts := src.values
|
||||
dstCounts := dst.values
|
||||
if dstCounts[0] > 0 {
|
||||
return
|
||||
}
|
||||
dstCounts[0] = srcCounts[0]
|
||||
dstValues = append(dstValues[:0], srcValues...)
|
||||
dst.ts.Values = append(dst.ts.Values[:0], srcValues...)
|
||||
}
|
||||
|
||||
@@ -3577,7 +3577,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
q := `sort(histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s]))`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{14, 15, 12, 13, 15, 11},
|
||||
Values: []float64{14, 16, 12, 13, 15, 11},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r1.MetricName.Tags = []storage.Tag{
|
||||
@@ -3607,7 +3607,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
}
|
||||
r3 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{13, 11, 16, 19, 13, 16},
|
||||
Values: []float64{13, 10, 16, 19, 13, 16},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r3.MetricName.Tags = []storage.Tag{
|
||||
@@ -3628,7 +3628,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
q := `sort(sum(histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s])) by (vmrange))`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{14, 15, 12, 13, 15, 11},
|
||||
Values: []float64{14, 16, 12, 13, 15, 11},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r1.MetricName.Tags = []storage.Tag{
|
||||
@@ -3650,7 +3650,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
}
|
||||
r3 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{13, 11, 16, 19, 13, 16},
|
||||
Values: []float64{13, 10, 16, 19, 13, 16},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r3.MetricName.Tags = []storage.Tag{
|
||||
@@ -3678,7 +3678,7 @@ func TestExecSuccess(t *testing.T) {
|
||||
q := `topk_max(1, histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s]))`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{13, 11, 16, 19, 13, 16},
|
||||
Values: []float64{13, 10, 16, 19, 13, 16},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r.MetricName.Tags = []storage.Tag{
|
||||
@@ -4196,14 +4196,63 @@ func TestExecSuccess(t *testing.T) {
|
||||
t.Run(`quantile(NaN)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `quantile(NaN, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
|
||||
resultExpected := []netstorage.Result{}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`outliersk(0)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `outliersk(0, (
|
||||
label_set(1300, "foo", "bar"),
|
||||
label_set(time(), "baz", "sss"),
|
||||
))`
|
||||
resultExpected := []netstorage.Result{}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`outliersk(1)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `outliersk(1, (
|
||||
label_set(2000, "foo", "bar"),
|
||||
label_set(time(), "baz", "sss"),
|
||||
))`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{10, 10, 10, 10.666666666666666, 12, 13.333333333333334},
|
||||
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("baz"),
|
||||
Value: []byte("sss"),
|
||||
}}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`outliersk(3)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `sort_desc(outliersk(3, (
|
||||
label_set(1300, "foo", "bar"),
|
||||
label_set(time(), "baz", "sss"),
|
||||
)))`
|
||||
r1 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r1.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("baz"),
|
||||
Value: []byte("sss"),
|
||||
}}
|
||||
r2 := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{1300, 1300, 1300, 1300, 1300, 1300},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
r2.MetricName.Tags = []storage.Tag{{
|
||||
Key: []byte("foo"),
|
||||
Value: []byte("bar"),
|
||||
}}
|
||||
resultExpected := []netstorage.Result{r1, r2}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`range_quantile(0.5)`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `range_quantile(0.5, time())`
|
||||
@@ -5531,6 +5580,8 @@ func TestExecError(t *testing.T) {
|
||||
f(`hoeffding_bound_upper()`)
|
||||
f(`hoeffding_bound_upper(1)`)
|
||||
f(`hoeffding_bound_upper(0.99, foo, 1)`)
|
||||
f(`outliersk()`)
|
||||
f(`outliersk(1)`)
|
||||
|
||||
// Invalid argument type
|
||||
f(`median_over_time({}, 2)`)
|
||||
@@ -5570,6 +5621,7 @@ func TestExecError(t *testing.T) {
|
||||
f(`alias(1, 2)`)
|
||||
f(`aggr_over_time(1, 2)`)
|
||||
f(`aggr_over_time(("foo", "bar"), 3)`)
|
||||
f(`outliersk((label_set(1, "foo", "bar"), label_set(2, "x", "y")), 123)`)
|
||||
|
||||
// Duplicate timeseries
|
||||
f(`(label_set(1, "foo", "bar") or label_set(2, "foo", "baz"))
|
||||
|
||||
@@ -72,6 +72,8 @@ var rollupFuncs = map[string]newRollupFunc{
|
||||
"aggr_over_time": newRollupFuncTwoArgs(rollupFake),
|
||||
"hoeffding_bound_upper": newRollupHoeffdingBoundUpper,
|
||||
"hoeffding_bound_lower": newRollupHoeffdingBoundLower,
|
||||
"ascent_over_time": newRollupFuncOneArg(rollupAscentOverTime),
|
||||
"descent_over_time": newRollupFuncOneArg(rollupDescentOverTime),
|
||||
|
||||
// `timestamp` function must return timestamp for the last datapoint on the current window
|
||||
// in order to properly handle offset and timestamps unaligned to the current step.
|
||||
@@ -116,6 +118,9 @@ var rollupAggrFuncs = map[string]rollupFunc{
|
||||
"scrape_interval": rollupScrapeInterval,
|
||||
"tmin_over_time": rollupTmin,
|
||||
"tmax_over_time": rollupTmax,
|
||||
"ascent_over_time": rollupAscentOverTime,
|
||||
"descent_over_time": rollupDescentOverTime,
|
||||
"timestamp": rollupTimestamp,
|
||||
}
|
||||
|
||||
var rollupFuncsCannotAdjustWindow = map[string]bool{
|
||||
@@ -138,6 +143,8 @@ var rollupFuncsCannotAdjustWindow = map[string]bool{
|
||||
"increases_over_time": true,
|
||||
"decreases_over_time": true,
|
||||
"integrate": true,
|
||||
"ascent_over_time": true,
|
||||
"descent_over_time": true,
|
||||
}
|
||||
|
||||
var rollupFuncsRemoveCounterResets = map[string]bool{
|
||||
@@ -1527,6 +1534,52 @@ func rollupTimestamp(rfa *rollupFuncArg) float64 {
|
||||
return float64(timestamps[len(timestamps)-1]) / 1e3
|
||||
}
|
||||
|
||||
func rollupAscentOverTime(rfa *rollupFuncArg) float64 {
|
||||
// There is no need in handling NaNs here, since they must be cleaned up
|
||||
// before calling rollup funcs.
|
||||
values := rfa.values
|
||||
prevValue := rfa.prevValue
|
||||
if math.IsNaN(prevValue) {
|
||||
if len(values) == 0 {
|
||||
return nan
|
||||
}
|
||||
prevValue = values[0]
|
||||
values = values[1:]
|
||||
}
|
||||
s := float64(0)
|
||||
for _, v := range values {
|
||||
d := v - prevValue
|
||||
if d > 0 {
|
||||
s += d
|
||||
}
|
||||
prevValue = v
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func rollupDescentOverTime(rfa *rollupFuncArg) float64 {
|
||||
// There is no need in handling NaNs here, since they must be cleaned up
|
||||
// before calling rollup funcs.
|
||||
values := rfa.values
|
||||
prevValue := rfa.prevValue
|
||||
if math.IsNaN(prevValue) {
|
||||
if len(values) == 0 {
|
||||
return nan
|
||||
}
|
||||
prevValue = values[0]
|
||||
values = values[1:]
|
||||
}
|
||||
s := float64(0)
|
||||
for _, v := range values {
|
||||
d := prevValue - v
|
||||
if d > 0 {
|
||||
s += d
|
||||
}
|
||||
prevValue = v
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func rollupFirst(rfa *rollupFuncArg) float64 {
|
||||
// There is no need in handling NaNs here, since they must be cleaned up
|
||||
// before calling rollup funcs.
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
|
||||
@@ -64,18 +65,18 @@ func InitRollupResultCache(cachePath string) {
|
||||
|
||||
stats := &fastcache.Stats{}
|
||||
var statsLock sync.Mutex
|
||||
var statsLastUpdate time.Time
|
||||
var statsLastUpdate uint64
|
||||
fcs := func() *fastcache.Stats {
|
||||
statsLock.Lock()
|
||||
defer statsLock.Unlock()
|
||||
|
||||
if time.Since(statsLastUpdate) < time.Second {
|
||||
if fasttime.UnixTimestamp()-statsLastUpdate < 2 {
|
||||
return stats
|
||||
}
|
||||
var fcs fastcache.Stats
|
||||
c.UpdateStats(&fcs)
|
||||
stats = &fcs
|
||||
statsLastUpdate = time.Now()
|
||||
statsLastUpdate = fasttime.UnixTimestamp()
|
||||
return stats
|
||||
}
|
||||
if len(rollupResultCachePath) > 0 {
|
||||
|
||||
@@ -389,6 +389,9 @@ func TestRollupNewRollupFuncSuccess(t *testing.T) {
|
||||
f("ideriv", 0)
|
||||
f("decreases_over_time", 5)
|
||||
f("increases_over_time", 5)
|
||||
f("ascent_over_time", 142)
|
||||
f("descent_over_time", 231)
|
||||
f("timestamp", 0.13)
|
||||
}
|
||||
|
||||
func TestRollupNewRollupFuncError(t *testing.T) {
|
||||
|
||||
@@ -409,6 +409,16 @@ func registerStorageMetrics() {
|
||||
return float64(m().AddRowsConcurrencyCurrent)
|
||||
})
|
||||
|
||||
metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
|
||||
return float64(m().SlowRowInserts)
|
||||
})
|
||||
metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
|
||||
return float64(m().SlowPerDayIndexInserts)
|
||||
})
|
||||
metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
|
||||
return float64(m().SlowMetricNameLoads)
|
||||
})
|
||||
|
||||
metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
|
||||
return float64(tm().BigRowsCount)
|
||||
})
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,9 +2,9 @@
|
||||
|
||||
DOCKER_NAMESPACE := victoriametrics
|
||||
|
||||
ROOT_IMAGE ?= scratch
|
||||
CERTS_IMAGE := alpine:3.11
|
||||
GO_BUILDER_IMAGE := golang:1.14.2
|
||||
ROOT_IMAGE ?= alpine:3.12
|
||||
CERTS_IMAGE := alpine:3.12
|
||||
GO_BUILDER_IMAGE := golang:1.14.4
|
||||
BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr : _)
|
||||
BASE_IMAGE := local/base:1.1.1-$(shell echo $(ROOT_IMAGE) | tr : _)-$(shell echo $(CERTS_IMAGE) | tr : _)
|
||||
|
||||
@@ -33,7 +33,7 @@ app-via-docker: package-base package-builder
|
||||
$(DOCKER_OPTS) \
|
||||
$(BUILDER_IMAGE) \
|
||||
go build $(RACE) -mod=vendor -trimpath \
|
||||
-ldflags "-s -w -extldflags '-static' $(GO_BUILDINFO)" \
|
||||
-ldflags "-extldflags '-static' $(GO_BUILDINFO)" \
|
||||
-tags 'netgo osusergo nethttpomithttp2' \
|
||||
-o bin/$(APP_NAME)$(APP_SUFFIX)-prod $(PKG_PREFIX)/app/$(APP_NAME)
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ version: '3.5'
|
||||
services:
|
||||
prometheus:
|
||||
container_name: prometheus
|
||||
image: prom/prometheus:v2.17.2
|
||||
image: prom/prometheus:v2.18.1
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
@@ -35,7 +35,7 @@ services:
|
||||
restart: always
|
||||
grafana:
|
||||
container_name: grafana
|
||||
image: grafana/grafana:6.7.2
|
||||
image: grafana/grafana:7.0.2
|
||||
entrypoint: >
|
||||
/bin/sh -c "
|
||||
cd /var/lib/grafana &&
|
||||
|
||||
@@ -26,11 +26,14 @@
|
||||
* [Billy: how VictoriaMetrics deals with more than 500 billion rows](https://medium.com/@valyala/billy-how-victoriametrics-deals-with-more-than-500-billion-rows-e82ff8f725da)
|
||||
|
||||
|
||||
## Third-party articles
|
||||
## Third-party articles and slides
|
||||
|
||||
* [Better Prometheus rate() function with VictoriaMetrics](https://www.percona.com/blog/2020/02/28/better-prometheus-rate-function-with-victoriametrics/)
|
||||
* [Infrastructure monitoring with Prometheus at Zerodha](https://zerodha.tech/blog/infra-monitoring-at-zerodha/)
|
||||
* [Sismology: Iguana Solutions’ Monitoring System](https://medium.com/@IG1.com/sismology-iguana-solutions-monitoring-system-f46e4170447f)
|
||||
* [Monitoring K8S with VictoriaMetrics](https://docs.google.com/presentation/d/1g7yUyVEaAp4tPuRy-MZbPXKqJ1z78_5VKuV841aQfsg/edit)
|
||||
* [CMS monitoring R&D: Real-time monitoring and alerts](https://indico.cern.ch/event/877333/contributions/3696707/attachments/1972189/3281133/CMS_mon_RD_for_opInt.pdf)
|
||||
* [Disk usage: VictoriaMetrics vs Prometheus](https://stas.starikevich.com/posts/disk-usage-for-vm-versus-prometheus/)
|
||||
* [Benchmarking time series workloads on Apache Kudu using TSBS](https://blog.cloudera.com/benchmarking-time-series-workloads-on-apache-kudu-using-tsbs/)
|
||||
* [What are Open Source Time Series Databases?](https://www.iunera.com/kraken/fabric/time-series-database/)
|
||||
* [Evaluating performance and correctness](https://www.robustperception.io/evaluating-performance-and-correctness)
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
Below are approved public case studies and talks from VictoriaMetrics users. Join our [community Slack channel](http://slack.victoriametrics.com/)
|
||||
and feel free asking for references, reviews and additional case studies from real VictoriaMetrics users there.
|
||||
|
||||
See also [articles about VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Articles).
|
||||
|
||||
|
||||
## Adidas
|
||||
|
||||
@@ -39,6 +41,28 @@ See [slides](https://speakerdeck.com/inletorder/monitoring-platform-with-victori
|
||||
from `Large-scale, super-load system monitoring platform built with VictoriaMetrics` talk at [Prometheus Meetup Tokyo #3](https://prometheus.connpass.com/event/157721/).
|
||||
|
||||
|
||||
## Zerodha
|
||||
|
||||
[Zerodha](https://zerodha.com/) is India's largest stock broker. Monitoring team at Zerodha faced with the following requirements:
|
||||
|
||||
* Multiple K8s clusters to monitor
|
||||
* Consistent monitoring infra for each cluster across the fleet
|
||||
* Ability to handle billions of timeseries events at any point of time
|
||||
* Easier to operate and cost effective
|
||||
|
||||
Thanos, Cortex and VictoriaMetrics were evaluated as a long-term storage for Prometheus. VictoriaMetrics has been selected due to the following reasons:
|
||||
|
||||
* Blazing fast benchmarks for a single node setup.
|
||||
* Single binary mode. Easy to scale vertically, very less operational headache.
|
||||
* Considerable [improvements on creating Histograms](https://medium.com/@valyala/improving-histogram-usability-for-prometheus-and-grafana-bc7e5df0e350).
|
||||
* [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL) gives us the ability to extend PromQL with more aggregation operators.
|
||||
* API is compatible with Prometheus, almost all standard PromQL queries just work out of the box.
|
||||
* Handles storage well, with periodic compaction. Makes it easy to take snapshots.
|
||||
|
||||
See [Monitoring K8S with VictoriaMetrics](https://docs.google.com/presentation/d/1g7yUyVEaAp4tPuRy-MZbPXKqJ1z78_5VKuV841aQfsg/edit) slides,
|
||||
[video](https://youtu.be/ZJQYW-cFOms) and [Infrastructure monitoring with Prometheus at Zerodha](https://zerodha.tech/blog/infra-monitoring-at-zerodha/) blog post for more details.
|
||||
|
||||
|
||||
## Wix.com
|
||||
|
||||
[Wix.com](https://en.wikipedia.org/wiki/Wix.com) is the leading web development platform.
|
||||
@@ -209,7 +233,7 @@ Such a scheme has the following benefits comparing to Prometheus:
|
||||
|
||||
Cons are the following:
|
||||
|
||||
- VictoriaMetrics doesn't support replication - we run extra instance of VictoriaMetrics and Promxy in front of VictoriaMetrics pair for high availability.
|
||||
- VictoriaMetrics didn't support replication (it [supports replication now](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety)) - we run extra instance of VictoriaMetrics and Promxy in front of VictoriaMetrics pair for high availability.
|
||||
- VictoriaMetrics stores 1 extra month for defined retention (if retention is set to N months, then VM stores N+1 months of data), but this is still better than other solutions.
|
||||
|
||||
Some numbers from our single-node VictoriaMetrics setup:
|
||||
|
||||
@@ -16,8 +16,9 @@ Join [our Slack](http://slack.victoriametrics.com/) or [contact us](mailto:info@
|
||||
## Prominent features
|
||||
|
||||
- Supports all the features of [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
- Performance and capacity scales horizontally.
|
||||
- Supports multiple independent namespaces for time series data (aka multi-tenancy).
|
||||
- Performance and capacity scales horizontally. See [these docs for details](#cluster-resizing-and-scalability).
|
||||
- Supports multiple independent namespaces for time series data (aka multi-tenancy). See [these docs for details](#multitenancy).
|
||||
- Supports replication. See [these docs for details](#replication-and-data-safety).
|
||||
|
||||
|
||||
## Architecture overview
|
||||
@@ -45,7 +46,8 @@ See [these docs](#url-format) for details. Some facts about tenants in VictoriaM
|
||||
* Each `accountID` and `projectID` is identified by an arbitrary 32-bit integer in the range `[0 .. 2^32)`.
|
||||
If `projectID` is missing, then it is automatically assigned to `0`. It is expected that other information about tenants
|
||||
such as auth tokens, tenant names, limits, accounting, etc. is stored in a separate relational database. This database must be managed
|
||||
by a separate service sitting in front of VictoriaMetrics cluster.
|
||||
by a separate service sitting in front of VictoriaMetrics cluster such as [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md).
|
||||
[Contact us](mailto:info@victoriametrics.com) if you need help with creating such a service.
|
||||
|
||||
* Tenants are automatically created when the first data point is written into the given tenant.
|
||||
|
||||
@@ -110,11 +112,11 @@ Run `make package`. It will build the following docker images locally:
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package`.
|
||||
|
||||
By default images are built on top of `scratch` image. It is possible to build on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds images on top of `alpine:3.11` image:
|
||||
By default images are built on top of `alpine` image in order to improve debuggability. It is possible to build an image on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds images on top of `scratch` image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package
|
||||
ROOT_IMAGE=scratch make package
|
||||
```
|
||||
|
||||
## Operation
|
||||
@@ -130,7 +132,7 @@ A minimal cluster must contain the following nodes:
|
||||
It is recommended to run at least two nodes for each service
|
||||
for high availability purposes.
|
||||
|
||||
An http load balancer must be put in front of `vminsert` and `vmselect` nodes:
|
||||
An http load balancer such as `nginx` must be put in front of `vminsert` and `vmselect` nodes:
|
||||
- requests starting with `/insert` must be routed to port `8480` on `vminsert` nodes.
|
||||
- requests starting with `/select` must be routed to port `8481` on `vmselect` nodes.
|
||||
|
||||
@@ -156,7 +158,8 @@ By default the following TCP ports are used:
|
||||
- `vmselect` - 8481
|
||||
- `vmstorage` - 8482
|
||||
|
||||
It is recommended setting up Prometheus to scrape `/metrics` pages from all the cluster components, so they can be monitored and analyzed
|
||||
It is recommended setting up [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md)
|
||||
or Prometheus to scrape `/metrics` pages from all the cluster components, so they can be monitored and analyzed
|
||||
with [the official Grafana dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11176)
|
||||
or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11831).
|
||||
|
||||
@@ -167,21 +170,25 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
|
||||
- `<accountID>` is an arbitrary 32-bit integer identifying namespace for data ingestion (aka tenant). It is possible to set it as `accountID:projectID`,
|
||||
where `projectID` is also arbitrary 32-bit integer. If `projectID` isn't set, then it equals to `0`.
|
||||
- `<suffix>` may have the following values:
|
||||
- `prometheus` - for inserting data with [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
|
||||
- `influx/write` or `influx/api/v2/write` - for inserting data with [Influx line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/)
|
||||
- `prometheus` and `prometheus/api/v1/write` - for inserting data with [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
|
||||
- `influx/write` and `influx/api/v2/write` - for inserting data with [Influx line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/).
|
||||
- `opentsdb/api/put` - for accepting [OpenTSDB HTTP /api/put requests](http://opentsdb.net/docs/build/html/api_http/put.html).
|
||||
This handler is disabled by default. It is exposed on a distinct TCP address set via `-opentsdbHTTPListenAddr` command-line flag.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#sending-opentsdb-data-via-http-apiput-requests) for details.
|
||||
- `prometheus/api/v1/import` - for importing data obtained via `api/v1/export` on `vmselect` (see below).
|
||||
- `prometheus/api/v1/import/csv` - for importing arbitrary CSV data. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-csv-data) for details.
|
||||
|
||||
* URLs for querying: `http://<vmselect>:8481/select/<accountID>/prometheus/<suffix>`, where:
|
||||
- `<accountID>` is an arbitrary number identifying data namespace for the query (aka tenant)
|
||||
- `<suffix>` may have the following values:
|
||||
- `api/v1/query` - performs [PromQL instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries)
|
||||
- `api/v1/query_range` - performs [PromQL range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries)
|
||||
- `api/v1/series` - performs [series query](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers)
|
||||
- `api/v1/labels` - returns a [list of label names](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names)
|
||||
- `api/v1/label/<label_name>/values` - returns values for the given `<label_name>` according [to API](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values)
|
||||
- `federate` - returns [federated metrics](https://prometheus.io/docs/prometheus/latest/federation/)
|
||||
- `api/v1/export` - exports raw data. See [this article](https://medium.com/@valyala/analyzing-prometheus-data-with-external-tools-5f3e5e147639) for details
|
||||
- `api/v1/query` - performs [PromQL instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries).
|
||||
- `api/v1/query_range` - performs [PromQL range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries).
|
||||
- `api/v1/series` - performs [series query](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers).
|
||||
- `api/v1/labels` - returns a [list of label names](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names).
|
||||
- `api/v1/label/<label_name>/values` - returns values for the given `<label_name>` according [to API](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values).
|
||||
- `federate` - returns [federated metrics](https://prometheus.io/docs/prometheus/latest/federation/).
|
||||
- `api/v1/export` - exports raw data. See [this article](https://medium.com/@valyala/analyzing-prometheus-data-with-external-tools-5f3e5e147639) for details.
|
||||
- `api/v1/status/tsdb` - for time series stats. See [these docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats) for details.
|
||||
|
||||
* URL for time series deletion: `http://<vmselect>:8481/delete/<accountID>/prometheus/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`.
|
||||
Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. It shouldn't
|
||||
@@ -199,7 +206,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
|
||||
across `vmstorage` nodes.
|
||||
|
||||
|
||||
### Cluster resizing and scalability.
|
||||
### Cluster resizing and scalability
|
||||
|
||||
Cluster performance and capacity scales with adding new nodes.
|
||||
|
||||
@@ -246,6 +253,8 @@ Each instance type - `vminsert`, `vmselect` and `vmstorage` - can run on the mos
|
||||
* The recommended total number of vCPU cores for all the `vminsert` instances can be calculated from the ingestion rate: `vCPUs = ingestion_rate / 150K`.
|
||||
* The recommended number of vCPU cores per each `vminsert` instance should equal to the number of `vmstorage` instances in the cluster.
|
||||
* The amount of RAM per each `vminsert` instance should be 1GB or more. RAM is used as a buffer for spikes in ingestion rate.
|
||||
The maximum amount of used RAM per `vminsert` node can be tuned with `-memory.allowedPercent` command-line flag. For instance, `-memory.allowedPercent=20`
|
||||
limits the maximum amount of used RAM to 20% of the available RAM on the host system.
|
||||
* Sometimes `-rpc.disableCompression` command-line flag on `vminsert` instances could increase ingestion capacity at the cost
|
||||
of higher network bandwidth usage between `vminsert` and `vmstorage`.
|
||||
|
||||
@@ -277,7 +286,18 @@ Upgrade follows `Cluster resizing procedure` under the hood.
|
||||
|
||||
### Replication and data safety
|
||||
|
||||
VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
|
||||
In order to enable application-level replication, `-replicationFactor=N` command-line flag must be passed to `vminsert`.
|
||||
This guarantees that all the data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable.
|
||||
For example, when `-replicationFactor=3` is passed to `vminsert`, then it replicates all the ingested data to 3 distinct `vmstorage` nodes.
|
||||
|
||||
When the replication is enabled, `-dedup.minScrapeInterval=1ms` command-line flag must be passed to `vmselect`
|
||||
in order to de-duplicate replicated data during queries. It is OK if `-dedup.minScrapeInterval` exceeds 1ms
|
||||
when [deduplication](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#deduplication) is used additionally to replication.
|
||||
|
||||
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
|
||||
so it is recommended performing regular backups. See [these docs](#backups) for details.
|
||||
|
||||
By default VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
|
||||
It is recommended storing data on [Google Compute Engine persistent disks](https://cloud.google.com/compute/docs/disks/#pdspecs),
|
||||
since they are protected from data loss and data corruption. They also provide consistently high performance
|
||||
and [may be resized](https://cloud.google.com/compute/docs/disks/add-persistent-disk) without downtime.
|
||||
@@ -285,8 +305,6 @@ HDD-based persistent disks should be enough for the majority of use cases.
|
||||
|
||||
It is recommended using durable replicated persistent volumes in Kubernetes.
|
||||
|
||||
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
|
||||
|
||||
|
||||
### Backups
|
||||
|
||||
@@ -326,8 +344,7 @@ Due to `KISS` cluster version of VictoriaMetrics has no the following "features"
|
||||
|
||||
- Fragile gossip protocols. See [failed attempt in Thanos](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md).
|
||||
- Hard-to-understand-and-implement-properly [Paxos protocols](https://www.quora.com/In-distributed-systems-what-is-a-simple-explanation-of-the-Paxos-algorithm).
|
||||
- Complex replication schemes, which may go nuts in unforesseen edge cases. The replication is offloaded to the underlying durable replicated storage
|
||||
such as [persistent disks in Google Compute Engine](https://cloud.google.com/compute/docs/disks/#pdspecs).
|
||||
- Complex replication schemes, which may go nuts in unforesseen edge cases. See [replication docs](#replication-and-data-safety) for details.
|
||||
- Automatic data reshuffling between storage nodes, which may hurt cluster performance and availability.
|
||||
- Automatic cluster resizing, which may cost you a lot of money if improperly configured.
|
||||
- Automatic discovering and addition of new nodes in the cluster, which may mix data between dev and prod clusters :)
|
||||
|
||||
140
docs/FAQ.md
140
docs/FAQ.md
@@ -2,64 +2,64 @@
|
||||
|
||||
### What is the main purpose of VictoriaMetrics?
|
||||
|
||||
To provide the best long-term [remote storage](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) solution for [Prometheus](https://prometheus.io/).
|
||||
To provide the best monitoring solution.
|
||||
|
||||
|
||||
### Who uses VictoriaMetrics?
|
||||
|
||||
See [case studies](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies).
|
||||
|
||||
|
||||
### Which features does VictoriaMetrics have?
|
||||
|
||||
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
|
||||
Additionally, VictoriaMetrics extends PromQL with opt-in [useful features](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL).
|
||||
* High performance and good scalability for both [inserts](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b)
|
||||
and [selects](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4).
|
||||
[Outperforms InfluxDB and TimescaleDB by up to 20x](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
|
||||
* [Uses 10x less RAM than InfluxDB](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) when working with millions of unique time series (aka high cardinality).
|
||||
* High data compression, so [up to 70x more data points](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4)
|
||||
may be crammed into a limited storage comparing to TimescaleDB.
|
||||
* Optimized for storage with high-latency IO and low iops (HDD and network storage in AWS, Google Cloud, Microsoft Azure, etc). See [graphs from these benchmarks](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b).
|
||||
* A single-node VictoriaMetrics may substitute moderately sized clusters built with competing solutions such as Thanos, M3DB, Cortex, InfluxDB or TimescaleDB.
|
||||
See [vertical scalability benchmarks](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae)
|
||||
and [comparing Thanos to VictoriaMetrics](https://medium.com/@valyala/comparing-thanos-to-victoriametrics-cluster-b193bea1683).
|
||||
* Easy operation:
|
||||
* VictoriaMetrics consists of a single executable without external dependencies.
|
||||
* All the configuration is done via explicit command-line flags with reasonable defaults.
|
||||
* All the data is stored in a single directory pointed by `-storageDataPath` flag.
|
||||
* Easy backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
* Storage is protected from corruption on unclean shutdown (i.e. hardware reset or `kill -9`) thanks to [the storage architecture](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
* Supports metrics' ingestion and backfilling via the following protocols:
|
||||
* [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
|
||||
* [InfluxDB line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/)
|
||||
* [Graphite plaintext protocol](https://graphite.readthedocs.io/en/latest/feeding-carbon.html) with [tags](https://graphite.readthedocs.io/en/latest/tags.html#carbon)
|
||||
if `-graphiteListenAddr` is set.
|
||||
* [OpenTSDB put message](http://opentsdb.net/docs/build/html/api_telnet/put.html) if `-opentsdbListenAddr` is set.
|
||||
* Ideally works with big amounts of time series data from IoT sensors, connected car sensors and industrial sensors.
|
||||
* Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
|
||||
### Which clients do you target?
|
||||
|
||||
The following Prometheus users may be interested in VictoriaMetrics:
|
||||
- Users who don't want to bother with Prometheus' local storage operational burden - backups, replication, capacity planning, scalability, etc.
|
||||
- Users with multiple Prometheus instances who want performing arbitrary queries over all the metrics collected by their Prometheus instances (aka `global querying view`).
|
||||
- Users who want reducing costs for storing huge amounts of time series data.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#prominent-features).
|
||||
|
||||
|
||||
### How to start using VictoriaMetrics?
|
||||
|
||||
Start with [single-node version](Single-server-VictoriaMetrics). It is easy to configure and operate. It should fit the majority of use cases.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Quick-Start).
|
||||
|
||||
|
||||
### Is it safe to enable [remote write storage](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) in Prometheus?
|
||||
### What is the difference between vmagent and Prometheus?
|
||||
|
||||
Yes. Prometheus continues writing data to local storage after enabling remote storage write, so all the existing local storage data
|
||||
While both [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Prometheus may scrape Prometheus targets (aka `/metrics` pages)
|
||||
according to the provided Prometheus-compatible [scrape configs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config)
|
||||
and send data to multiple remote storage systems, vmagent has the following additional features:
|
||||
|
||||
- vmagent usually requires lower amounts of CPU, RAM and disk IO comparing to Prometheus when scraping big number of targets (more than 1000)
|
||||
or targets with big number of exposed metrics.
|
||||
- vmagent provides independent disk-backed buffers per each configured remote storage (aka `-remoteWrite.url`). This means that slow or temporarily unavailable storage
|
||||
doesn't prevent from sending data to healthy storage in parallel. Prometheus uses a single shared buffer for all the configured remote storage systems (aka `remote_write->url`)
|
||||
with the hardcoded retention of 2 hours.
|
||||
- vmagent may accept, relabel and filter data obtained via multiple data ingestion protocols additionally to data scraped from Prometheus targets.
|
||||
I.e. it supports both `pull` and `push` protocols for data ingestion.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#features) for details.
|
||||
- vmagent may be used in different use cases:
|
||||
- [IoT and edge monitoring](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#iot-and-edge-monitoring)
|
||||
- [Drop-in replacement for Prometheus](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#drop-in-replacement-for-prometheus)
|
||||
- [Replication and High Availability](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#replication-and-high-availability)
|
||||
- [Relabeling and Filtering](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#relabeling-and-filtering)
|
||||
- [Splitting data streams among multiple systems](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#splitting-data-streams-among-multiple-systems)
|
||||
- [Prometheus remote_write proxy](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#prometheus-remote_write-proxy)
|
||||
|
||||
|
||||
### Is it safe to enable [remote write](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) in Prometheus?
|
||||
|
||||
Yes. Prometheus continues writing data to local storage after enabling remote write, so all the existing local storage data
|
||||
and new data is available for querying via Prometheus as usual.
|
||||
|
||||
It is recommended using [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) for scraping Prometheus targets
|
||||
and writing data to VictoriaMetrics.
|
||||
|
||||
|
||||
### How does VictoriaMetrics compare to other remote storage solutions for Prometheus such as [M3 from Uber](https://eng.uber.com/m3/), [Thanos](https://github.com/thanos-io/thanos), [Cortex](https://github.com/cortexproject/cortex), etc.?
|
||||
|
||||
VictoriaMetrics is simpler, faster, more cost-effective and it provides [MetricsQL with useful extensions for PromQL](MetricsQL). The simplicity is twofold:
|
||||
- It is simpler to configure and operate. There is no need in configuring third-party [sidecars](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md)
|
||||
or fighting with [gossip protocol](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md).
|
||||
- VictoriaMetrics has simpler architecture, which means less bugs and more useful features in the long run comparing to competing TSDBs.
|
||||
VictoriaMetrics is simpler, faster, more cost-effective and it provides [MetricsQL query language](MetricsQL) based on PromQL. The simplicity is twofold:
|
||||
- It is simpler to configure and operate. There is no need in configuring [sidecars](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md),
|
||||
fighting [gossip protocol](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md)
|
||||
or setting up third-party systems such as [Consul](https://github.com/cortexproject/cortex/issues/157), [Cassandra](https://cortexmetrics.io/docs/production/cassandra/),
|
||||
[DynamoDB](https://cortexmetrics.io/docs/production/aws/) or [Memcached](https://cortexmetrics.io/docs/production/caching/).
|
||||
- VictoriaMetrics has simpler architecture. This means less bugs and more useful features in the long run comparing to competing TSDBs.
|
||||
|
||||
See [comparing Thanos to VictoriaMetrics cluster](https://medium.com/@valyala/comparing-thanos-to-victoriametrics-cluster-b193bea1683)
|
||||
and [Remote Write Storage Wars](https://promcon.io/2019-munich/talks/remote-write-storage-wars/) talk from [PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
|
||||
@@ -70,55 +70,68 @@ VictoriaMetrics also [uses less RAM than Thanos components](https://github.com/t
|
||||
### What is the difference between VictoriaMetrics and [Cortex](https://github.com/cortexproject/cortex)?
|
||||
|
||||
VictoriaMetrics is similar to Cortex in the following aspects:
|
||||
- Both systems accept data from Prometheus via standard [remote_write API](https://prometheus.io/docs/practices/remote_write/),
|
||||
i.e. there is no need in running sidecars unlike in [Thanos](https://github.com/thanos-io/thanos) case.
|
||||
- Both systems support multi-tenancy out of the box. See [the corresponding docs for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format).
|
||||
- Both systems accept data from [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) or Prometheus
|
||||
via standard [remote_write API](https://prometheus.io/docs/practices/remote_write/), i.e. there is no need in running sidecars
|
||||
unlike in [Thanos](https://github.com/thanos-io/thanos) case.
|
||||
- Both systems support multi-tenancy out of the box. See [the corresponding docs for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#multitenancy).
|
||||
- Both systems support data replication. See [replication in Cortex](https://github.com/cortexproject/cortex/blob/fe56f1420099aa1bf1ce09316c186e05bddee879/docs/architecture.md#hashing) and [replication in VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety).
|
||||
- Both systems scale horizontally to multiple nodes. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#cluster-resizing-and-scalability) for details.
|
||||
- Both systems support alerting and recording rules via the corresponding tools such as [vmalert](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md).
|
||||
|
||||
|
||||
The main differences between Cortex and VictoriaMetrics:
|
||||
- Cortex re-uses Prometheus source code, while VictoriaMetrics is written from scratch.
|
||||
- Cortex provides [Ruler](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#ruler) and [Alertmanager](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#alertmanager) components,
|
||||
which are currently missing in VictoriaMetrics. However, these components can be substituted by [Promxy](https://github.com/jacksontj/promxy#how-do-i-use-alertingrecording-rules-in-promxy).
|
||||
- Cortex heavily relies on third-party services such as Consul, Memcache, DynamoDB, BigTable, Cassandra, etc.
|
||||
This may increase operational complexity and reduce system reliability comparing to VictoriaMetrics' case,
|
||||
which doesn't use any external services. Compare [Cortex Architecture](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md)
|
||||
to [VictoriaMetrics architecture](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#architecture-overview).
|
||||
- VictoriaMetrics provides [production-ready single-node solution](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md),
|
||||
which is much easier to setup and operate than Cortex cluster.
|
||||
- Cortex may lose up to 12 hours of recent data on Ingestor failure - see [the corresponding docs](https://github.com/cortexproject/cortex/blob/master/docs/architecture.md#ingesters-failure-and-data-loss).
|
||||
- Cortex may lose up to 12 hours of recent data on Ingestor failure - see [the corresponding docs](https://github.com/cortexproject/cortex/blob/fe56f1420099aa1bf1ce09316c186e05bddee879/docs/architecture.md#ingesters-failure-and-data-loss).
|
||||
VictoriaMetrics may lose only a few seconds of recent data, which isn't synced to persistent storage yet.
|
||||
See [this article for details](https://medium.com/@valyala/wal-usage-looks-broken-in-modern-time-series-databases-b62a627ab704).
|
||||
- Cortex is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
|
||||
- Cortex is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/) and [other case studies](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies).
|
||||
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to Prometheus remote_write protocol - InfluxDB, OpenTSDB, Graphite, CSV.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
|
||||
|
||||
|
||||
### What is the difference between VictoriaMetrics and [Thanos](https://github.com/thanos-io/thanos)?
|
||||
|
||||
- Thanos re-uses Prometheus source code, while VictoriaMetrics is written from scratch.
|
||||
- Thanos provides [Ruler component](https://github.com/thanos-io/thanos/blob/master/docs/components/rule.md),
|
||||
while VictoriaMetrics relies on [Promxy for alerting and recording rules](https://github.com/jacksontj/promxy#how-do-i-use-alertingrecording-rules-in-promxy).
|
||||
- VictoriaMetrics accepts data via [standard remote_write API for Prometheus](https://prometheus.io/docs/practices/remote_write/),
|
||||
while Thanos uses non-standard [Sidecar](https://github.com/thanos-io/thanos/blob/master/docs/components/sidecar.md), which must run alongside each Prometheus instance.
|
||||
- Thanos Sidecar requires disabling data compaction in Prometheus, which may hurt Prometheus performance and increase RAM usage.
|
||||
- Thanos stores data on object storage (Amazon S3 or Google GCS), while VictoriaMetrics stores data on block storage (GCP persistent disks, Amazon EBS or bare metal HDD).
|
||||
- Thanos Sidecar requires disabling data compaction in Prometheus, which may hurt Prometheus performance and increase RAM usage. See [these docs](https://thanos.io/components/sidecar.md/) for more details.
|
||||
- Thanos stores data in object storage (Amazon S3 or Google GCS), while VictoriaMetrics stores data in block storage
|
||||
([GCP persistent disks](https://cloud.google.com/compute/docs/disks#pdspecs), Amazon EBS or bare metal HDD).
|
||||
While object storage is usually less expensive, block storage provides much lower latencies and higher throughput.
|
||||
VictoriaMetrics works perfectly with HDD-based block storage - there is no need in using more expensive SSD or NVMe disks in most cases.
|
||||
- Thanos may lose up to 2 hours of recent data, which wasn't uploaded yet to object storage. VictoriaMetrics may lose only a few seconds of recent data,
|
||||
which isn't synced to persistent storage yet. See [this article for details](https://medium.com/@valyala/wal-usage-looks-broken-in-modern-time-series-databases-b62a627ab704).
|
||||
- VictoriaMetrics provides [production-ready single-node solution](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md),
|
||||
which is much easier to setup and operate than Thanos components.
|
||||
- Thanos may be harder to setup and operate comparing to VictoriaMetrics, since it has more moving parts, which can be connected with less reliable networks.
|
||||
See [this article for details](https://medium.com/faun/comparing-thanos-to-victoriametrics-cluster-b193bea1683).
|
||||
- Thanos is usually slower and requires more CPU and RAM than VictoriaMetrics. See [this talk from Adidas at PromCon 2019](https://promcon.io/2019-munich/talks/remote-write-storage-wars/).
|
||||
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to Prometheus remote_write protocol - InfluxDB, OpenTSDB, Graphite, CSV.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
|
||||
|
||||
|
||||
### How does VictoriaMetrics compare to [InfluxDB](https://www.influxdata.com/time-series-platform/influxdb/)?
|
||||
|
||||
VictoriaMetrics requires [10x less RAM](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) and it [works faster](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
|
||||
It is easier to configure and operate. It provides [better query language](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085) than InfluxQL or Flux.
|
||||
- VictoriaMetrics requires [10x less RAM](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) and it [works faster](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
|
||||
- VictoriaMetrics provides [better query language](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085) than InfluxQL or Flux.
|
||||
- VictoriaMetrics accepts data in multiple popular data ingestion protocols additionally to InfluxDB - Prometheus remote_write, OpenTSDB, Graphite, CSV.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data) for details.
|
||||
|
||||
|
||||
### How does VictoriaMetrics compare to [TimescaleDB](https://www.timescale.com/)?
|
||||
|
||||
TimescaleDB insists on using SQL as a query language. While SQL is more powerful than PromQL, this power is rarely required during typical TSDB usage. Real-world queries usually [look clearer and simpler when written in PromQL than in SQL](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085).
|
||||
Additionally, VictoriaMetrics requires [up to 70x less storage space comparing to TimescaleDB](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4) for storing the same amount of time series data.
|
||||
- TimescaleDB insists on using SQL as a query language. While SQL is more powerful than PromQL, this power is rarely required during typical TSDB usage. Real-world queries usually [look clearer and simpler when written in PromQL than in SQL](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085).
|
||||
- VictoriaMetrics requires [up to 70x less storage space comparing to TimescaleDB](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4) for storing the same amount of time series data. The gap in storage space usage can be lowered from 70x to 3x if [compression in TimescaleDB is properly configured](https://docs.timescale.com/latest/using-timescaledb/compression) (it isn't an easy task in general case :)).
|
||||
- VictoriaMetrics accepts data in multiple popular data ingestion protocols - InfluxDB, OpenTSDB, Graphite, CSV, while TimescaleDB supports only SQL inserts.
|
||||
|
||||
|
||||
### Does VictoriaMetrics use Prometheus technologies like other clustered TSDBs built on top of Prometheus such as [Thanos](https://github.com/thanos-io/thanos), [Cortex](https://github.com/cortexproject/cortex)?
|
||||
### Does VictoriaMetrics use Prometheus technologies like other clustered TSDBs built on top of Prometheus such as [Thanos](https://github.com/thanos-io/thanos) or [Cortex](https://github.com/cortexproject/cortex)?
|
||||
|
||||
No. VictoriaMetrics core is written in Go from scratch by [fasthttp](https://github.com/valyala/fasthttp) [author](https://github.com/valyala).
|
||||
The architecture is [optimized for storing and querying large amounts of time series data with high cardinality](https://medium.com/devopslinks/victoriametrics-creating-the-best-remote-storage-for-prometheus-5d92d66787ac). VictoriaMetrics storage uses [certain ideas from ClickHouse](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282). Special thanks to [Alexey Milovidov](https://github.com/alexey-milovidov).
|
||||
@@ -136,6 +149,8 @@ Yes:
|
||||
* [TSBS benchmark on high-cardinality time series: VictoriaMetrics vs InfluxDB vs TimescaleDB](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b)
|
||||
* [Standard TSBS benchmark: VictoriaMetrics vs InfluxDB vs TimescaleDB](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4)
|
||||
|
||||
See also [other articles about VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Articles).
|
||||
|
||||
|
||||
### What is the pricing for VictoriaMetrics?
|
||||
|
||||
@@ -145,11 +160,11 @@ The following versions are open source and free:
|
||||
|
||||
We provide commercial support for both versions. [Contact us](mailto:info@victoriametrics.com) for the pricing.
|
||||
|
||||
The following versions are commercial:
|
||||
The following commercial versions of VictoriaMetrics are planned:
|
||||
* Managed cluster in the Cloud.
|
||||
* SaaS version.
|
||||
|
||||
[Contact us](mailto:info@victoriametrics.com) for the pricing.
|
||||
[Contact us](mailto:info@victoriametrics.com) for more information on our plans.
|
||||
|
||||
|
||||
### Why VictoriaMetrics doesn't support [Prometheus remote read API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#%3Cremote_read%3E)?
|
||||
@@ -168,6 +183,11 @@ or via [Prometheus datasource in Grafana](http://docs.grafana.org/features/datas
|
||||
Yes. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#deduplication) for details.
|
||||
|
||||
|
||||
### Does VictoriaMetrics support replication?
|
||||
|
||||
Yes. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
|
||||
|
||||
|
||||
### Where is the source code of VictoriaMetrics?
|
||||
|
||||
Source code for the following versions is available in the following places:
|
||||
|
||||
@@ -4,6 +4,8 @@ VictoriaMetrics implements MetricsQL - query language inspired by [PromQL](https
|
||||
It is backwards compatible with PromQL, so Grafana dashboards backed by Prometheus datasource should work the same after switching from Prometheus to VictoriaMetrics.
|
||||
[Standalone MetricsQL package](https://godoc.org/github.com/VictoriaMetrics/metricsql) can be used for parsing MetricsQL in external apps.
|
||||
|
||||
If you are unfamiliar with PromQL, then it is suggested reading [this tutorial for beginners](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085).
|
||||
|
||||
The following functionality is implemented differently in MetricsQL comparing to PromQL in order to improve user experience:
|
||||
* MetricsQL takes into account the previous point before the window in square brackets for range functions such as `rate` and `increase`.
|
||||
It also doesn't extrapolate range function results. This addresses [this issue from Prometheus](https://github.com/prometheus/prometheus/issues/3746).
|
||||
@@ -116,3 +118,7 @@ This functionality can be tried at [an editable Grafana dashboard](http://play-g
|
||||
for the given `phi` in the range `[0..1]`.
|
||||
- `last_over_time(m[d])` - returns the last value for `m` on the time range `d`.
|
||||
- `first_over_time(m[d])` - returns the first value for `m` on the time range `d`.
|
||||
- `outliersk(N, m)` - returns up to `N` outlier time series for `m`. Outlier time series have the highest deviation from the `median(m)`.
|
||||
This aggregate function is useful to detect anomalies across groups of similar time series.
|
||||
- `ascent_over_time(m[d])` - returns the sum of positive deltas between adjancent data points in `m` over `d`. Useful for tracking height gains in GPS track.
|
||||
- `descent_over_time(m[d])` - returns the absolute sum of negative deltas between adjancent data points in `m` over `d`. Useful for tracking height loss in GPS track.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Quick Start
|
||||
|
||||
1. Download the latest VictoriaMetrics release from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
from [Docker hub](https://hub.docker.com/r/valyala/victoria-metrics/)
|
||||
from [Docker hub](https://hub.docker.com/r/victoriametrics/victoria-metrics/)
|
||||
or [build it from sources](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics#how-to-build-from-sources).
|
||||
|
||||
2. Run the binary or Docker image with the desired command-line flags. Pass `-help` in order to see description for all the available flags
|
||||
@@ -17,8 +17,10 @@
|
||||
See [these instructions](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/43) in order to configure VictoriaMetrics as OS service.
|
||||
It is recommended setting up [VictoriaMetrics monitoring](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#monitoring).
|
||||
|
||||
3. Configure all the Prometheus instances to write data to VictoriaMetrics.
|
||||
See [these instructions](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics#prometheus-setup).
|
||||
3. Configure [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) or Prometheus to write data to VictoriaMetrics.
|
||||
It is recommended to use `vmagent` instead of Prometheus, since it is more resource efficient. If you still prefer Prometheus, then
|
||||
see [these instructions](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics#prometheus-setup)
|
||||
for details on how to configure Prometheus.
|
||||
|
||||
4. Configure Grafana to query VictoriaMetrics instead of Prometheus.
|
||||
See [these instructions](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics#grafana-setup).
|
||||
|
||||
@@ -10,18 +10,25 @@
|
||||
|
||||
## VictoriaMetrics
|
||||
|
||||
VictoriaMetrics is fast, cost-effective and scalable time-series database. It can be used as long-term remote storage for Prometheus.
|
||||
VictoriaMetrics is fast, cost-effective and scalable time-series database.
|
||||
|
||||
It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and
|
||||
in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics). Just download VictoriaMetrics and see [how to start it](#how-to-start-victoriametrics).
|
||||
|
||||
Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
See our [Wiki](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki) for additional documentation.
|
||||
|
||||
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
|
||||
|
||||
|
||||
## Case studies and talks
|
||||
|
||||
* [Adidas](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#adidas)
|
||||
* [CERN](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#cern)
|
||||
* [COLOPL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#colopl)
|
||||
* [Zerodha](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#zerodha)
|
||||
* [Wix.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wixcom)
|
||||
* [Wedos.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wedoscom)
|
||||
* [Synthesio](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#synthesio)
|
||||
@@ -34,6 +41,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
|
||||
## Prominent features
|
||||
|
||||
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md).
|
||||
See [these docs](#prometheus-setup) for details.
|
||||
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
|
||||
VictoriaMetrics implements [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL) query language, which is inspired by PromQL.
|
||||
* Supports global query view. Multiple Prometheus instances may write data into VictoriaMetrics. Later this data may be used in a single query.
|
||||
@@ -116,6 +125,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
* [Monitoring](#monitoring)
|
||||
* [Troubleshooting](#troubleshooting)
|
||||
* [Backfilling](#backfilling)
|
||||
* [Replication](#replication)
|
||||
* [Backups](#backups)
|
||||
* [Profiling](#profiling)
|
||||
* [Integrations](#integrations)
|
||||
* [Third-party contributions](#third-party-contributions)
|
||||
@@ -571,11 +582,11 @@ Run `make package-victoria-metrics`. It builds `victoriametrics/victoria-metrics
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-victoria-metrics`.
|
||||
|
||||
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
|
||||
By default the image is built on top of `alpine` image for improved debuggability. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `scratch` image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=alpine:3.11 make package-victoria-metrics
|
||||
ROOT_IMAGE=scratch make package-victoria-metrics
|
||||
```
|
||||
|
||||
### Start with docker-compose
|
||||
@@ -763,7 +774,13 @@ The required resources for query path:
|
||||
### High availability
|
||||
|
||||
1) Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
|
||||
2) Add addresses of these instances to `remote_write` section in Prometheus config:
|
||||
2) Pass addresses of these instances to [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) via `-remoteWrite.url` command-line flag:
|
||||
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
|
||||
```
|
||||
|
||||
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
|
||||
|
||||
```yml
|
||||
remote_write:
|
||||
@@ -782,6 +799,8 @@ remote_write:
|
||||
kill -HUP `pidof prometheus`
|
||||
```
|
||||
|
||||
It is recommended to use [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) instead of Prometheus for highly loaded setups.
|
||||
|
||||
4) Now Prometheus should write data into all the configured `remote_write` urls in parallel.
|
||||
5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
|
||||
6) Set up Prometheus datasource in Grafana that points to Promxy.
|
||||
@@ -792,6 +811,7 @@ to write data to `victoriametrics-addr-1`, while each `r2` should write data to
|
||||
Another option is to write data simultaneously from Prometheus HA pair to a pair of VictoriaMetrics instances
|
||||
with the enabled de-duplication. See [this section](#deduplication) for details.
|
||||
|
||||
|
||||
### Deduplication
|
||||
|
||||
VictoriaMetrics de-duplicates data points if `-dedup.minScrapeInterval` command-line flag
|
||||
@@ -809,6 +829,8 @@ Data is split in per-month subdirectories inside `<-storageDataPath>/data/small`
|
||||
Directories for months outside the configured retention are deleted on the first day of new month.
|
||||
In order to keep data according to `-retentionPeriod` max disk space usage is going to be `-retentionPeriod` + 1 month.
|
||||
For example if `-retentionPeriod` is set to 1, data for January is deleted on March 1st.
|
||||
It is safe to extend `-retentionPeriod` on existing data. If `-retentionPeriod` is set to lower
|
||||
value than before then data outside the configured period will be eventually deleted.
|
||||
|
||||
### Multiple retentions
|
||||
|
||||
@@ -874,6 +896,10 @@ Consider setting the following command-line flags:
|
||||
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
|
||||
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`.
|
||||
|
||||
Prefer authorizing all the incoming requests from untrusted networks with [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md)
|
||||
or similar auth proxy.
|
||||
|
||||
|
||||
### Tuning
|
||||
|
||||
* There is no need for VictoriaMetrics tuning since it uses reasonable defaults for command-line flags,
|
||||
@@ -910,6 +936,12 @@ The most interesting metrics are:
|
||||
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
|
||||
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
|
||||
* `sum(vm_data_size_bytes)` - the total size of data on disk.
|
||||
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
|
||||
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
|
||||
of the current number of active time series.
|
||||
* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
|
||||
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
|
||||
of the current number of active time series.
|
||||
|
||||
|
||||
### Troubleshooting
|
||||
@@ -922,8 +954,9 @@ The most interesting metrics are:
|
||||
|
||||
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
|
||||
then it is likely you have too many active time series for the current amount of RAM.
|
||||
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
|
||||
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
|
||||
ingestion performance.
|
||||
ingestion and query performance in this case.
|
||||
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
|
||||
option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
|
||||
|
||||
@@ -970,6 +1003,24 @@ the query cache, which could contain incomplete data cached during the backfilli
|
||||
Yet another solution is to increase `-search.cacheTimestampOffset` flag value in order to disable caching
|
||||
for data with timestamps close to the current time.
|
||||
|
||||
|
||||
### Replication
|
||||
|
||||
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
|
||||
|
||||
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
|
||||
|
||||
See also [high availability docs](#high-availability) and [backup docs](#backups).
|
||||
|
||||
|
||||
### Backups
|
||||
|
||||
VictoriaMetrics supports backups via [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md)
|
||||
and [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md) tools.
|
||||
We also provide provide `vmbackuper` tool for paid enterprise subscribers - see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for details.
|
||||
|
||||
|
||||
### Profiling
|
||||
|
||||
VictoriaMetrics provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
## vmagent
|
||||
|
||||
`vmagent` is a tiny but brave agent, which helps you collecting metrics from various sources
|
||||
and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage system that supports `remote_write` protocol.
|
||||
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
|
||||
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
|
||||
|
||||
<img alt="vmagent" src="vmagent.png">
|
||||
|
||||
@@ -11,7 +11,7 @@ or any other Prometheus-compatible storage system that supports `remote_write` p
|
||||
|
||||
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
|
||||
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
|
||||
Also, we found that users’ infrastructure is like snowflakes - never alike, and we decided to add more flexibility
|
||||
Also, we found that users’ infrastructure are snowflakes - no two are alike, and we decided to add more flexibility
|
||||
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
|
||||
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
|
||||
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
|
||||
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth comparing to Prometheus.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
|
||||
|
||||
|
||||
### Quick Start
|
||||
@@ -40,8 +40,7 @@ Just download `vmutils-*` archive from [releases page](https://github.com/Victor
|
||||
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
|
||||
|
||||
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. Multiple `-remoteWrite.url` args can be set in parallel
|
||||
in order to replicate data concurrently to multiple remote storage systems.
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary amount of remote storage systems.
|
||||
|
||||
Example command line:
|
||||
|
||||
@@ -49,7 +48,7 @@ Example command line:
|
||||
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
```
|
||||
|
||||
If you need collecting only Influx data, then the following command line would be enough:
|
||||
If you only need to collect Influx data, then the following is sufficient:
|
||||
|
||||
```
|
||||
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
@@ -79,14 +78,14 @@ See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/Victor
|
||||
#### Drop-in replacement for Prometheus
|
||||
|
||||
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such setup.
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
|
||||
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
|
||||
|
||||
|
||||
#### Replication and high availability
|
||||
|
||||
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
|
||||
If a single remote storage instance temporarily goes out of service, then the collected data remains available in another remote storage instances.
|
||||
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instances.
|
||||
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
|
||||
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
|
||||
|
||||
@@ -94,13 +93,13 @@ Then it sends the buffered data to the remote storage in order to prevent data g
|
||||
#### Relabeling and filtering
|
||||
|
||||
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
|
||||
it can remove unneeded samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
See [these docs](#relabeling) for details.
|
||||
|
||||
|
||||
#### Splitting data streams among multiple systems
|
||||
|
||||
`vmagent` supports splitting of the collected data among muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
`vmagent` supports splitting the collected data between muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
|
||||
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
|
||||
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
|
||||
@@ -148,6 +147,10 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
|
||||
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
|
||||
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
|
||||
|
||||
Note that `vmagent` doesn't support `refresh_interval` option these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
|
||||
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
|
||||
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
|
||||
|
||||
|
||||
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
|
||||
|
||||
@@ -200,12 +203,12 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
|
||||
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
|
||||
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`.
|
||||
|
||||
* It is recommended increasing `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
|
||||
* It is recommended to increase `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
|
||||
and `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
|
||||
|
||||
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
|
||||
The directory can grow big when remote storage is unavailable during extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
|
||||
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
|
||||
|
||||
|
||||
### How to build from sources
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
## VM Alert
|
||||
## vmalert
|
||||
|
||||
`vmalert` executes a list of given MetricsQL expressions (rules) and
|
||||
sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
|
||||
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
rules against configured address.
|
||||
|
||||
### Features:
|
||||
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
|
||||
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
expressions validation;
|
||||
support and expressions validation;
|
||||
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
|
||||
support;
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
|
||||
* Lightweight without extra dependencies.
|
||||
|
||||
### TODO:
|
||||
* Configuration hot reload.
|
||||
|
||||
### QuickStart
|
||||
|
||||
To build `vmalert` from sources:
|
||||
@@ -26,9 +24,9 @@ make vmalert
|
||||
The build binary will be placed to `VictoriaMetrics/bin` folder.
|
||||
|
||||
To start using `vmalert` you will need the following things:
|
||||
* list of alert rules - PromQL/MetricsQL expressions to execute;
|
||||
* list of rules - PromQL/MetricsQL expressions to execute;
|
||||
* datasource address - reachable VictoriaMetrics instance for rules execution;
|
||||
* notifier address - reachable Alertmanager instance for processing,
|
||||
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
aggregating alerts and sending notifications.
|
||||
|
||||
Then configure `vmalert` accordingly:
|
||||
@@ -38,21 +36,27 @@ Then configure `vmalert` accordingly:
|
||||
-notifier.url=http://localhost:9093
|
||||
```
|
||||
|
||||
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata/rules0-good.rules)
|
||||
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata).
|
||||
|
||||
`vmalert` may be configured with `-remoteWrite` flag to write recording rules and
|
||||
alerts state in form of timeseries via remote write protocol. Alerts state will be written
|
||||
as `ALERTS` timeseries. These timeseries may be used to recover alerts state on `vmalert`
|
||||
restarts if `-remoteRead` is configured.
|
||||
|
||||
`vmalert` runs evaluation for every group in a separate goroutine.
|
||||
Rules in group evaluated one-by-one sequentially.
|
||||
|
||||
**Important:** while recording rules execution is sequential, writing of timeseries results to remote
|
||||
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
|
||||
recording rule is reused in next one.
|
||||
|
||||
`vmalert` also runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
|
||||
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
|
||||
Used as alert source in AlertManager.
|
||||
* `http://<vmalert-addr>/metrics` - application metrics.
|
||||
|
||||
`vmalert` may be configured with `-remotewrite` flag to write alerts state in form of timeseries
|
||||
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
|
||||
may be used to recover alerts state on `vmalert` restarts if `-remoteread` is configured.
|
||||
|
||||
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
|
||||
|
||||
### Configuration
|
||||
|
||||
@@ -65,35 +69,35 @@ Usage of vmalert:
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.url string
|
||||
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules. Default 1m (default 1m0s)
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
Address to listen for http connections (default ":8880")
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-notifier.url string
|
||||
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
|
||||
-remoteread.basicAuth.password string
|
||||
Optional basic auth password for -remoteread.url
|
||||
-remoteread.basicAuth.username string
|
||||
Optional basic auth username for -remoteread.url
|
||||
-remoteread.lookback duration
|
||||
-remoteRead.basicAuth.password string
|
||||
Optional basic auth password for -remoteRead.url
|
||||
-remoteRead.basicAuth.username string
|
||||
Optional basic auth username for -remoteRead.url
|
||||
-remoteRead.lookback duration
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
-remoteread.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remotewrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remotewrite.basicAuth.password string
|
||||
Optional basic auth password for -remotewrite.url
|
||||
-remotewrite.basicAuth.username string
|
||||
Optional basic auth username for -remotewrite.url
|
||||
-remotewrite.url string
|
||||
-remoteRead.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remoteWrite.basicAuth.password string
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.username string
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of readers that concurrently write into remote storage (default 1)
|
||||
-remoteWrite.maxBatchSize int
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
-remoteWrite.maxQueueSize int
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
-remoteWrite.url string
|
||||
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
|
||||
-rule value
|
||||
Path to the file with alert rules.
|
||||
@@ -109,8 +113,30 @@ Usage of vmalert:
|
||||
Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
command-line flags with their descriptions.
|
||||
|
||||
To reload configuration without `vmalert` restart send SIGHUP signal
|
||||
or send GET request to `/-/reload` endpoint.
|
||||
|
||||
### Contributing
|
||||
|
||||
`vmalert` is mostly designed and built by VictoriaMetrics community.
|
||||
Feel free to share your experience and ideas for improving this
|
||||
software. Please keep simplicity as the main priority.
|
||||
|
||||
### How to build from sources
|
||||
|
||||
It is recommended using
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
- `vmalert` is located in `vmutils-*` archives there.
|
||||
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmalert` from the root folder of the repository.
|
||||
It builds `vmalert` binary and puts it into the `bin` folder.
|
||||
|
||||
#### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmalert-prod` from the root folder of the repository.
|
||||
It builds `vmalert-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -19,6 +19,9 @@ Backed up data can be restored with [vmrestore](https://github.com/VictoriaMetri
|
||||
|
||||
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
|
||||
|
||||
See also [vmbackuper](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) tool built on top of `vmbackup`. This tool simplifies
|
||||
creation of hourly, daily, weekly and monthly backups.
|
||||
|
||||
|
||||
### Use cases
|
||||
|
||||
|
||||
@@ -21,7 +21,8 @@ vmrestore -src=gcs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/r
|
||||
* `<local/path/to/restore>` is the path to folder where data will be restored. This folder must be passed
|
||||
to VictoriaMetrics in `-storageDataPath` command-line flag after the restore process is complete.
|
||||
|
||||
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup.
|
||||
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup,
|
||||
i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/questions/476041/how-do-i-make-rsync-delete-files-that-have-been-deleted-from-the-source-folder).
|
||||
|
||||
|
||||
### Troubleshooting
|
||||
@@ -52,7 +53,7 @@ Run `vmrestore -help` in order to see all the available options:
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot data files bigger than 2^32 bytes in memory
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
@@ -68,8 +69,8 @@ Run `vmrestore -help` in order to see all the available options:
|
||||
-src string
|
||||
Source path with backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
-storageDataPath string
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup (default "victoria-metrics-data")
|
||||
-version
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir is synchronized with -src contents, i.e. it works like 'rsync --delete' (default "victoria-metrics-data")
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
|
||||
27
go.mod
27
go.mod
@@ -1,32 +1,33 @@
|
||||
module github.com/VictoriaMetrics/VictoriaMetrics
|
||||
|
||||
require (
|
||||
cloud.google.com/go v0.57.0 // indirect
|
||||
cloud.google.com/go/storage v1.7.0
|
||||
cloud.google.com/go/storage v1.8.0
|
||||
github.com/VictoriaMetrics/fastcache v1.5.7
|
||||
|
||||
// Do not use the original github.com/valyala/fasthttp because of issues
|
||||
// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b
|
||||
github.com/VictoriaMetrics/fasthttp v1.0.1
|
||||
github.com/VictoriaMetrics/metrics v1.11.2
|
||||
github.com/VictoriaMetrics/metricsql v0.2.1
|
||||
github.com/aws/aws-sdk-go v1.30.25
|
||||
github.com/VictoriaMetrics/metrics v1.11.3
|
||||
github.com/VictoriaMetrics/metricsql v0.2.3
|
||||
github.com/aws/aws-sdk-go v1.31.5
|
||||
github.com/cespare/xxhash/v2 v2.1.1
|
||||
github.com/golang/protobuf v1.4.1 // indirect
|
||||
github.com/golang/protobuf v1.4.2 // indirect
|
||||
github.com/golang/snappy v0.0.1
|
||||
github.com/klauspost/compress v1.10.5
|
||||
github.com/klauspost/compress v1.10.7
|
||||
github.com/valyala/fastjson v1.5.1
|
||||
github.com/valyala/fastrand v1.0.0
|
||||
github.com/valyala/gozstd v1.7.0
|
||||
github.com/valyala/histogram v1.0.1
|
||||
github.com/valyala/quicktemplate v1.5.0
|
||||
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f // indirect
|
||||
golang.org/x/mod v0.3.0 // indirect
|
||||
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2 // indirect
|
||||
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
|
||||
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25
|
||||
golang.org/x/tools v0.0.0-20200512001501-aaeff5de670a // indirect
|
||||
google.golang.org/api v0.24.0
|
||||
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380 // indirect
|
||||
gopkg.in/yaml.v2 v2.2.8
|
||||
golang.org/x/sys v0.0.0-20200523222454-059865788121
|
||||
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5 // indirect
|
||||
google.golang.org/api v0.25.0
|
||||
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece // indirect
|
||||
gopkg.in/yaml.v2 v2.3.0
|
||||
honnef.co/go/tools v0.0.1-2020.1.4 // indirect
|
||||
)
|
||||
|
||||
go 1.13
|
||||
|
||||
54
go.sum
54
go.sum
@@ -18,8 +18,8 @@ cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNF
|
||||
cloud.google.com/go/bigquery v1.4.0 h1:xE3CPsOgttP4ACBePh79zTKALtXwn/Edhcr16R5hMWU=
|
||||
cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc=
|
||||
cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg=
|
||||
cloud.google.com/go/bigquery v1.6.0 h1:ajp/DjpiCHO71SyIhwb83YsUGAyWuzVvMko+9xCsJLw=
|
||||
cloud.google.com/go/bigquery v1.6.0/go.mod h1:hyFDG0qSGdHNz8Q6nDN8rYIkld0q/+5uBZaelxiDLfE=
|
||||
cloud.google.com/go/bigquery v1.7.0 h1:a/O/bK/vWrYGOTFtH8di4rBxMZnmkjy+Y5LxpDwo+dA=
|
||||
cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc=
|
||||
cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
|
||||
cloud.google.com/go/datastore v1.1.0 h1:/May9ojXjRkPBNVrq+oWLqmWCkr4OU5uRY29bu0mRyQ=
|
||||
cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk=
|
||||
@@ -33,8 +33,8 @@ cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiy
|
||||
cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos=
|
||||
cloud.google.com/go/storage v1.6.0 h1:UDpwYIwla4jHGzZJaEJYx1tOejbgSoNqsAfHAUYe2r8=
|
||||
cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk=
|
||||
cloud.google.com/go/storage v1.7.0 h1:DzdLPI8Em+DEk7IzA2a10ivq3mxIEASC9GeNJ6FFt5Q=
|
||||
cloud.google.com/go/storage v1.7.0/go.mod h1:jGMIBwF+L/tL6WN/W5InNgYYu4HP0DvGB6rQ1mufWfs=
|
||||
cloud.google.com/go/storage v1.8.0 h1:86K1Gel7BQ9/WmNWn7dTKMvTLFzwtBe5FNqYbi9X35g=
|
||||
cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs=
|
||||
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
|
||||
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
|
||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
||||
@@ -45,12 +45,14 @@ github.com/VictoriaMetrics/fasthttp v1.0.1 h1:I7YdbswTIW63WxoFoUOSNxeOEGB46rdKUL
|
||||
github.com/VictoriaMetrics/fasthttp v1.0.1/go.mod h1:BqgsieH90PR7x97c89j+eqZDloKkDhAEQTwhLw6jw/4=
|
||||
github.com/VictoriaMetrics/metrics v1.11.2 h1:t/ceLP6SvagUqypCKU7cI7+tQn54+TIV/tGoxihHvx8=
|
||||
github.com/VictoriaMetrics/metrics v1.11.2/go.mod h1:LU2j9qq7xqZYXz8tF3/RQnB2z2MbZms5TDiIg9/NHiQ=
|
||||
github.com/VictoriaMetrics/metricsql v0.2.1 h1:OI/W2QCFiQiFULVN3ZiC/iCqZFt25rXp/O7P2NiAwYU=
|
||||
github.com/VictoriaMetrics/metricsql v0.2.1/go.mod h1:UIjd9S0W1UnTWlJdM0wLS+2pfuPqjwqKoK8yTos+WyE=
|
||||
github.com/VictoriaMetrics/metrics v1.11.3 h1:eSfXc0CrquKa1VTNUvhP+dhNjLUZHQGTFfp19mYCQWE=
|
||||
github.com/VictoriaMetrics/metrics v1.11.3/go.mod h1:LU2j9qq7xqZYXz8tF3/RQnB2z2MbZms5TDiIg9/NHiQ=
|
||||
github.com/VictoriaMetrics/metricsql v0.2.3 h1:xGscDmLoeIV7+8qX/mdHnOY0vu4m+wHIVGMoy/nBovY=
|
||||
github.com/VictoriaMetrics/metricsql v0.2.3/go.mod h1:UIjd9S0W1UnTWlJdM0wLS+2pfuPqjwqKoK8yTos+WyE=
|
||||
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8=
|
||||
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM=
|
||||
github.com/aws/aws-sdk-go v1.30.25 h1:89NXJkfpjnMEnsxkP8MVX+LDsoiLCSqevraLb5y4Mjk=
|
||||
github.com/aws/aws-sdk-go v1.30.25/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0=
|
||||
github.com/aws/aws-sdk-go v1.31.5 h1:DFA7BzTydO4etqsTja+x7UfkOKQUv1xzEluLvNk81L0=
|
||||
github.com/aws/aws-sdk-go v1.31.5/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0=
|
||||
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
|
||||
github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY=
|
||||
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
@@ -95,6 +97,8 @@ github.com/golang/protobuf v1.4.0 h1:oOuy+ugB+P/kBdUnG5QaMXSIyJ1q38wWSojYCb3z5VQ
|
||||
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
|
||||
github.com/golang/protobuf v1.4.1 h1:ZFgWrT+bLgsYPirOnRfKLYJLvssAegOj/hgyMFdJZe0=
|
||||
github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
|
||||
github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
|
||||
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
|
||||
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
|
||||
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
|
||||
@@ -128,6 +132,8 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o
|
||||
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
github.com/klauspost/compress v1.10.5 h1:7q6vHIqubShURwQz8cQK6yIe/xC3IF0Vm7TGfqjewrc=
|
||||
github.com/klauspost/compress v1.10.5/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
github.com/klauspost/compress v1.10.7 h1:7rix8v8GpI3ZBb0nSozFRgbtXKv+hOe+qfEpZqybrAg=
|
||||
github.com/klauspost/compress v1.10.7/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
|
||||
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
@@ -199,6 +205,8 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB
|
||||
golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
|
||||
golang.org/x/mod v0.2.0 h1:KU7oHjnv3XNWfa5COkzUifxZmxp1TyI7ImMXqFxLwvQ=
|
||||
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
@@ -221,6 +229,8 @@ golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5 h1:WQ8q63x+f/zpC8Ac1s9wLElVo
|
||||
golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f h1:QBjCr1Fz5kw158VqdE9JfI9cJnl/ymnJWAdMuinqL7Y=
|
||||
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2 h1:eDrdRpKgkcCqKZQwyZRyeFZgfqt37SL7Kv3tok06cKE=
|
||||
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
@@ -255,10 +265,11 @@ golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7w
|
||||
golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200409092240-59c9f1ba88fa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25 h1:OKbAoGs4fGM5cPLlVQLZGYkFC8OnOfgo6tt0Smf9XhM=
|
||||
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200523222454-059865788121 h1:rITEj+UZHYC927n8GT97eC3zrpzXdb/voyeOuVKS46o=
|
||||
golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
@@ -300,10 +311,10 @@ golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapK
|
||||
golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw=
|
||||
golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw=
|
||||
golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8=
|
||||
golang.org/x/tools v0.0.0-20200409170454-77362c5149f0/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200512001501-aaeff5de670a h1:vAa2fXRLbiVN3N/xCnodIT36K4QKZQNyQFq3hQJfQ1U=
|
||||
golang.org/x/tools v0.0.0-20200512001501-aaeff5de670a/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5 h1:3KBjmg2slvQXATWW9cQJ6tsRc8hj1gsnwWyi1IzYk3o=
|
||||
golang.org/x/tools v0.0.0-20200527150044-688b3c5d9fa5/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
|
||||
@@ -319,10 +330,11 @@ google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/
|
||||
google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
|
||||
google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
|
||||
google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
|
||||
google.golang.org/api v0.21.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
|
||||
google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
|
||||
google.golang.org/api v0.24.0 h1:cG03eaksBzhfSIk7JRGctfp3lanklcOM/mTGvow7BbQ=
|
||||
google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
|
||||
google.golang.org/api v0.25.0 h1:LodzhlzZEUfhXzNUMIfVlf9Gr6Ua5MMtoFWh7+f47qA=
|
||||
google.golang.org/api v0.25.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
|
||||
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
|
||||
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
|
||||
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
|
||||
@@ -351,11 +363,13 @@ google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfG
|
||||
google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
|
||||
google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
|
||||
google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
|
||||
google.golang.org/genproto v0.0.0-20200409111301-baae70f3302d/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
|
||||
google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84 h1:pSLkPbrjnPyLDYUO2VM9mDLqo2V6CFBY84lFSZAfoi4=
|
||||
google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
|
||||
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380 h1:xriR1EgvKfkKxIoU2uUvrMVl+H26359loFFUleSMXFo=
|
||||
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
|
||||
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
|
||||
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece h1:1YM0uhfumvoDu9sx8+RyWwTI63zoCQvI23IYFRlvte0=
|
||||
google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA=
|
||||
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
|
||||
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
|
||||
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
|
||||
@@ -365,7 +379,6 @@ google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8
|
||||
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
|
||||
google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
|
||||
google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60=
|
||||
google.golang.org/grpc v1.28.1/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60=
|
||||
google.golang.org/grpc v1.29.1 h1:EC2SB8S04d2r73uptxphDSUG+kTKVgjRPF+N3xpxRB4=
|
||||
google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk=
|
||||
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
|
||||
@@ -376,13 +389,18 @@ google.golang.org/protobuf v1.21.0 h1:qdOKuR/EIArgaWNjetjgTzgVTAZ+S/WXVrq9HW9zim
|
||||
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
|
||||
google.golang.org/protobuf v1.22.0 h1:cJv5/xdbk1NnMPR1VP9+HU6gupuG9MLBoH1r6RHZ2MY=
|
||||
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
|
||||
google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM=
|
||||
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
|
||||
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
|
||||
google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA=
|
||||
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
|
||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
|
||||
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
|
||||
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU=
|
||||
gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
|
||||
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
|
||||
honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
|
||||
@@ -390,6 +408,8 @@ honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWh
|
||||
honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg=
|
||||
honnef.co/go/tools v0.0.1-2020.1.3 h1:sXmLre5bzIR6ypkjXCDI3jHPssRhc8KD/Ome589sc3U=
|
||||
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
|
||||
honnef.co/go/tools v0.0.1-2020.1.4 h1:UoveltGrhghAA7ePc+e+QYDHXrBps2PqFZiHkGR/xK8=
|
||||
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
|
||||
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
|
||||
rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
|
||||
rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=
|
||||
|
||||
@@ -113,10 +113,16 @@ func (r *Restore) Run() error {
|
||||
partsToDelete := common.PartsDifference(dstParts, srcParts)
|
||||
deleteSize := uint64(0)
|
||||
if len(partsToDelete) > 0 {
|
||||
// Fully remove local file if certain parts from the remote part are missing.
|
||||
// Remove only files with the missing part at offset 0.
|
||||
// Assume other files are partially downloaded during the previous Restore.Run call,
|
||||
// so only the last part in them may be incomplete.
|
||||
// The last part for partially downloaded files will be re-downloaded later.
|
||||
// This addresses https://github.com/VictoriaMetrics/VictoriaMetrics/issues/487 .
|
||||
pathsToDelete := make(map[string]bool)
|
||||
for _, p := range partsToDelete {
|
||||
pathsToDelete[p.Path] = true
|
||||
if p.Offset == 0 {
|
||||
pathsToDelete[p.Path] = true
|
||||
}
|
||||
}
|
||||
logger.Infof("deleting %d files from %s", len(pathsToDelete), dst)
|
||||
for path := range pathsToDelete {
|
||||
@@ -153,7 +159,8 @@ func (r *Restore) Run() error {
|
||||
logger.Infof("downloading %d parts from %s to %s", len(partsToCopy), src, dst)
|
||||
bytesDownloaded := uint64(0)
|
||||
err = runParallelPerPath(concurrency, perPath, func(parts []common.Part) error {
|
||||
// Sort partsToCopy in order to properly grow file size during downloading.
|
||||
// Sort partsToCopy in order to properly grow file size during downloading
|
||||
// and to properly resume downloading of incomplete files on the next Restore.Run call.
|
||||
common.SortParts(parts)
|
||||
for _, p := range parts {
|
||||
logger.Infof("downloading %s from %s to %s", &p, src, dst)
|
||||
@@ -169,7 +176,7 @@ func (r *Restore) Run() error {
|
||||
return fmt.Errorf("cannot download %s to %s: %s", &p, dst, err)
|
||||
}
|
||||
if err := wc.Close(); err != nil {
|
||||
return fmt.Errorf("cannot close reader fro %s from %s: %s", &p, src, err)
|
||||
return fmt.Errorf("cannot close reader from %s from %s: %s", &p, src, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
@@ -230,6 +230,6 @@ func NewRemoteFS(path string) (common.RemoteFS, error) {
|
||||
}
|
||||
return fs, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported scheme %q in `-dst`", scheme)
|
||||
return nil, fmt.Errorf("unsupported scheme %q", scheme)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -210,6 +210,9 @@ func (fs *FS) DeleteFile(filePath string) error {
|
||||
// The file is overwritten if it exists.
|
||||
func (fs *FS) CreateFile(filePath string, data []byte) error {
|
||||
path := filepath.Join(fs.Dir, filePath)
|
||||
if err := fs.mkdirAll(path); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ioutil.WriteFile(path, data, 0600); err != nil {
|
||||
return fmt.Errorf("cannot write %d bytes to %q: %s", len(data), path, err)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package bytesutil
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"runtime"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
@@ -29,5 +30,7 @@ func ToUnsafeBytes(s string) []byte {
|
||||
slh.Data = sh.Data
|
||||
slh.Len = sh.Len
|
||||
slh.Cap = sh.Len
|
||||
return *(*[]byte)(unsafe.Pointer(&slh))
|
||||
b := *(*[]byte)(unsafe.Pointer(&slh))
|
||||
runtime.KeepAlive(s)
|
||||
return b
|
||||
}
|
||||
|
||||
40
lib/fasttime/fasttime.go
Normal file
40
lib/fasttime/fasttime.go
Normal file
@@ -0,0 +1,40 @@
|
||||
package fasttime
|
||||
|
||||
import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
func init() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for tm := range ticker.C {
|
||||
t := uint64(tm.Unix())
|
||||
atomic.StoreUint64(¤tTimestamp, t)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var currentTimestamp = uint64(time.Now().Unix())
|
||||
|
||||
// UnixTimestamp returns the current unix timestamp in seconds.
|
||||
//
|
||||
// It is faster than time.Now().Unix()
|
||||
func UnixTimestamp() uint64 {
|
||||
return atomic.LoadUint64(¤tTimestamp)
|
||||
}
|
||||
|
||||
// UnixDate returns date from the current unix timestamp.
|
||||
//
|
||||
// The date is calculated by dividing unix timestamp by (24*3600)
|
||||
func UnixDate() uint64 {
|
||||
return UnixTimestamp() / (24 * 3600)
|
||||
}
|
||||
|
||||
// UnixHour returns hour from the current unix timestamp.
|
||||
//
|
||||
// The hour is calculated by dividing unix timestamp by 3600
|
||||
func UnixHour() uint64 {
|
||||
return UnixTimestamp() / 3600
|
||||
}
|
||||
30
lib/fasttime/fasttime_test.go
Normal file
30
lib/fasttime/fasttime_test.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package fasttime
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestUnixTimestamp(t *testing.T) {
|
||||
tsExpected := uint64(time.Now().Unix())
|
||||
ts := UnixTimestamp()
|
||||
if ts-tsExpected > 1 {
|
||||
t.Fatalf("unexpected UnixTimestamp; got %d; want %d", ts, tsExpected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnixDate(t *testing.T) {
|
||||
dateExpected := uint64(time.Now().Unix() / (24 * 3600))
|
||||
date := UnixDate()
|
||||
if date-dateExpected > 1 {
|
||||
t.Fatalf("unexpected UnixDate; got %d; want %d", date, dateExpected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnixHour(t *testing.T) {
|
||||
hourExpected := uint64(time.Now().Unix() / 3600)
|
||||
hour := UnixHour()
|
||||
if hour-hourExpected > 1 {
|
||||
t.Fatalf("unexpected UnixHour; got %d; want %d", hour, hourExpected)
|
||||
}
|
||||
}
|
||||
32
lib/fasttime/fasttime_timing_test.go
Normal file
32
lib/fasttime/fasttime_timing_test.go
Normal file
@@ -0,0 +1,32 @@
|
||||
package fasttime
|
||||
|
||||
import (
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func BenchmarkUnixTimestamp(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
var ts uint64
|
||||
for pb.Next() {
|
||||
ts += UnixTimestamp()
|
||||
}
|
||||
atomic.StoreUint64(&Sink, ts)
|
||||
})
|
||||
}
|
||||
|
||||
func BenchmarkTimeNowUnix(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
var ts uint64
|
||||
for pb.Next() {
|
||||
ts += uint64(time.Now().Unix())
|
||||
}
|
||||
atomic.StoreUint64(&Sink, ts)
|
||||
})
|
||||
}
|
||||
|
||||
// Sink should prevent from code elimination by optimizing compiler
|
||||
var Sink uint64
|
||||
31
lib/fs/fs.go
31
lib/fs/fs.go
@@ -6,8 +6,10 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"golang.org/x/sys/unix"
|
||||
@@ -296,6 +298,35 @@ func CreateFlockFile(dir string) (*os.File, error) {
|
||||
|
||||
// MustGetFreeSpace returns free space for the given directory path.
|
||||
func MustGetFreeSpace(path string) uint64 {
|
||||
// Try obtaining cached value at first.
|
||||
freeSpaceMapLock.Lock()
|
||||
defer freeSpaceMapLock.Unlock()
|
||||
|
||||
e, ok := freeSpaceMap[path]
|
||||
if ok && fasttime.UnixTimestamp()-e.updateTime < 2 {
|
||||
// Fast path - the entry is fresh.
|
||||
return e.freeSpace
|
||||
}
|
||||
|
||||
// Slow path.
|
||||
// Determine the amount of free space at path.
|
||||
e.freeSpace = mustGetFreeSpace(path)
|
||||
e.updateTime = fasttime.UnixTimestamp()
|
||||
freeSpaceMap[path] = e
|
||||
return e.freeSpace
|
||||
}
|
||||
|
||||
var (
|
||||
freeSpaceMap = make(map[string]freeSpaceEntry)
|
||||
freeSpaceMapLock sync.Mutex
|
||||
)
|
||||
|
||||
type freeSpaceEntry struct {
|
||||
updateTime uint64
|
||||
freeSpace uint64
|
||||
}
|
||||
|
||||
func mustGetFreeSpace(path string) uint64 {
|
||||
d, err := os.Open(path)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot determine free disk space on %q: %s", path, err)
|
||||
|
||||
@@ -4,6 +4,9 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -28,6 +31,18 @@ type MustReadAtCloser interface {
|
||||
type ReaderAt struct {
|
||||
f *os.File
|
||||
mmapData []byte
|
||||
|
||||
// pageCacheBitmap holds a bitmap for recently touched pages in mmapData.
|
||||
// This bitmap allows using simple copy() instead of copyMmap() for reading recently touched pages,
|
||||
// which is up to 4x faster when reading small chunks of data via MustReadAt.
|
||||
pageCacheBitmap atomic.Value
|
||||
pageCacheBitmapWG sync.WaitGroup
|
||||
|
||||
stopCh chan struct{}
|
||||
}
|
||||
|
||||
type pageCacheBitmap struct {
|
||||
m []uint64
|
||||
}
|
||||
|
||||
// MustReadAt reads len(p) bytes at off from r.
|
||||
@@ -35,7 +50,11 @@ func (r *ReaderAt) MustReadAt(p []byte, off int64) {
|
||||
if len(p) == 0 {
|
||||
return
|
||||
}
|
||||
if len(r.mmapData) == 0 || len(p) > 8*1024 {
|
||||
if off < 0 {
|
||||
logger.Panicf("off=%d cannot be negative", off)
|
||||
}
|
||||
end := off + int64(len(p))
|
||||
if len(r.mmapData) == 0 || (len(p) > 8*1024 && !r.isInPageCache(off, end)) {
|
||||
// Read big blocks directly from file.
|
||||
// This could be faster than reading these blocks from mmap,
|
||||
// since it triggers less page faults.
|
||||
@@ -46,18 +65,72 @@ func (r *ReaderAt) MustReadAt(p []byte, off int64) {
|
||||
if n != len(p) {
|
||||
logger.Panicf("FATAL: unexpected number of bytes read; got %d; want %d", n, len(p))
|
||||
}
|
||||
if len(r.mmapData) > 0 {
|
||||
r.markInPageCache(off, end)
|
||||
}
|
||||
} else {
|
||||
if off < 0 || off > int64(len(r.mmapData)-len(p)) {
|
||||
if off > int64(len(r.mmapData)-len(p)) {
|
||||
logger.Panicf("off=%d is out of allowed range [0...%d] for len(p)=%d", off, len(r.mmapData)-len(p), len(p))
|
||||
}
|
||||
copyMmap(p, r.mmapData[off:])
|
||||
src := r.mmapData[off:]
|
||||
if r.isInPageCache(off, end) {
|
||||
// It is safe copying the data with copy(), since it is likely it is in the page cache.
|
||||
// This is up to 4x faster than copyMmap() below.
|
||||
copy(p, src)
|
||||
} else {
|
||||
// The data may be missing in the page cache, so it is better to copy it via cgo trick
|
||||
// in order to avoid P stalls in Go runtime.
|
||||
// See https://medium.com/@valyala/mmap-in-go-considered-harmful-d92a25cb161d for details.
|
||||
copyMmap(p, src)
|
||||
r.markInPageCache(off, end)
|
||||
}
|
||||
}
|
||||
readCalls.Inc()
|
||||
readBytes.Add(len(p))
|
||||
}
|
||||
|
||||
func (r *ReaderAt) isInPageCache(start, end int64) bool {
|
||||
startBit := uint64(start) / pageSize
|
||||
endBit := uint64(end) / pageSize
|
||||
m := r.pageCacheBitmap.Load().(*pageCacheBitmap).m
|
||||
for startBit <= endBit {
|
||||
idx := startBit / 64
|
||||
off := startBit % 64
|
||||
if idx >= uint64(len(m)) {
|
||||
return true
|
||||
}
|
||||
n := atomic.LoadUint64(&m[idx])
|
||||
if (n>>off)&1 != 1 {
|
||||
return false
|
||||
}
|
||||
startBit++
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (r *ReaderAt) markInPageCache(start, end int64) {
|
||||
startBit := uint64(start) / pageSize
|
||||
endBit := uint64(end) / pageSize
|
||||
m := r.pageCacheBitmap.Load().(*pageCacheBitmap).m
|
||||
for startBit <= endBit {
|
||||
idx := startBit / 64
|
||||
off := startBit % 64
|
||||
n := atomic.LoadUint64(&m[idx])
|
||||
n |= 1 << off
|
||||
// It is OK if multiple concurrent goroutines store the same m[idx].
|
||||
atomic.StoreUint64(&m[idx], n)
|
||||
startBit++
|
||||
}
|
||||
}
|
||||
|
||||
// Assume page size is 4KB
|
||||
const pageSize = 4 * 1024
|
||||
|
||||
// MustClose closes r.
|
||||
func (r *ReaderAt) MustClose() {
|
||||
close(r.stopCh)
|
||||
r.pageCacheBitmapWG.Wait()
|
||||
|
||||
fname := r.f.Name()
|
||||
if len(r.mmapData) > 0 {
|
||||
if err := unix.Munmap(r.mmapData); err != nil {
|
||||
@@ -80,8 +153,24 @@ func OpenReaderAt(path string) (*ReaderAt, error) {
|
||||
}
|
||||
var r ReaderAt
|
||||
r.f = f
|
||||
r.stopCh = make(chan struct{})
|
||||
if !*disableMmap {
|
||||
data, err := mmapFile(f)
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error in stat: %s", err)
|
||||
}
|
||||
size := fi.Size()
|
||||
bm := &pageCacheBitmap{
|
||||
m: make([]uint64, 1+size/pageSize/64),
|
||||
}
|
||||
r.pageCacheBitmap.Store(bm)
|
||||
r.pageCacheBitmapWG.Add(1)
|
||||
go func() {
|
||||
defer r.pageCacheBitmapWG.Done()
|
||||
pageCacheBitmapCleaner(&r.pageCacheBitmap, r.stopCh)
|
||||
}()
|
||||
|
||||
data, err := mmapFile(f, size)
|
||||
if err != nil {
|
||||
MustClose(f)
|
||||
return nil, fmt.Errorf("cannot init reader for %q: %s", path, err)
|
||||
@@ -92,18 +181,30 @@ func OpenReaderAt(path string) (*ReaderAt, error) {
|
||||
return &r, nil
|
||||
}
|
||||
|
||||
func pageCacheBitmapCleaner(pcbm *atomic.Value, stopCh <-chan struct{}) {
|
||||
t := time.NewTimer(time.Minute)
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
t.Stop()
|
||||
return
|
||||
case <-t.C:
|
||||
}
|
||||
bmOld := pcbm.Load().(*pageCacheBitmap)
|
||||
bm := &pageCacheBitmap{
|
||||
m: make([]uint64, len(bmOld.m)),
|
||||
}
|
||||
pcbm.Store(bm)
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
readCalls = metrics.NewCounter(`vm_fs_read_calls_total`)
|
||||
readBytes = metrics.NewCounter(`vm_fs_read_bytes_total`)
|
||||
readersCount = metrics.NewCounter(`vm_fs_readers`)
|
||||
)
|
||||
|
||||
func mmapFile(f *os.File) ([]byte, error) {
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error in stat: %s", err)
|
||||
}
|
||||
size := fi.Size()
|
||||
func mmapFile(f *os.File, size int64) ([]byte, error) {
|
||||
if size == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
37
lib/fs/reader_at_test.go
Normal file
37
lib/fs/reader_at_test.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package fs
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestReaderAt(t *testing.T) {
|
||||
for _, bufSize := range []int{1, 1e1, 1e2, 1e3, 1e4, 1e5} {
|
||||
t.Run(fmt.Sprintf("%d", bufSize), func(t *testing.T) {
|
||||
testReaderAt(t, bufSize)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func testReaderAt(t *testing.T, bufSize int) {
|
||||
path := "TestReaderAt"
|
||||
const fileSize = 8 * 1024 * 1024
|
||||
data := make([]byte, fileSize)
|
||||
if err := ioutil.WriteFile(path, data, 0600); err != nil {
|
||||
t.Fatalf("cannot create %q: %s", path, err)
|
||||
}
|
||||
defer MustRemoveAll(path)
|
||||
r, err := OpenReaderAt(path)
|
||||
if err != nil {
|
||||
t.Fatalf("error in OpenReaderAt(%q): %s", path, err)
|
||||
}
|
||||
defer r.MustClose()
|
||||
|
||||
buf := make([]byte, bufSize)
|
||||
for i := 0; i < fileSize-bufSize; i += bufSize {
|
||||
offset := int64(i)
|
||||
r.MustReadAt(buf[:0], offset)
|
||||
r.MustReadAt(buf, offset)
|
||||
}
|
||||
}
|
||||
56
lib/fs/reader_at_timing_test.go
Normal file
56
lib/fs/reader_at_timing_test.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package fs
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func BenchmarkReaderAtMustReadAt(b *testing.B) {
|
||||
b.Run("mmap_on", func(b *testing.B) {
|
||||
benchmarkReaderAtMustReadAt(b, true)
|
||||
})
|
||||
b.Run("mmap_off", func(b *testing.B) {
|
||||
benchmarkReaderAtMustReadAt(b, false)
|
||||
})
|
||||
}
|
||||
|
||||
func benchmarkReaderAtMustReadAt(b *testing.B, isMmap bool) {
|
||||
prevDisableMmap := *disableMmap
|
||||
*disableMmap = !isMmap
|
||||
defer func() {
|
||||
*disableMmap = prevDisableMmap
|
||||
}()
|
||||
|
||||
path := "BenchmarkReaderAtMustReadAt"
|
||||
const fileSize = 8 * 1024 * 1024
|
||||
data := make([]byte, fileSize)
|
||||
if err := ioutil.WriteFile(path, data, 0600); err != nil {
|
||||
b.Fatalf("cannot create %q: %s", path, err)
|
||||
}
|
||||
defer MustRemoveAll(path)
|
||||
r, err := OpenReaderAt(path)
|
||||
if err != nil {
|
||||
b.Fatalf("error in OpenReaderAt(%q): %s", path, err)
|
||||
}
|
||||
defer r.MustClose()
|
||||
|
||||
b.ResetTimer()
|
||||
for _, bufSize := range []int{1, 1e1, 1e2, 1e3, 1e4, 1e5} {
|
||||
b.Run(fmt.Sprintf("%d", bufSize), func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(bufSize))
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
buf := make([]byte, bufSize)
|
||||
var offset int64
|
||||
for pb.Next() {
|
||||
if len(buf)+int(offset) > fileSize {
|
||||
offset = 0
|
||||
}
|
||||
r.MustReadAt(buf, offset)
|
||||
offset += int64(len(buf))
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -283,10 +283,10 @@ func maybeGzipResponseWriter(w http.ResponseWriter, r *http.Request) http.Respon
|
||||
ae = strings.ToLower(ae)
|
||||
n := strings.Index(ae, "gzip")
|
||||
if n < 0 {
|
||||
// Do not apply gzip encoding to the response.
|
||||
return w
|
||||
}
|
||||
h := w.Header()
|
||||
h.Set("Content-Encoding", "gzip")
|
||||
// Apply gzip encoding to the response.
|
||||
zw := getGzipWriter(w)
|
||||
bw := getBufioWriter(zw)
|
||||
zrw := &gzipResponseWriter{
|
||||
@@ -301,7 +301,14 @@ func maybeGzipResponseWriter(w http.ResponseWriter, r *http.Request) http.Respon
|
||||
//
|
||||
// The function must be called before the first w.Write* call.
|
||||
func DisableResponseCompression(w http.ResponseWriter) {
|
||||
w.Header().Del("Content-Encoding")
|
||||
zrw, ok := w.(*gzipResponseWriter)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if zrw.firstWriteDone {
|
||||
logger.Panicf("BUG: DisableResponseCompression must be called before sending the response")
|
||||
}
|
||||
zrw.disableCompression = true
|
||||
}
|
||||
|
||||
// EnableCORS enables https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS
|
||||
@@ -343,20 +350,24 @@ type gzipResponseWriter struct {
|
||||
func (zrw *gzipResponseWriter) Write(p []byte) (int, error) {
|
||||
if !zrw.firstWriteDone {
|
||||
h := zrw.Header()
|
||||
if h.Get("Content-Encoding") != "gzip" {
|
||||
// The request handler disabled gzip encoding.
|
||||
// Send uncompressed response body.
|
||||
if zrw.statusCode == http.StatusNoContent {
|
||||
zrw.disableCompression = true
|
||||
} else if h.Get("Content-Type") == "" {
|
||||
// Disable auto-detection of content-type, since it
|
||||
// is incorrectly detected after the compression.
|
||||
h.Set("Content-Type", "text/html")
|
||||
}
|
||||
if h.Get("Content-Encoding") != "" {
|
||||
zrw.disableCompression = true
|
||||
}
|
||||
if !zrw.disableCompression {
|
||||
h.Set("Content-Encoding", "gzip")
|
||||
h.Del("Content-Length")
|
||||
if h.Get("Content-Type") == "" {
|
||||
// Disable auto-detection of content-type, since it
|
||||
// is incorrectly detected after the compression.
|
||||
h.Set("Content-Type", "text/html")
|
||||
}
|
||||
}
|
||||
zrw.writeHeader()
|
||||
zrw.firstWriteDone = true
|
||||
}
|
||||
if zrw.statusCode == 0 {
|
||||
zrw.WriteHeader(http.StatusOK)
|
||||
}
|
||||
if zrw.disableCompression {
|
||||
return zrw.ResponseWriter.Write(p)
|
||||
}
|
||||
@@ -364,23 +375,25 @@ func (zrw *gzipResponseWriter) Write(p []byte) (int, error) {
|
||||
}
|
||||
|
||||
func (zrw *gzipResponseWriter) WriteHeader(statusCode int) {
|
||||
if zrw.statusCode != 0 {
|
||||
return
|
||||
}
|
||||
if statusCode == http.StatusNoContent {
|
||||
DisableResponseCompression(zrw.ResponseWriter)
|
||||
}
|
||||
zrw.ResponseWriter.WriteHeader(statusCode)
|
||||
zrw.statusCode = statusCode
|
||||
}
|
||||
|
||||
func (zrw *gzipResponseWriter) writeHeader() {
|
||||
if zrw.statusCode == 0 {
|
||||
zrw.statusCode = http.StatusOK
|
||||
}
|
||||
zrw.ResponseWriter.WriteHeader(zrw.statusCode)
|
||||
}
|
||||
|
||||
// Implements http.Flusher
|
||||
func (zrw *gzipResponseWriter) Flush() {
|
||||
if err := zrw.bw.Flush(); err != nil && !isTrivialNetworkError(err) {
|
||||
logger.Warnf("gzipResponseWriter.Flush (buffer): %s", err)
|
||||
}
|
||||
if err := zrw.zw.Flush(); err != nil && !isTrivialNetworkError(err) {
|
||||
logger.Warnf("gzipResponseWriter.Flush (gzip): %s", err)
|
||||
if !zrw.disableCompression {
|
||||
if err := zrw.bw.Flush(); err != nil && !isTrivialNetworkError(err) {
|
||||
logger.Warnf("gzipResponseWriter.Flush (buffer): %s", err)
|
||||
}
|
||||
if err := zrw.zw.Flush(); err != nil && !isTrivialNetworkError(err) {
|
||||
logger.Warnf("gzipResponseWriter.Flush (gzip): %s", err)
|
||||
}
|
||||
}
|
||||
if fw, ok := zrw.ResponseWriter.(http.Flusher); ok {
|
||||
fw.Flush()
|
||||
@@ -389,11 +402,14 @@ func (zrw *gzipResponseWriter) Flush() {
|
||||
|
||||
func (zrw *gzipResponseWriter) Close() error {
|
||||
if !zrw.firstWriteDone {
|
||||
zrw.Header().Del("Content-Encoding")
|
||||
zrw.writeHeader()
|
||||
return nil
|
||||
}
|
||||
zrw.Flush()
|
||||
err := zrw.zw.Close()
|
||||
var err error
|
||||
if !zrw.disableCompression {
|
||||
err = zrw.zw.Close()
|
||||
}
|
||||
putGzipWriter(zrw.zw)
|
||||
zrw.zw = nil
|
||||
putBufioWriter(zrw.bw)
|
||||
|
||||
@@ -24,8 +24,8 @@ func initOnce() {
|
||||
// Do not use logger.Panicf here, since logger may be uninitialized yet.
|
||||
panic(fmt.Errorf("BUG: memory.Allowed must be called only after flag.Parse call"))
|
||||
}
|
||||
if *allowedMemPercent < 10 || *allowedMemPercent > 200 {
|
||||
logger.Panicf("FATAL: -memory.allowedPercent must be in the range [10...200]; got %f", *allowedMemPercent)
|
||||
if *allowedMemPercent < 1 || *allowedMemPercent > 200 {
|
||||
logger.Panicf("FATAL: -memory.allowedPercent must be in the range [1...200]; got %f", *allowedMemPercent)
|
||||
}
|
||||
percent := *allowedMemPercent / 100
|
||||
|
||||
|
||||
@@ -148,18 +148,18 @@ func unmarshalBlockHeaders(dst []blockHeader, src []byte, blockHeadersCount int)
|
||||
for i := 0; i < blockHeadersCount; i++ {
|
||||
tail, err := dst[dstLen+i].Unmarshal(src)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal block header: %s", err)
|
||||
return dst, fmt.Errorf("cannot unmarshal block header: %s", err)
|
||||
}
|
||||
src = tail
|
||||
}
|
||||
if len(src) > 0 {
|
||||
return nil, fmt.Errorf("unexpected non-zero tail left after unmarshaling %d block headers; len(tail)=%d", blockHeadersCount, len(src))
|
||||
return dst, fmt.Errorf("unexpected non-zero tail left after unmarshaling %d block headers; len(tail)=%d", blockHeadersCount, len(src))
|
||||
}
|
||||
newBHS := dst[dstLen:]
|
||||
|
||||
// Verify that block headers are sorted by firstItem.
|
||||
if !sort.SliceIsSorted(newBHS, func(i, j int) bool { return string(newBHS[i].firstItem) < string(newBHS[j].firstItem) }) {
|
||||
return nil, fmt.Errorf("block headers must be sorted by firstItem; unmarshaled unsorted block headers: %#v", newBHS)
|
||||
return dst, fmt.Errorf("block headers must be sorted by firstItem; unmarshaled unsorted block headers: %#v", newBHS)
|
||||
}
|
||||
|
||||
return dst, nil
|
||||
|
||||
@@ -63,9 +63,12 @@ func (bsw *blockStreamWriter) reset() {
|
||||
bsw.mrFirstItemCaught = false
|
||||
}
|
||||
|
||||
func (bsw *blockStreamWriter) InitFromInmemoryPart(ip *inmemoryPart, compressLevel int) {
|
||||
func (bsw *blockStreamWriter) InitFromInmemoryPart(ip *inmemoryPart) {
|
||||
bsw.reset()
|
||||
bsw.compressLevel = compressLevel
|
||||
|
||||
// Use the minimum compression level for in-memory blocks,
|
||||
// since they are going to be re-compressed during the merge into file-based blocks.
|
||||
bsw.compressLevel = -5 // See https://github.com/facebook/zstd/releases/tag/v1.3.4
|
||||
|
||||
bsw.metaindexWriter = &ip.metaindexData
|
||||
bsw.indexWriter = &ip.indexData
|
||||
|
||||
@@ -151,7 +151,7 @@ var isInTest = func() bool {
|
||||
return strings.HasSuffix(os.Args[0], ".test")
|
||||
}()
|
||||
|
||||
// MarshalUnsortedData marshals sorted items from ib to sb.
|
||||
// MarshalSortedData marshals sorted items from ib to sb.
|
||||
//
|
||||
// It also:
|
||||
// - appends first item to firstItemDst and returns the result.
|
||||
|
||||
@@ -29,14 +29,14 @@ func TestMultilevelMerge(t *testing.T) {
|
||||
// First level merge
|
||||
var dstIP1 inmemoryPart
|
||||
var bsw1 blockStreamWriter
|
||||
bsw1.InitFromInmemoryPart(&dstIP1, 0)
|
||||
bsw1.InitFromInmemoryPart(&dstIP1)
|
||||
if err := mergeBlockStreams(&dstIP1.ph, &bsw1, bsrs[:5], nil, nil, &itemsMerged); err != nil {
|
||||
t.Fatalf("cannot merge first level part 1: %s", err)
|
||||
}
|
||||
|
||||
var dstIP2 inmemoryPart
|
||||
var bsw2 blockStreamWriter
|
||||
bsw2.InitFromInmemoryPart(&dstIP2, 0)
|
||||
bsw2.InitFromInmemoryPart(&dstIP2)
|
||||
if err := mergeBlockStreams(&dstIP2.ph, &bsw2, bsrs[5:], nil, nil, &itemsMerged); err != nil {
|
||||
t.Fatalf("cannot merge first level part 2: %s", err)
|
||||
}
|
||||
@@ -53,7 +53,7 @@ func TestMultilevelMerge(t *testing.T) {
|
||||
newTestBlockStreamReader(&dstIP1),
|
||||
newTestBlockStreamReader(&dstIP2),
|
||||
}
|
||||
bsw.InitFromInmemoryPart(&dstIP, 0)
|
||||
bsw.InitFromInmemoryPart(&dstIP)
|
||||
if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrsTop, nil, nil, &itemsMerged); err != nil {
|
||||
t.Fatalf("cannot merge second level: %s", err)
|
||||
}
|
||||
@@ -72,7 +72,7 @@ func TestMergeForciblyStop(t *testing.T) {
|
||||
bsrs, _ := newTestInmemoryBlockStreamReaders(20, 4000)
|
||||
var dstIP inmemoryPart
|
||||
var bsw blockStreamWriter
|
||||
bsw.InitFromInmemoryPart(&dstIP, 0)
|
||||
bsw.InitFromInmemoryPart(&dstIP)
|
||||
ch := make(chan struct{})
|
||||
var itemsMerged uint64
|
||||
close(ch)
|
||||
@@ -119,7 +119,7 @@ func testMergeBlockStreamsSerial(blocksToMerge, maxItemsPerBlock int) error {
|
||||
var itemsMerged uint64
|
||||
var dstIP inmemoryPart
|
||||
var bsw blockStreamWriter
|
||||
bsw.InitFromInmemoryPart(&dstIP, 0)
|
||||
bsw.InitFromInmemoryPart(&dstIP)
|
||||
if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrs, nil, nil, &itemsMerged); err != nil {
|
||||
return fmt.Errorf("cannot merge block streams: %s", err)
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
@@ -227,7 +228,7 @@ func (idxbc *indexBlockCache) cleaner() {
|
||||
}
|
||||
|
||||
func (idxbc *indexBlockCache) cleanByTimeout() {
|
||||
currentTime := atomic.LoadUint64(¤tTimestamp)
|
||||
currentTime := fasttime.UnixTimestamp()
|
||||
idxbc.mu.Lock()
|
||||
for k, idxbe := range idxbc.m {
|
||||
// Delete items accessed more than 10 minutes ago.
|
||||
@@ -245,7 +246,7 @@ func (idxbc *indexBlockCache) Get(k uint64) *indexBlock {
|
||||
idxbc.mu.RUnlock()
|
||||
|
||||
if idxbe != nil {
|
||||
currentTime := atomic.LoadUint64(¤tTimestamp)
|
||||
currentTime := fasttime.UnixTimestamp()
|
||||
if atomic.LoadUint64(&idxbe.lastAccessTime) != currentTime {
|
||||
atomic.StoreUint64(&idxbe.lastAccessTime, currentTime)
|
||||
}
|
||||
@@ -256,9 +257,7 @@ func (idxbc *indexBlockCache) Get(k uint64) *indexBlock {
|
||||
}
|
||||
|
||||
// Put puts idxb under the key k into idxbc.
|
||||
//
|
||||
// Returns true if the idxb has been put into idxbc.
|
||||
func (idxbc *indexBlockCache) Put(k uint64, idxb *indexBlock) bool {
|
||||
func (idxbc *indexBlockCache) Put(k uint64, idxb *indexBlock) {
|
||||
idxbc.mu.Lock()
|
||||
|
||||
// Remove superflouos entries.
|
||||
@@ -278,12 +277,11 @@ func (idxbc *indexBlockCache) Put(k uint64, idxb *indexBlock) bool {
|
||||
|
||||
// Store idxb in the cache.
|
||||
idxbe := &indexBlockCacheEntry{
|
||||
lastAccessTime: atomic.LoadUint64(¤tTimestamp),
|
||||
lastAccessTime: fasttime.UnixTimestamp(),
|
||||
idxb: idxb,
|
||||
}
|
||||
idxbc.m[k] = idxbe
|
||||
idxbc.mu.Unlock()
|
||||
return true
|
||||
}
|
||||
|
||||
func (idxbc *indexBlockCache) Len() uint64 {
|
||||
@@ -377,7 +375,7 @@ func (ibc *inmemoryBlockCache) cleaner() {
|
||||
}
|
||||
|
||||
func (ibc *inmemoryBlockCache) cleanByTimeout() {
|
||||
currentTime := atomic.LoadUint64(¤tTimestamp)
|
||||
currentTime := fasttime.UnixTimestamp()
|
||||
ibc.mu.Lock()
|
||||
for k, ibe := range ibc.m {
|
||||
// Delete items accessed more than 10 minutes ago.
|
||||
@@ -396,7 +394,7 @@ func (ibc *inmemoryBlockCache) Get(k inmemoryBlockCacheKey) *inmemoryBlock {
|
||||
ibc.mu.RUnlock()
|
||||
|
||||
if ibe != nil {
|
||||
currentTime := atomic.LoadUint64(¤tTimestamp)
|
||||
currentTime := fasttime.UnixTimestamp()
|
||||
if atomic.LoadUint64(&ibe.lastAccessTime) != currentTime {
|
||||
atomic.StoreUint64(&ibe.lastAccessTime, currentTime)
|
||||
}
|
||||
@@ -407,9 +405,7 @@ func (ibc *inmemoryBlockCache) Get(k inmemoryBlockCacheKey) *inmemoryBlock {
|
||||
}
|
||||
|
||||
// Put puts ib under key k into ibc.
|
||||
//
|
||||
// Returns true if ib was put into ibc.
|
||||
func (ibc *inmemoryBlockCache) Put(k inmemoryBlockCacheKey, ib *inmemoryBlock) bool {
|
||||
func (ibc *inmemoryBlockCache) Put(k inmemoryBlockCacheKey, ib *inmemoryBlock) {
|
||||
ibc.mu.Lock()
|
||||
|
||||
// Clean superflouos entries in cache.
|
||||
@@ -429,12 +425,11 @@ func (ibc *inmemoryBlockCache) Put(k inmemoryBlockCacheKey, ib *inmemoryBlock) b
|
||||
|
||||
// Store ib in the cache.
|
||||
ibe := &inmemoryBlockCacheEntry{
|
||||
lastAccessTime: atomic.LoadUint64(¤tTimestamp),
|
||||
lastAccessTime: fasttime.UnixTimestamp(),
|
||||
ib: ib,
|
||||
}
|
||||
ibc.m[k] = ibe
|
||||
ibc.mu.Unlock()
|
||||
return true
|
||||
}
|
||||
|
||||
func (ibc *inmemoryBlockCache) Len() uint64 {
|
||||
@@ -451,16 +446,3 @@ func (ibc *inmemoryBlockCache) Requests() uint64 {
|
||||
func (ibc *inmemoryBlockCache) Misses() uint64 {
|
||||
return atomic.LoadUint64(&ibc.misses)
|
||||
}
|
||||
|
||||
func init() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for tm := range ticker.C {
|
||||
t := uint64(tm.Unix())
|
||||
atomic.StoreUint64(¤tTimestamp, t)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var currentTimestamp uint64
|
||||
|
||||
@@ -25,9 +25,6 @@ type partSearch struct {
|
||||
// The remaining block headers to scan in the current metaindexRow.
|
||||
bhs []blockHeader
|
||||
|
||||
// Pointer to index block, which may be reused.
|
||||
indexBlockReuse *indexBlock
|
||||
|
||||
// Pointer to inmemory block, which may be reused.
|
||||
inmemoryBlockReuse *inmemoryBlock
|
||||
|
||||
@@ -53,10 +50,6 @@ func (ps *partSearch) reset() {
|
||||
ps.p = nil
|
||||
ps.mrs = nil
|
||||
ps.bhs = nil
|
||||
if ps.indexBlockReuse != nil {
|
||||
putIndexBlock(ps.indexBlockReuse)
|
||||
ps.indexBlockReuse = nil
|
||||
}
|
||||
if ps.inmemoryBlockReuse != nil {
|
||||
putInmemoryBlock(ps.inmemoryBlockReuse)
|
||||
ps.inmemoryBlockReuse = nil
|
||||
@@ -275,40 +268,25 @@ func (ps *partSearch) nextBlock() error {
|
||||
}
|
||||
|
||||
func (ps *partSearch) nextBHS() error {
|
||||
if ps.indexBlockReuse != nil {
|
||||
putIndexBlock(ps.indexBlockReuse)
|
||||
ps.indexBlockReuse = nil
|
||||
}
|
||||
if len(ps.mrs) == 0 {
|
||||
return io.EOF
|
||||
}
|
||||
mr := &ps.mrs[0]
|
||||
ps.mrs = ps.mrs[1:]
|
||||
idxb, mayReuseIndexBlock, err := ps.getIndexBlock(mr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot get index block: %s", err)
|
||||
}
|
||||
if mayReuseIndexBlock {
|
||||
ps.indexBlockReuse = idxb
|
||||
idxbKey := mr.indexBlockOffset
|
||||
idxb := ps.idxbCache.Get(idxbKey)
|
||||
if idxb == nil {
|
||||
var err error
|
||||
idxb, err = ps.readIndexBlock(mr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot read index block: %s", err)
|
||||
}
|
||||
ps.idxbCache.Put(idxbKey, idxb)
|
||||
}
|
||||
ps.bhs = idxb.bhs
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ps *partSearch) getIndexBlock(mr *metaindexRow) (*indexBlock, bool, error) {
|
||||
idxbKey := mr.indexBlockOffset
|
||||
idxb := ps.idxbCache.Get(idxbKey)
|
||||
if idxb != nil {
|
||||
return idxb, false, nil
|
||||
}
|
||||
idxb, err := ps.readIndexBlock(mr)
|
||||
if err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
ok := ps.idxbCache.Put(idxbKey, idxb)
|
||||
return idxb, !ok, nil
|
||||
}
|
||||
|
||||
func (ps *partSearch) readIndexBlock(mr *metaindexRow) (*indexBlock, error) {
|
||||
ps.compressedIndexBuf = bytesutil.Resize(ps.compressedIndexBuf, int(mr.indexBlockSize))
|
||||
ps.p.indexFile.MustReadAt(ps.compressedIndexBuf, int64(mr.indexBlockOffset))
|
||||
@@ -347,8 +325,8 @@ func (ps *partSearch) getInmemoryBlock(bh *blockHeader) (*inmemoryBlock, bool, e
|
||||
if err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
ok := ps.ibCache.Put(ibKey, ib)
|
||||
return ib, !ok, nil
|
||||
ps.ibCache.Put(ibKey, ib)
|
||||
return ib, false, nil
|
||||
}
|
||||
|
||||
func (ps *partSearch) readInmemoryBlock(bh *blockHeader) (*inmemoryBlock, error) {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user