mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-30 14:06:39 +03:00
Compare commits
51 Commits
v1.92.0
...
docs/vmano
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b819157995 | ||
|
|
783dc8fd88 | ||
|
|
1d4a0796f4 | ||
|
|
527cfa2f74 | ||
|
|
fc01167962 | ||
|
|
2290252119 | ||
|
|
d890038a94 | ||
|
|
aef31f201a | ||
|
|
327f63e408 | ||
|
|
4c854c3ae2 | ||
|
|
c47138e1b0 | ||
|
|
64e24e9e2b | ||
|
|
2e4d0d0e41 | ||
|
|
1043fc1fd9 | ||
|
|
e311a7bf80 | ||
|
|
8895fb1d5f | ||
|
|
5d73a07cc3 | ||
|
|
093d43de45 | ||
|
|
061f68fe5e | ||
|
|
df37a47d4b | ||
|
|
8f4961fbbd | ||
|
|
0c3d61b211 | ||
|
|
f1a4c0b614 | ||
|
|
5aed369132 | ||
|
|
833ab331b1 | ||
|
|
6289a21d24 | ||
|
|
8d08237923 | ||
|
|
837445b81b | ||
|
|
f35d27aa2b | ||
|
|
d322ee4b35 | ||
|
|
525c44e916 | ||
|
|
216d4091f7 | ||
|
|
3b524f671b | ||
|
|
9ede3e996b | ||
|
|
4eecd4d0b3 | ||
|
|
1a43ee11d1 | ||
|
|
3f6efab6ae | ||
|
|
9abf8535ac | ||
|
|
4283eb4626 | ||
|
|
8e38efaa7b | ||
|
|
d18ff993e6 | ||
|
|
e3ef3df938 | ||
|
|
9082a84566 | ||
|
|
c25f053945 | ||
|
|
fcf5e33e6a | ||
|
|
116623a0f0 | ||
|
|
b4fca28c29 | ||
|
|
8f257889cc | ||
|
|
9f1b9b86cc | ||
|
|
f8d30a486e | ||
|
|
f3b5c9c9fb |
2
.github/workflows/check-licenses.yml
vendored
2
.github/workflows/check-licenses.yml
vendored
@@ -17,7 +17,7 @@ jobs:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@main
|
||||
with:
|
||||
go-version: 1.20.6
|
||||
go-version: 1.20.7
|
||||
id: go
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@master
|
||||
|
||||
2
.github/workflows/codeql-analysis.yml
vendored
2
.github/workflows/codeql-analysis.yml
vendored
@@ -57,7 +57,7 @@ jobs:
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.6
|
||||
go-version: 1.20.7
|
||||
check-latest: true
|
||||
cache: true
|
||||
if: ${{ matrix.language == 'go' }}
|
||||
|
||||
6
.github/workflows/main.yml
vendored
6
.github/workflows/main.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.6
|
||||
go-version: 1.20.7
|
||||
check-latest: true
|
||||
cache: true
|
||||
|
||||
@@ -56,7 +56,7 @@ jobs:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.6
|
||||
go-version: 1.20.7
|
||||
check-latest: true
|
||||
cache: true
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
id: go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.6
|
||||
go-version: 1.20.7
|
||||
check-latest: true
|
||||
cache: true
|
||||
|
||||
|
||||
@@ -117,6 +117,7 @@ Case studies:
|
||||
* [Groove X](https://docs.victoriametrics.com/CaseStudies.html#groove-x)
|
||||
* [Idealo.de](https://docs.victoriametrics.com/CaseStudies.html#idealode)
|
||||
* [MHI Vestas Offshore Wind](https://docs.victoriametrics.com/CaseStudies.html#mhi-vestas-offshore-wind)
|
||||
* [Naver](https://docs.victoriametrics.com/CaseStudies.html#naver)
|
||||
* [Razorpay](https://docs.victoriametrics.com/CaseStudies.html#razorpay)
|
||||
* [Percona](https://docs.victoriametrics.com/CaseStudies.html#percona)
|
||||
* [Roblox](https://docs.victoriametrics.com/CaseStudies.html#roblox)
|
||||
@@ -1360,7 +1361,7 @@ VictoriaMetrics also may scrape Prometheus targets - see [these docs](#how-to-sc
|
||||
|
||||
## Sending data via OpenTelemetry
|
||||
|
||||
VictoriaMetrics supports data ingestion via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md) at `/opentemetry/api/v1/push` path.
|
||||
VictoriaMetrics supports data ingestion via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md) at `/opentelemetry/api/v1/push` path.
|
||||
|
||||
VictoriaMetrics expects `protobuf`-encoded requests at `/opentelemetry/api/v1/push`.
|
||||
Set HTTP request header `Content-Encoding: gzip` when sending gzip-compressed data to `/opentelemetry/api/v1/push`.
|
||||
@@ -1781,7 +1782,7 @@ created by community.
|
||||
|
||||
Graphs on the dashboards contain useful hints - hover the `i` icon in the top left corner of each graph to read it.
|
||||
|
||||
We recommend setting up [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml)
|
||||
We recommend setting up [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts)
|
||||
via [vmalert](https://docs.victoriametrics.com/vmalert.html) or via Prometheus.
|
||||
|
||||
VictoriaMetrics exposes currently running queries and their execution times at `/api/v1/status/active_queries` page.
|
||||
|
||||
@@ -121,24 +121,25 @@ func initStorageMetrics(strg *logstorage.Storage) *metrics.Set {
|
||||
return float64(m().FileMergesTotal)
|
||||
})
|
||||
|
||||
ms.NewGauge(`vl_rows{type="inmemory"}`, func() float64 {
|
||||
ms.NewGauge(`vl_storage_rows{type="inmemory"}`, func() float64 {
|
||||
return float64(m().InmemoryRowsCount)
|
||||
})
|
||||
ms.NewGauge(`vl_rows{type="file"}`, func() float64 {
|
||||
ms.NewGauge(`vl_storage_rows{type="file"}`, func() float64 {
|
||||
return float64(m().FileRowsCount)
|
||||
})
|
||||
ms.NewGauge(`vl_parts{type="inmemory"}`, func() float64 {
|
||||
ms.NewGauge(`vl_storage_parts{type="inmemory"}`, func() float64 {
|
||||
return float64(m().InmemoryParts)
|
||||
})
|
||||
ms.NewGauge(`vl_parts{type="file"}`, func() float64 {
|
||||
ms.NewGauge(`vl_storage_parts{type="file"}`, func() float64 {
|
||||
return float64(m().FileParts)
|
||||
})
|
||||
ms.NewGauge(`vl_blocks{type="inmemory"}`, func() float64 {
|
||||
ms.NewGauge(`vl_storage_blocks{type="inmemory"}`, func() float64 {
|
||||
return float64(m().InmemoryBlocks)
|
||||
})
|
||||
ms.NewGauge(`vl_blocks{type="file"}`, func() float64 {
|
||||
ms.NewGauge(`vl_storage_blocks{type="file"}`, func() float64 {
|
||||
return float64(m().FileBlocks)
|
||||
})
|
||||
|
||||
ms.NewGauge(`vl_partitions`, func() float64 {
|
||||
return float64(m().PartitionsCount)
|
||||
})
|
||||
@@ -146,6 +147,24 @@ func initStorageMetrics(strg *logstorage.Storage) *metrics.Set {
|
||||
return float64(m().StreamsCreatedTotal)
|
||||
})
|
||||
|
||||
ms.NewGauge(`vl_indexdb_rows`, func() float64 {
|
||||
return float64(m().IndexdbItemsCount)
|
||||
})
|
||||
ms.NewGauge(`vl_indexdb_parts`, func() float64 {
|
||||
return float64(m().IndexdbPartsCount)
|
||||
})
|
||||
ms.NewGauge(`vl_indexdb_blocks`, func() float64 {
|
||||
return float64(m().IndexdbBlocksCount)
|
||||
})
|
||||
|
||||
ms.NewGauge(`vl_data_size_bytes{type="indexdb"}`, func() float64 {
|
||||
return float64(m().IndexdbSizeBytes)
|
||||
})
|
||||
ms.NewGauge(`vl_data_size_bytes{type="storage"}`, func() float64 {
|
||||
dm := m()
|
||||
return float64(dm.CompressedInmemorySize + dm.CompressedFileSize)
|
||||
})
|
||||
|
||||
ms.NewGauge(`vl_compressed_data_size_bytes{type="inmemory"}`, func() float64 {
|
||||
return float64(m().CompressedInmemorySize)
|
||||
})
|
||||
|
||||
@@ -1571,7 +1571,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.sendTimeout array
|
||||
Timeout for sending a single block of data to the corresponding -remoteWrite.url
|
||||
Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.shardByURL
|
||||
Whether to shard outgoing series across all the remote storage systems enumerated via -remoteWrite.url . By default the data is replicated across all the -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages
|
||||
|
||||
@@ -29,7 +29,7 @@ var (
|
||||
rateLimit = flagutil.NewArrayInt("remoteWrite.rateLimit", "Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. "+
|
||||
"By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data "+
|
||||
"is sent after temporary unavailability of the remote storage")
|
||||
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", "Timeout for sending a single block of data to the corresponding -remoteWrite.url")
|
||||
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", "Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m)")
|
||||
proxyURL = flagutil.NewArrayString("remoteWrite.proxyURL", "Optional proxy URL for writing data to the corresponding -remoteWrite.url. "+
|
||||
"Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234")
|
||||
|
||||
|
||||
@@ -74,7 +74,6 @@ test-vmalert:
|
||||
go test -v -race -cover ./app/vmalert/config
|
||||
go test -v -race -cover ./app/vmalert/remotewrite
|
||||
go test -v -race -cover ./app/vmalert/utils
|
||||
go test -v -race -cover ./app/vmalert/unittest
|
||||
|
||||
run-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules/rules2-good.rules \
|
||||
@@ -104,10 +103,6 @@ replay-vmalert: vmalert
|
||||
-replay.timeFrom=2021-05-11T07:21:43Z \
|
||||
-replay.timeTo=2021-05-29T18:40:43Z
|
||||
|
||||
unittest-vmalert: vmalert
|
||||
./bin/vmalert -unittestFile=app/vmalert/unittest/testdata/test1.yaml \
|
||||
-unittestFile=app/vmalert/unittest/testdata/test2.yaml
|
||||
|
||||
vmalert-linux-amd64:
|
||||
APP_NAME=vmalert CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
|
||||
@@ -361,9 +361,9 @@ For recording rules to work `-remoteWrite.url` must be specified.
|
||||
|
||||
### Alerts state on restarts
|
||||
|
||||
`vmalert` is stateless, it holds alerts state in the process memory. Restarting of `vmalert` process
|
||||
will reset alerts state in memory. To prevent `vmalert` from losing alerts state it should be configured
|
||||
to persist the state to the remote destination via the following flags:
|
||||
`vmalert` holds alerts state in the memory. Restart of the `vmalert` process will reset the state of all active alerts
|
||||
in the memory. To prevent `vmalert` from losing the state on restarts configure it to persist the state
|
||||
to the remote database via the following flags:
|
||||
|
||||
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or vminsert (Cluster). `vmalert` will persist alerts state
|
||||
to the configured address in the form of [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series)
|
||||
@@ -378,7 +378,7 @@ to persist the state to the remote destination via the following flags:
|
||||
Both flags are required for proper state restoration. Restore process may fail if time series are missing
|
||||
in configured `-remoteRead.url`, weren't updated in the last `1h` (controlled by `-remoteRead.lookback`)
|
||||
or received state doesn't match current `vmalert` rules configuration. `vmalert` marks successfully restored rules
|
||||
with `restored` label in [web UI](#WEB).
|
||||
with `restored` label in [web UI](#web).
|
||||
|
||||
### Multitenancy
|
||||
|
||||
@@ -519,7 +519,7 @@ Alertmanagers.
|
||||
|
||||
To avoid recording rules results and alerts state duplication in VictoriaMetrics server
|
||||
don't forget to configure [deduplication](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#deduplication).
|
||||
The recommended value for `-dedup.minScrapeInterval` must be greater or equal to vmalert `evaluation_interval`.
|
||||
The recommended value for `-dedup.minScrapeInterval` must be a multiple of vmalert's `evaluation_interval`.
|
||||
If you observe inconsistent or "jumping" values in series produced by vmalert, try disabling `-datasource.queryTimeAlignment`
|
||||
command line flag. Because of alignment, two or more vmalert HA pairs will produce results with the same timestamps.
|
||||
But due to backfilling (data delivered to the datasource with some delay) values of such results may differ,
|
||||
@@ -742,249 +742,6 @@ See full description for these flags in `./vmalert -help`.
|
||||
* `limit` group's param has no effect during replay (might be changed in future);
|
||||
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
|
||||
|
||||
## Unit Testing for Rules
|
||||
|
||||
> Unit testing is available from v1.92.0.
|
||||
> Unit tests do not respect `-clusterMode` for now.
|
||||
|
||||
You can use `vmalert` to run unit tests for alerting and recording rules.
|
||||
In unit test mode vmalert performs the following actions:
|
||||
* sets up an isolated VictoriaMetrics instance;
|
||||
* simulates the periodic ingestion of time series;
|
||||
* queries the ingested data for recording and alerting rules evaluation;
|
||||
* tests whether the firing alerts or resulting recording rules match the expected results.
|
||||
|
||||
See how to run vmalert in unit test mode below:
|
||||
```
|
||||
# Run vmalert with one or multiple test files via -unittestFile cmd-line flag
|
||||
./vmalert -unittestFile=test1.yaml -unittestFile=test2.yaml
|
||||
```
|
||||
|
||||
vmalert is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
|
||||
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert
|
||||
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
|
||||
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
|
||||
|
||||
### Test file format
|
||||
|
||||
The configuration format for files specified in `-unittestFile` cmd-line flag is the following:
|
||||
```
|
||||
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
|
||||
# Enterprise version of vmalert supports S3 and GCS paths to rules.
|
||||
rule_files:
|
||||
[ - <string> ]
|
||||
|
||||
# The evaluation interval for rules specified in `rule_files`
|
||||
[ evaluation_interval: <duration> | default = 1m ]
|
||||
|
||||
# Groups listed below will be evaluated by order.
|
||||
# Not all groups need to be mentioned; groups that are not listed will be evaluated in the order they are defined in rule_files.
|
||||
group_eval_order:
|
||||
[ - <string> ]
|
||||
|
||||
# The list of unit test files to be checked during evaluation.
|
||||
tests:
|
||||
[ - <test_group> ]
|
||||
```
|
||||
|
||||
#### `<test_group>`
|
||||
|
||||
```
|
||||
# Interval between samples for input series
|
||||
interval: <duration>
|
||||
# Time series to persist into the database according to configured <interval> before running tests.
|
||||
input_series:
|
||||
[ - <series> ]
|
||||
|
||||
# Name of the test group, optional
|
||||
[ name: <string> ]
|
||||
|
||||
# Unit tests for alerting rules
|
||||
alert_rule_test:
|
||||
[ - <alert_test_case> ]
|
||||
|
||||
# Unit tests for Metricsql expressions.
|
||||
metricsql_expr_test:
|
||||
[ - <metricsql_expr_test> ]
|
||||
|
||||
# External labels accessible for templating.
|
||||
external_labels:
|
||||
[ <labelname>: <string> ... ]
|
||||
|
||||
```
|
||||
|
||||
#### `<series>`
|
||||
|
||||
```
|
||||
# series in the following format '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
series: <string>
|
||||
|
||||
# values support several special equations:
|
||||
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
|
||||
# Read this as series starts at a, then c further samples incrementing by b.
|
||||
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
|
||||
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
|
||||
# '_' represents a missing sample from scrape
|
||||
# 'stale' indicates a stale sample
|
||||
# Examples:
|
||||
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
|
||||
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
|
||||
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
|
||||
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
|
||||
values: <string>
|
||||
```
|
||||
|
||||
#### `<alert_test_case>`
|
||||
|
||||
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
|
||||
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
|
||||
but no need to add them under `exp_alerts`.
|
||||
You can also pass `--disableAlertgroupLabel` to prevent vmalert from adding `alertgroup` label.
|
||||
|
||||
```
|
||||
# The time elapsed from time=0s when this alerting rule should be checked.
|
||||
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
|
||||
eval_time: <duration>
|
||||
|
||||
# Name of the group to be tested.
|
||||
groupname: <string>
|
||||
|
||||
# Name of the alert to be tested.
|
||||
alertname: <string>
|
||||
|
||||
# List of the expected alerts that are firing under the given alertname at
|
||||
# the given evaluation time. If you want to test if an alerting rule should
|
||||
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
|
||||
exp_alerts:
|
||||
[ - <alert> ]
|
||||
```
|
||||
|
||||
#### `<alert>`
|
||||
|
||||
```
|
||||
# These are the expanded labels and annotations of the expected alert.
|
||||
# Note: labels also include the labels of the sample associated with the alert
|
||||
exp_labels:
|
||||
[ <labelname>: <string> ]
|
||||
exp_annotations:
|
||||
[ <labelname>: <string> ]
|
||||
```
|
||||
|
||||
#### `<metricsql_expr_test>`
|
||||
|
||||
```
|
||||
# Expression to evaluate
|
||||
expr: <string>
|
||||
|
||||
# The time elapsed from time=0s when this expression should be evaluated.
|
||||
eval_time: <duration>
|
||||
|
||||
# Expected samples at the given evaluation time.
|
||||
exp_samples:
|
||||
[ - <sample> ]
|
||||
```
|
||||
|
||||
#### `<sample>`
|
||||
|
||||
```
|
||||
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
labels: <string>
|
||||
|
||||
# The expected value of the Metricsql expression.
|
||||
value: <number>
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
This is an example input file for unit testing which will pass.
|
||||
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
|
||||
|
||||
With `rules.yaml` in the same directory, run `./vmalert -unittestFile=./unittest/testdata/test.yaml`.
|
||||
|
||||
#### `test.yaml`
|
||||
|
||||
```
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="prometheus", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: prometheus
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
```
|
||||
|
||||
#### `alerts.yaml`
|
||||
|
||||
```
|
||||
# This is the rules file.
|
||||
|
||||
groups:
|
||||
- name: group1
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- alert: AlwaysFiring
|
||||
expr: 1
|
||||
|
||||
- name: group2
|
||||
rules:
|
||||
- record: job:test:count_over_time1m
|
||||
expr: sum without(instance) (count_over_time(test[1m]))
|
||||
- record: suquery_interval_test
|
||||
expr: count_over_time(up[5m:])
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
|
||||
@@ -1486,8 +1243,6 @@ The shortlist of configuration flags is the following:
|
||||
See https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage
|
||||
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-rule.configCheckInterval duration
|
||||
Interval for checking for changes in '-rule' files. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead
|
||||
-rule.maxResolveDuration duration
|
||||
Limits the maximum duration for automatic alert expiration, which by default is 4 times evaluationInterval of the parent group.
|
||||
-rule.resendDelay duration
|
||||
@@ -1535,11 +1290,6 @@ The shortlist of configuration flags is the following:
|
||||
Path to file with TLS key if -tls is set. The provided key file is automatically re-read every second, so it can be dynamically updated
|
||||
-tlsMinVersion string
|
||||
Optional minimum TLS version to use for incoming requests over HTTPS if -tls is set. Supported values: TLS10, TLS11, TLS12, TLS13
|
||||
-unittestFile array
|
||||
Path to the unit test files. When set, vmalert starts in unit test mode and performs only tests on configured files.
|
||||
Examples:
|
||||
-unittestFile="./unittest/testdata/test1.yaml,./unittest/testdata/test2.yaml".
|
||||
See more information here https://docs.victoriametrics.com/vmalert.html#unit-testing-for-rules.
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
@@ -1607,7 +1357,7 @@ dns_sd_configs:
|
||||
port: 9093
|
||||
```
|
||||
|
||||
The list of configured or discovered Notifiers can be explored via [UI](#Web).
|
||||
The list of configured or discovered Notifiers can be explored via [UI](#web).
|
||||
If Alertmanager runs in cluster mode then all its URLs need to be available during discovery
|
||||
to ensure [high availability](https://github.com/prometheus/alertmanager#high-availability).
|
||||
|
||||
|
||||
@@ -274,7 +274,7 @@ func (g *Group) close() {
|
||||
|
||||
var skipRandSleepOnGroupStart bool
|
||||
|
||||
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
|
||||
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
|
||||
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
||||
@@ -422,7 +422,7 @@ type executor struct {
|
||||
notifiers func() []notifier.Notifier
|
||||
notifierHeaders map[string]string
|
||||
|
||||
rw remotewrite.RWClient
|
||||
rw *remotewrite.Client
|
||||
|
||||
previouslySentSeriesToRWMu sync.Mutex
|
||||
// previouslySentSeriesToRW stores series sent to RW on previous iteration
|
||||
|
||||
@@ -55,9 +55,6 @@ absolute path to all .tpl files in root.
|
||||
-rule.templates="dir/**/*.tpl". Includes all the .tpl files in "dir" subfolders recursively.
|
||||
`)
|
||||
|
||||
rulesCheckInterval = flag.Duration("rule.configCheckInterval", 0, "Interval for checking for changes in '-rule' files. "+
|
||||
"By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead")
|
||||
|
||||
configCheckInterval = flag.Duration("configCheckInterval", 0, "Interval for checking for changes in '-rule' or '-notifier.config' files. "+
|
||||
"By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes.")
|
||||
|
||||
@@ -91,12 +88,7 @@ absolute path to all .tpl files in root.
|
||||
|
||||
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
|
||||
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.")
|
||||
unitTestFiles = flagutil.NewArrayString("unittestFile", `Path to the unit test files. When set, vmalert starts in unit test mode and performs only tests on configured files.
|
||||
Examples:
|
||||
-unittestFile="./unittest/testdata/test1.yaml,./unittest/testdata/test2.yaml".
|
||||
See more information here https://docs.victoriametrics.com/vmalert.html#unit-testing-for-rules.
|
||||
`)
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.")
|
||||
)
|
||||
|
||||
var alertURLGeneratorFn notifier.AlertURLGenerator
|
||||
@@ -122,13 +114,6 @@ func main() {
|
||||
logger.Fatalf("failed to parse %q: %s", *ruleTemplatesPath, err)
|
||||
}
|
||||
|
||||
if len(*unitTestFiles) > 0 {
|
||||
if unitRule(*unitTestFiles...) {
|
||||
os.Exit(1)
|
||||
}
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
if *dryRun {
|
||||
groups, err := config.Parse(*rulePath, notifier.ValidateTemplates, true)
|
||||
if err != nil {
|
||||
@@ -323,10 +308,6 @@ See the docs at https://docs.victoriametrics.com/vmalert.html .
|
||||
func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sighupCh <-chan os.Signal) {
|
||||
var configCheckCh <-chan time.Time
|
||||
checkInterval := *configCheckInterval
|
||||
if checkInterval == 0 && *rulesCheckInterval > 0 {
|
||||
logger.Warnf("flag `rule.configCheckInterval` is deprecated - use `configCheckInterval` instead")
|
||||
checkInterval = *rulesCheckInterval
|
||||
}
|
||||
if checkInterval > 0 {
|
||||
ticker := time.NewTicker(checkInterval)
|
||||
configCheckCh = ticker.C
|
||||
@@ -338,8 +319,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
||||
validateTplFn = notifier.ValidateTemplates
|
||||
}
|
||||
|
||||
// init reload metrics with positive values to improve alerting conditions
|
||||
setConfigSuccess(fasttime.UnixTimestamp())
|
||||
// init metrics for config state with positive values to improve alerting conditions
|
||||
setConfigSuccessAt(fasttime.UnixTimestamp())
|
||||
|
||||
parseFn := config.Parse
|
||||
for {
|
||||
select {
|
||||
@@ -377,11 +359,12 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
||||
}
|
||||
if configsEqual(newGroupsCfg, groupsCfg) {
|
||||
templates.Reload()
|
||||
// set success to 1 since previous reload
|
||||
// could have been unsuccessful
|
||||
// set success to 1 since previous reload could have been unsuccessful
|
||||
// do not update configTimestamp as config version remains old.
|
||||
configSuccess.Set(1)
|
||||
setConfigError(nil)
|
||||
// config didn't change - skip it
|
||||
// reset the last config error since the config change was rolled back
|
||||
setLastConfigErr(nil)
|
||||
// config didn't change - skip iteration
|
||||
continue
|
||||
}
|
||||
if err := m.update(ctx, newGroupsCfg, false); err != nil {
|
||||
@@ -391,7 +374,7 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
||||
}
|
||||
templates.Reload()
|
||||
groupsCfg = newGroupsCfg
|
||||
setConfigSuccess(fasttime.UnixTimestamp())
|
||||
setConfigSuccessAt(fasttime.UnixTimestamp())
|
||||
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
||||
}
|
||||
}
|
||||
@@ -408,39 +391,36 @@ func configsEqual(a, b []config.Group) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// setConfigSuccess sets config reload status to 1.
|
||||
func setConfigSuccess(at uint64) {
|
||||
// setConfigSuccessAt updates config related metrics as successful.
|
||||
func setConfigSuccessAt(at uint64) {
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(at)
|
||||
// reset the error if any
|
||||
setConfigErr(nil)
|
||||
// reset the lastConfigErr
|
||||
setLastConfigErr(nil)
|
||||
}
|
||||
|
||||
// setConfigError sets config reload status to 0.
|
||||
// setConfigError updates config related metrics according to the error.
|
||||
func setConfigError(err error) {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
setConfigErr(err)
|
||||
setLastConfigErr(err)
|
||||
}
|
||||
|
||||
var (
|
||||
configErrMu sync.RWMutex
|
||||
// configErr represent the error message from the last
|
||||
// config reload.
|
||||
configErr error
|
||||
lastConfigErrMu sync.RWMutex
|
||||
// lastConfigErr represent the error message from the last config reload.
|
||||
// The message is used in web UI as notification
|
||||
lastConfigErr error
|
||||
)
|
||||
|
||||
func setConfigErr(err error) {
|
||||
configErrMu.Lock()
|
||||
configErr = err
|
||||
configErrMu.Unlock()
|
||||
func setLastConfigErr(err error) {
|
||||
lastConfigErrMu.Lock()
|
||||
lastConfigErr = err
|
||||
lastConfigErrMu.Unlock()
|
||||
}
|
||||
|
||||
func configError() error {
|
||||
configErrMu.RLock()
|
||||
defer configErrMu.RUnlock()
|
||||
if configErr != nil {
|
||||
return configErr
|
||||
}
|
||||
return nil
|
||||
func getLastConfigError() error {
|
||||
lastConfigErrMu.RLock()
|
||||
defer lastConfigErrMu.RUnlock()
|
||||
return lastConfigErr
|
||||
}
|
||||
|
||||
@@ -87,12 +87,13 @@ groups:
|
||||
)
|
||||
|
||||
f, err := os.CreateTemp("", "")
|
||||
defer os.Remove(f.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
writeToFile(t, f.Name(), rules1)
|
||||
|
||||
*rulesCheckInterval = 200 * time.Millisecond
|
||||
*configCheckInterval = 200 * time.Millisecond
|
||||
*rulePath = []string{f.Name()}
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
@@ -117,14 +118,37 @@ groups:
|
||||
return len(m.groups)
|
||||
}
|
||||
|
||||
time.Sleep(*rulesCheckInterval * 2)
|
||||
// checkCfg compares the recorded config reload state with the expectation.
// A non-nil err means the last reload is expected to have failed: a config
// error must be stored and the configSuccess metric must be 0. A nil err
// means the reload is expected to have succeeded: no config error may be
// stored and configSuccess must be 1.
checkCfg := func(err error) {
	cErr := getLastConfigError()
	cfgSuc := configSuccess.Get()
	if err != nil {
		if cErr == nil {
			// Report the expected error; cErr is nil in this branch and
			// would only print a formatting placeholder.
			t.Fatalf("expected to have config error %s; got nil instead", err)
		}
		if cfgSuc != 0 {
			t.Fatalf("expected to have metric configSuccess to be set to 0; got %d instead", cfgSuc)
		}
		return
	}

	if cErr != nil {
		t.Fatalf("unexpected config error: %s", cErr)
	}
	if cfgSuc != 1 {
		t.Fatalf("expected to have metric configSuccess to be set to 1; got %d instead", cfgSuc)
	}
}
|
||||
|
||||
time.Sleep(*configCheckInterval * 2)
|
||||
checkCfg(nil)
|
||||
groupsLen := lenLocked(m)
|
||||
if groupsLen != 1 {
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
}
|
||||
|
||||
writeToFile(t, f.Name(), rules2)
|
||||
time.Sleep(*rulesCheckInterval * 2)
|
||||
time.Sleep(*configCheckInterval * 2)
|
||||
checkCfg(nil)
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 2 {
|
||||
fmt.Println(m.groups)
|
||||
@@ -133,7 +157,8 @@ groups:
|
||||
|
||||
writeToFile(t, f.Name(), rules1)
|
||||
procutil.SelfSIGHUP()
|
||||
time.Sleep(*rulesCheckInterval / 2)
|
||||
time.Sleep(*configCheckInterval / 2)
|
||||
checkCfg(nil)
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 1 {
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
@@ -141,7 +166,8 @@ groups:
|
||||
|
||||
writeToFile(t, f.Name(), `corrupted`)
|
||||
procutil.SelfSIGHUP()
|
||||
time.Sleep(*rulesCheckInterval / 2)
|
||||
time.Sleep(*configCheckInterval / 2)
|
||||
checkCfg(fmt.Errorf("config error"))
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 1 { // should remain unchanged
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
|
||||
@@ -19,7 +19,7 @@ type manager struct {
|
||||
querierBuilder datasource.QuerierBuilder
|
||||
notifiers func() []notifier.Notifier
|
||||
|
||||
rw remotewrite.RWClient
|
||||
rw *remotewrite.Client
|
||||
// remote read builder.
|
||||
rr datasource.QuerierBuilder
|
||||
|
||||
|
||||
@@ -257,7 +257,7 @@ func TestManagerUpdate(t *testing.T) {
|
||||
func TestManagerUpdateNegative(t *testing.T) {
|
||||
testCases := []struct {
|
||||
notifiers []notifier.Notifier
|
||||
rw remotewrite.RWClient
|
||||
rw *remotewrite.Client
|
||||
cfg config.Group
|
||||
expErr string
|
||||
}{
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
|
||||
func TestConfigWatcherReload(t *testing.T) {
|
||||
f, err := os.CreateTemp("", "")
if err != nil {
	t.Fatal(err)
}
// Register the cleanup only after the error check: arguments of a deferred
// call are evaluated immediately, so f.Name() on the nil file returned by a
// failed CreateTemp would panic before t.Fatal could report the real error.
defer os.Remove(f.Name())
|
||||
@@ -36,6 +37,7 @@ static_configs:
|
||||
}
|
||||
|
||||
f2, err := os.CreateTemp("", "")
if err != nil {
	t.Fatal(err)
}
// Register the cleanup only after the error check: arguments of a deferred
// call are evaluated immediately, so f2.Name() on the nil file returned by a
// failed CreateTemp would panic before t.Fatal could report the real error.
defer os.Remove(f2.Name())
|
||||
@@ -63,6 +65,7 @@ func TestConfigWatcherStart(t *testing.T) {
|
||||
defer consulSDServer.Close()
|
||||
|
||||
consulSDFile, err := os.CreateTemp("", "")
if err != nil {
	t.Fatal(err)
}
// Register the cleanup only after the error check: arguments of a deferred
// call are evaluated immediately, so Name() on the nil file returned by a
// failed CreateTemp would panic before t.Fatal could report the real error.
defer os.Remove(consulSDFile.Name())
|
||||
@@ -109,6 +112,7 @@ func TestConfigWatcherReloadConcurrent(t *testing.T) {
|
||||
defer consulSDServer2.Close()
|
||||
|
||||
consulSDFile, err := os.CreateTemp("", "")
if err != nil {
	t.Fatal(err)
}
// Register the cleanup only after the error check: arguments of a deferred
// call are evaluated immediately, so Name() on the nil file returned by a
// failed CreateTemp would panic before t.Fatal could report the real error.
defer os.Remove(consulSDFile.Name())
|
||||
@@ -125,6 +129,7 @@ consul_sd_configs:
|
||||
`, consulSDServer1.URL, consulSDServer2.URL))
|
||||
|
||||
staticAndConsulSDFile, err := os.CreateTemp("", "")
if err != nil {
	t.Fatal(err)
}
// Register the cleanup only after the error check: arguments of a deferred
// call are evaluated immediately, so Name() on the nil file returned by a
// failed CreateTemp would panic before t.Fatal could report the real error.
defer os.Remove(staticAndConsulSDFile.Name())
|
||||
|
||||
@@ -1,320 +0,0 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// Command-line flags controlling how remote-write requests are sent.
var (
	// disablePathAppend turns off appending '/api/v1/write' to the request path.
	disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
	// sendTimeout is the timeout for sending data to the remote-write URL.
	sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
	// retryMinInterval is the initial delay between retry attempts.
	// The help text refers to -remoteWrite.retryMaxTime, the companion flag declared below.
	retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxTime")
	// retryMaxTime bounds the total time spent on retries of one failed request.
	retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
|
||||
|
||||
// Client is an asynchronous HTTP client for writing
|
||||
// timeseries via remote write protocol.
|
||||
type Client struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
authCfg *promauth.Config
|
||||
input chan prompbmarshal.TimeSeries
|
||||
flushInterval time.Duration
|
||||
maxBatchSize int
|
||||
maxQueueSize int
|
||||
|
||||
wg sync.WaitGroup
|
||||
doneCh chan struct{}
|
||||
}
|
||||
|
||||
// Config is config for remote write.
|
||||
type Config struct {
|
||||
// Addr of remote storage
|
||||
Addr string
|
||||
AuthCfg *promauth.Config
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
// Default values for remote-write client settings.
const (
	// defaultConcurrency is the default number of flushing goroutines.
	defaultConcurrency = 4
	// defaultMaxBatchSize is the default max number of series per flush.
	defaultMaxBatchSize = 1e3
	// defaultMaxQueueSize is the default capacity of the input queue.
	defaultMaxQueueSize = 1e5
	// defaultFlushInterval is the default period between time-based flushes.
	defaultFlushInterval = 5 * time.Second
	// defaultWriteTimeout is the default write timeout.
	defaultWriteTimeout = 30 * time.Second
)
|
||||
|
||||
// NewClient returns asynchronous client for
|
||||
// writing timeseries via remotewrite protocol.
|
||||
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
if cfg.Addr == "" {
|
||||
return nil, fmt.Errorf("config.Addr can't be empty")
|
||||
}
|
||||
if cfg.MaxBatchSize == 0 {
|
||||
cfg.MaxBatchSize = defaultMaxBatchSize
|
||||
}
|
||||
if cfg.MaxQueueSize == 0 {
|
||||
cfg.MaxQueueSize = defaultMaxQueueSize
|
||||
}
|
||||
if cfg.FlushInterval == 0 {
|
||||
cfg.FlushInterval = defaultFlushInterval
|
||||
}
|
||||
if cfg.Transport == nil {
|
||||
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
|
||||
}
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/"),
|
||||
authCfg: cfg.AuthCfg,
|
||||
flushInterval: cfg.FlushInterval,
|
||||
maxBatchSize: cfg.MaxBatchSize,
|
||||
maxQueueSize: cfg.MaxQueueSize,
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push adds timeseries into queue for writing into remote storage.
|
||||
// Push returns and error if client is stopped or if queue is full.
|
||||
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
return fmt.Errorf("client is closed")
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Close stops the client and waits for all goroutines
|
||||
// to exit.
|
||||
func (c *Client) Close() error {
|
||||
if c.doneCh == nil {
|
||||
return fmt.Errorf("client is already closed")
|
||||
}
|
||||
close(c.input)
|
||||
close(c.doneCh)
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) run(ctx context.Context) {
|
||||
ticker := time.NewTicker(c.flushInterval)
|
||||
wr := &prompbmarshal.WriteRequest{}
|
||||
shutdown := func() {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
shutdown()
|
||||
return
|
||||
case <-ctx.Done():
|
||||
shutdown()
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.flush(ctx, wr)
|
||||
case ts, ok := <-c.input:
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
if len(wr.Timeseries) >= c.maxBatchSize {
|
||||
c.flush(ctx, wr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var (
|
||||
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
|
||||
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
|
||||
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
|
||||
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
|
||||
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
|
||||
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
|
||||
|
||||
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
|
||||
return float64(*concurrency)
|
||||
})
|
||||
)
|
||||
|
||||
// flush is a blocking function that marshals WriteRequest and sends
|
||||
// it to remote-write endpoint. Flush performs limited amount of retries
|
||||
// if request fails.
|
||||
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
defer prompbmarshal.ResetWriteRequest(wr)
|
||||
defer bufferFlushDuration.UpdateDuration(time.Now())
|
||||
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
logger.Errorf("failed to marshal WriteRequest: %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
b := snappy.Encode(nil, data)
|
||||
|
||||
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
|
||||
if retryInterval > maxRetryInterval {
|
||||
retryInterval = maxRetryInterval
|
||||
}
|
||||
timeStart := time.Now()
|
||||
defer sendDuration.Add(time.Since(timeStart).Seconds())
|
||||
L:
|
||||
for attempts := 0; ; attempts++ {
|
||||
err := c.send(ctx, b)
|
||||
if err == nil {
|
||||
sentRows.Add(len(wr.Timeseries))
|
||||
sentBytes.Add(len(b))
|
||||
return
|
||||
}
|
||||
|
||||
_, isNotRetriable := err.(*nonRetriableError)
|
||||
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
|
||||
|
||||
if isNotRetriable {
|
||||
// exit fast if error isn't retriable
|
||||
break
|
||||
}
|
||||
|
||||
// check if request has been cancelled before backoff
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
|
||||
break L
|
||||
default:
|
||||
}
|
||||
|
||||
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
|
||||
if timeLeftForRetries < 0 {
|
||||
// the max retry time has passed, so we give up
|
||||
break
|
||||
}
|
||||
|
||||
if retryInterval > timeLeftForRetries {
|
||||
retryInterval = timeLeftForRetries
|
||||
}
|
||||
// sleeping to prevent remote db hammering
|
||||
time.Sleep(retryInterval)
|
||||
retryInterval *= 2
|
||||
|
||||
}
|
||||
|
||||
droppedRows.Add(len(wr.Timeseries))
|
||||
droppedBytes.Add(len(b))
|
||||
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
|
||||
len(wr.Timeseries))
|
||||
}
|
||||
|
||||
func (c *Client) send(ctx context.Context, data []byte) error {
|
||||
r := bytes.NewReader(data)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if c.authCfg != nil {
|
||||
c.authCfg.SetHeaders(req, true)
|
||||
}
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
|
||||
// according to https://prometheus.io/docs/concepts/remote_write_spec/
|
||||
// Prometheus remote Write compatible receivers MUST
|
||||
switch resp.StatusCode / 100 {
|
||||
case 2:
|
||||
// respond with a HTTP 2xx status code when the write is successful.
|
||||
return nil
|
||||
case 4:
|
||||
if resp.StatusCode != http.StatusTooManyRequests {
|
||||
// MUST NOT retry write requests on HTTP 4xx responses other than 429
|
||||
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)}
|
||||
}
|
||||
fallthrough
|
||||
default:
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
||||
}
|
||||
|
||||
// nonRetriableError wraps an error to mark a failure that must not be retried.
type nonRetriableError struct {
	// err is the wrapped error.
	err error
}

// Error implements the error interface by returning the wrapped error's message.
func (e *nonRetriableError) Error() string {
	return e.err.Error()
}
|
||||
@@ -1,97 +0,0 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// DebugClient won't push series periodically, but will write data to remote endpoint
// immediately when Push() is called
type DebugClient struct {
	// addr is the remote-write address with a trailing "/" removed.
	addr string
	// c is the HTTP client used for sending requests.
	c *http.Client

	// wg tracks in-flight Push calls so that Close can wait for them.
	wg sync.WaitGroup
}
|
||||
|
||||
// NewDebugClient initiates and returns a new DebugClient
|
||||
func NewDebugClient() (*DebugClient, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &DebugClient{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: t,
|
||||
},
|
||||
addr: strings.TrimSuffix(*addr, "/"),
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push sends the given timeseries to the remote storage.
|
||||
func (c *DebugClient) Push(s prompbmarshal.TimeSeries) error {
|
||||
c.wg.Add(1)
|
||||
defer c.wg.Done()
|
||||
wr := &prompbmarshal.WriteRequest{Timeseries: []prompbmarshal.TimeSeries{s}}
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal the given time series: %w", err)
|
||||
}
|
||||
|
||||
return c.send(data)
|
||||
}
|
||||
|
||||
// Close stops the DebugClient
|
||||
func (c *DebugClient) Close() error {
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *DebugClient) send(data []byte) error {
|
||||
b := snappy.Encode(nil, data)
|
||||
r := bytes.NewReader(b)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode/100 == 2 {
|
||||
return nil
|
||||
}
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
||||
@@ -1,50 +0,0 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestDebugClient_Push(t *testing.T) {
|
||||
testSrv := newRWServer()
|
||||
oldAddr := *addr
|
||||
*addr = testSrv.URL
|
||||
defer func() {
|
||||
*addr = oldAddr
|
||||
}()
|
||||
|
||||
client, err := NewDebugClient()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create debug client: %s", err)
|
||||
}
|
||||
|
||||
const rowsN = 100
|
||||
var sent int
|
||||
for i := 0; i < rowsN; i++ {
|
||||
s := prompbmarshal.TimeSeries{
|
||||
Samples: []prompbmarshal.Sample{{
|
||||
Value: float64(i),
|
||||
Timestamp: time.Now().Unix(),
|
||||
}},
|
||||
}
|
||||
err := client.Push(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
if err == nil {
|
||||
sent++
|
||||
}
|
||||
}
|
||||
if sent == 0 {
|
||||
t.Fatalf("0 series sent")
|
||||
}
|
||||
if err := client.Close(); err != nil {
|
||||
t.Fatalf("failed to close client: %s", err)
|
||||
}
|
||||
got := testSrv.accepted()
|
||||
if got != sent {
|
||||
t.Fatalf("expected to have %d series; got %d", sent, got)
|
||||
}
|
||||
}
|
||||
@@ -1,13 +1,320 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// RWClient represents an HTTP client for pushing data via remote write protocol
|
||||
type RWClient interface {
|
||||
// Push pushes the given time series to remote storage
|
||||
Push(s prompbmarshal.TimeSeries) error
|
||||
// Close stops the client. Client can't be reused after Close call.
|
||||
Close() error
|
||||
// Command-line flags controlling how remote-write requests are sent.
var (
	// disablePathAppend turns off appending '/api/v1/write' to the request path.
	disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
	// sendTimeout is the timeout for sending data to the remote-write URL.
	sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
	// retryMinInterval is the initial delay between retry attempts.
	// The help text refers to -remoteWrite.retryMaxTime, the companion flag declared below.
	retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxTime")
	// retryMaxTime bounds the total time spent on retries of one failed request.
	retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
|
||||
|
||||
// Client is an asynchronous HTTP client for writing
|
||||
// timeseries via remote write protocol.
|
||||
type Client struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
authCfg *promauth.Config
|
||||
input chan prompbmarshal.TimeSeries
|
||||
flushInterval time.Duration
|
||||
maxBatchSize int
|
||||
maxQueueSize int
|
||||
|
||||
wg sync.WaitGroup
|
||||
doneCh chan struct{}
|
||||
}
|
||||
|
||||
// Config is config for remote write.
|
||||
type Config struct {
|
||||
// Addr of remote storage
|
||||
Addr string
|
||||
AuthCfg *promauth.Config
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
// Default values for remote-write client settings.
const (
	// defaultConcurrency is the default number of flushing goroutines.
	defaultConcurrency = 4
	// defaultMaxBatchSize is the default max number of series per flush.
	defaultMaxBatchSize = 1e3
	// defaultMaxQueueSize is the default capacity of the input queue.
	defaultMaxQueueSize = 1e5
	// defaultFlushInterval is the default period between time-based flushes.
	defaultFlushInterval = 5 * time.Second
	// defaultWriteTimeout is the default write timeout.
	defaultWriteTimeout = 30 * time.Second
)
|
||||
|
||||
// NewClient returns asynchronous client for
|
||||
// writing timeseries via remotewrite protocol.
|
||||
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
if cfg.Addr == "" {
|
||||
return nil, fmt.Errorf("config.Addr can't be empty")
|
||||
}
|
||||
if cfg.MaxBatchSize == 0 {
|
||||
cfg.MaxBatchSize = defaultMaxBatchSize
|
||||
}
|
||||
if cfg.MaxQueueSize == 0 {
|
||||
cfg.MaxQueueSize = defaultMaxQueueSize
|
||||
}
|
||||
if cfg.FlushInterval == 0 {
|
||||
cfg.FlushInterval = defaultFlushInterval
|
||||
}
|
||||
if cfg.Transport == nil {
|
||||
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
|
||||
}
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/"),
|
||||
authCfg: cfg.AuthCfg,
|
||||
flushInterval: cfg.FlushInterval,
|
||||
maxBatchSize: cfg.MaxBatchSize,
|
||||
maxQueueSize: cfg.MaxQueueSize,
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push adds timeseries into queue for writing into remote storage.
|
||||
// Push returns and error if client is stopped or if queue is full.
|
||||
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
return fmt.Errorf("client is closed")
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Close stops the client and waits for all goroutines
|
||||
// to exit.
|
||||
func (c *Client) Close() error {
|
||||
if c.doneCh == nil {
|
||||
return fmt.Errorf("client is already closed")
|
||||
}
|
||||
close(c.input)
|
||||
close(c.doneCh)
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) run(ctx context.Context) {
|
||||
ticker := time.NewTicker(c.flushInterval)
|
||||
wr := &prompbmarshal.WriteRequest{}
|
||||
shutdown := func() {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
shutdown()
|
||||
return
|
||||
case <-ctx.Done():
|
||||
shutdown()
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.flush(ctx, wr)
|
||||
case ts, ok := <-c.input:
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
if len(wr.Timeseries) >= c.maxBatchSize {
|
||||
c.flush(ctx, wr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var (
|
||||
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
|
||||
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
|
||||
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
|
||||
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
|
||||
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
|
||||
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
|
||||
|
||||
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
|
||||
return float64(*concurrency)
|
||||
})
|
||||
)
|
||||
|
||||
// flush is a blocking function that marshals WriteRequest and sends
|
||||
// it to remote-write endpoint. Flush performs limited amount of retries
|
||||
// if request fails.
|
||||
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
defer prompbmarshal.ResetWriteRequest(wr)
|
||||
defer bufferFlushDuration.UpdateDuration(time.Now())
|
||||
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
logger.Errorf("failed to marshal WriteRequest: %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
b := snappy.Encode(nil, data)
|
||||
|
||||
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
|
||||
if retryInterval > maxRetryInterval {
|
||||
retryInterval = maxRetryInterval
|
||||
}
|
||||
timeStart := time.Now()
|
||||
defer sendDuration.Add(time.Since(timeStart).Seconds())
|
||||
L:
|
||||
for attempts := 0; ; attempts++ {
|
||||
err := c.send(ctx, b)
|
||||
if err == nil {
|
||||
sentRows.Add(len(wr.Timeseries))
|
||||
sentBytes.Add(len(b))
|
||||
return
|
||||
}
|
||||
|
||||
_, isNotRetriable := err.(*nonRetriableError)
|
||||
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
|
||||
|
||||
if isNotRetriable {
|
||||
// exit fast if error isn't retriable
|
||||
break
|
||||
}
|
||||
|
||||
// check if request has been cancelled before backoff
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
|
||||
break L
|
||||
default:
|
||||
}
|
||||
|
||||
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
|
||||
if timeLeftForRetries < 0 {
|
||||
// the max retry time has passed, so we give up
|
||||
break
|
||||
}
|
||||
|
||||
if retryInterval > timeLeftForRetries {
|
||||
retryInterval = timeLeftForRetries
|
||||
}
|
||||
// sleeping to prevent remote db hammering
|
||||
time.Sleep(retryInterval)
|
||||
retryInterval *= 2
|
||||
|
||||
}
|
||||
|
||||
droppedRows.Add(len(wr.Timeseries))
|
||||
droppedBytes.Add(len(b))
|
||||
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
|
||||
len(wr.Timeseries))
|
||||
}
|
||||
|
||||
func (c *Client) send(ctx context.Context, data []byte) error {
|
||||
r := bytes.NewReader(data)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if c.authCfg != nil {
|
||||
c.authCfg.SetHeaders(req, true)
|
||||
}
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
|
||||
// according to https://prometheus.io/docs/concepts/remote_write_spec/
|
||||
// Prometheus remote Write compatible receivers MUST
|
||||
switch resp.StatusCode / 100 {
|
||||
case 2:
|
||||
// respond with a HTTP 2xx status code when the write is successful.
|
||||
return nil
|
||||
case 4:
|
||||
if resp.StatusCode != http.StatusTooManyRequests {
|
||||
// MUST NOT retry write requests on HTTP 4xx responses other than 429
|
||||
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)}
|
||||
}
|
||||
fallthrough
|
||||
default:
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
||||
}
|
||||
|
||||
// nonRetriableError wraps an error to mark a failure that must not be retried.
type nonRetriableError struct {
	// err is the wrapped error.
	err error
}

// Error implements the error interface by returning the wrapped error's message.
func (e *nonRetriableError) Error() string {
	return e.err.Error()
}
|
||||
|
||||
@@ -33,7 +33,7 @@ var (
|
||||
"Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode.")
|
||||
)
|
||||
|
||||
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw remotewrite.RWClient) error {
|
||||
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error {
|
||||
if *replayMaxDatapoints < 1 {
|
||||
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
|
||||
}
|
||||
@@ -78,7 +78,7 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw remotewri
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) replay(start, end time.Time, rw remotewrite.RWClient) int {
|
||||
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
|
||||
var total int
|
||||
step := g.Interval * time.Duration(*replayMaxDatapoints)
|
||||
ri := rangeIterator{start: start, end: end, step: step}
|
||||
@@ -119,7 +119,7 @@ func (g *Group) replay(start, end time.Time, rw remotewrite.RWClient) int {
|
||||
return total
|
||||
}
|
||||
|
||||
func replayRule(rule Rule, start, end time.Time, rw remotewrite.RWClient) (int, error) {
|
||||
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
|
||||
var err error
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for i := 0; i < *replayRuleRetryAttempts; i++ {
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestUnitRule(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
disableGroupLabel bool
|
||||
files []string
|
||||
failed bool
|
||||
}{
|
||||
{
|
||||
name: "run multi files",
|
||||
files: []string{"./unittest/testdata/test1.yaml", "./unittest/testdata/test2.yaml"},
|
||||
failed: false,
|
||||
},
|
||||
{
|
||||
name: "disable group label",
|
||||
disableGroupLabel: true,
|
||||
files: []string{"./unittest/testdata/disable-group-label.yaml"},
|
||||
failed: false,
|
||||
},
|
||||
{
|
||||
name: "failing test",
|
||||
files: []string{"./unittest/testdata/failed-test.yaml"},
|
||||
failed: true,
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
oldFlag := *disableAlertGroupLabel
|
||||
*disableAlertGroupLabel = tc.disableGroupLabel
|
||||
fail := unitRule(tc.files...)
|
||||
if fail != tc.failed {
|
||||
t.Fatalf("failed to test %s, expect %t, got %t", tc.name, tc.failed, fail)
|
||||
}
|
||||
*disableAlertGroupLabel = oldFlag
|
||||
}
|
||||
}
|
||||
@@ -1,436 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
|
||||
vmalertconfig "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/unittest"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/promremotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
storagePath string
|
||||
// insert series from 1970-01-01T00:00:00
|
||||
testStartTime = time.Unix(0, 0).UTC()
|
||||
|
||||
testPromWriteHTTPPath = "http://127.0.0.1" + *httpListenAddr + "/api/v1/write"
|
||||
testDataSourcePath = "http://127.0.0.1" + *httpListenAddr + "/prometheus"
|
||||
testRemoteWritePath = "http://127.0.0.1" + *httpListenAddr
|
||||
testHealthHTTPPath = "http://127.0.0.1" + *httpListenAddr + "/health"
|
||||
)
|
||||
|
||||
const (
|
||||
testStoragePath = "vmalert-unittest"
|
||||
testLogLevel = "ERROR"
|
||||
)
|
||||
|
||||
func unitRule(files ...string) bool {
|
||||
storagePath = filepath.Join(os.TempDir(), testStoragePath)
|
||||
processFlags()
|
||||
vminsert.Init()
|
||||
vmselect.Init()
|
||||
// storagePath will be created again when closing vmselect, so remove it again.
|
||||
defer fs.MustRemoveAll(storagePath)
|
||||
defer vminsert.Stop()
|
||||
defer vmselect.Stop()
|
||||
return rulesUnitTest(files...)
|
||||
}
|
||||
|
||||
func rulesUnitTest(files ...string) bool {
|
||||
var failed bool
|
||||
for _, f := range files {
|
||||
if err := ruleUnitTest(f); err != nil {
|
||||
fmt.Println(" FAILED")
|
||||
fmt.Printf("\nfailed to run unit test for file %q: \n%s", f, err)
|
||||
failed = true
|
||||
} else {
|
||||
fmt.Println(" SUCCESS")
|
||||
}
|
||||
}
|
||||
return failed
|
||||
}
|
||||
|
||||
func ruleUnitTest(filename string) []error {
|
||||
fmt.Println("\nUnit Testing: ", filename)
|
||||
b, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to read file: %w", err)}
|
||||
}
|
||||
|
||||
var unitTestInp unitTestFile
|
||||
if err := yaml.UnmarshalStrict(b, &unitTestInp); err != nil {
|
||||
return []error{fmt.Errorf("failed to unmarshal file: %w", err)}
|
||||
}
|
||||
if err := resolveAndGlobFilepaths(filepath.Dir(filename), &unitTestInp); err != nil {
|
||||
return []error{fmt.Errorf("failed to resolve path for `rule_files`: %w", err)}
|
||||
}
|
||||
|
||||
if unitTestInp.EvaluationInterval.Duration() == 0 {
|
||||
fmt.Println("evaluation_interval set to 1m by default")
|
||||
unitTestInp.EvaluationInterval = &promutils.Duration{D: 1 * time.Minute}
|
||||
}
|
||||
|
||||
groupOrderMap := make(map[string]int)
|
||||
for i, gn := range unitTestInp.GroupEvalOrder {
|
||||
if _, ok := groupOrderMap[gn]; ok {
|
||||
return []error{fmt.Errorf("group name repeated in `group_eval_order`: %s", gn)}
|
||||
}
|
||||
groupOrderMap[gn] = i
|
||||
}
|
||||
|
||||
testGroups, err := vmalertconfig.Parse(unitTestInp.RuleFiles, nil, true)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to parse `rule_files`: %w", err)}
|
||||
}
|
||||
|
||||
var errs []error
|
||||
for _, t := range unitTestInp.Tests {
|
||||
if err := verifyTestGroup(t); err != nil {
|
||||
errs = append(errs, err)
|
||||
continue
|
||||
}
|
||||
testErrs := t.test(unitTestInp.EvaluationInterval.Duration(), groupOrderMap, testGroups)
|
||||
errs = append(errs, testErrs...)
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errs
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyTestGroup(group testGroup) error {
|
||||
var testGroupName string
|
||||
if group.TestGroupName != "" {
|
||||
testGroupName = fmt.Sprintf("testGroupName: %s\n", group.TestGroupName)
|
||||
}
|
||||
for _, at := range group.AlertRuleTests {
|
||||
if at.Alertname == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"alertname\"", testGroupName)
|
||||
}
|
||||
if !*disableAlertGroupLabel && at.GroupName == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"groupname\" when flag \"disableAlertGroupLabel\" is false", testGroupName)
|
||||
}
|
||||
if at.EvalTime == nil {
|
||||
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
|
||||
}
|
||||
}
|
||||
for _, et := range group.MetricsqlExprTests {
|
||||
if et.Expr == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"expr\"", testGroupName)
|
||||
}
|
||||
if et.EvalTime == nil {
|
||||
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func processFlags() {
|
||||
flag.Parse()
|
||||
for _, fv := range []struct {
|
||||
flag string
|
||||
value string
|
||||
}{
|
||||
{flag: "storageDataPath", value: storagePath},
|
||||
{flag: "loggerLevel", value: testLogLevel},
|
||||
{flag: "search.disableCache", value: "true"},
|
||||
// set storage retention time to 100 years, allow to store series from 1970-01-01T00:00:00.
|
||||
{flag: "retentionPeriod", value: "100y"},
|
||||
{flag: "datasource.url", value: testDataSourcePath},
|
||||
{flag: "remoteWrite.url", value: testRemoteWritePath},
|
||||
} {
|
||||
// panics if flag doesn't exist
|
||||
if err := flag.Lookup(fv.flag).Value.Set(fv.value); err != nil {
|
||||
logger.Fatalf("unable to set %q with value %q, err: %v", fv.flag, fv.value, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func setUp() {
|
||||
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
|
||||
go httpserver.Serve(*httpListenAddr, false, func(w http.ResponseWriter, r *http.Request) bool {
|
||||
switch r.URL.Path {
|
||||
case "/prometheus/api/v1/query":
|
||||
if err := prometheus.QueryHandler(nil, time.Now(), w, r); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
}
|
||||
return true
|
||||
case "/prometheus/api/v1/write", "/api/v1/write":
|
||||
if err := promremotewrite.InsertHandler(r); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
}
|
||||
return true
|
||||
default:
|
||||
}
|
||||
return false
|
||||
})
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
readyCheckFunc := func() bool {
|
||||
resp, err := http.Get(testHealthHTTPPath)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
_ = resp.Body.Close()
|
||||
return resp.StatusCode == 200
|
||||
}
|
||||
checkCheck:
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Fatalf("http server can't be ready in 30s")
|
||||
default:
|
||||
if readyCheckFunc() {
|
||||
break checkCheck
|
||||
}
|
||||
time.Sleep(3 * time.Second)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func tearDown() {
|
||||
if err := httpserver.Stop(*httpListenAddr); err != nil {
|
||||
logger.Errorf("cannot stop the webservice: %s", err)
|
||||
}
|
||||
vmstorage.Stop()
|
||||
metrics.UnregisterAllMetrics()
|
||||
fs.MustRemoveAll(storagePath)
|
||||
}
|
||||
|
||||
// resolveAndGlobFilepaths joins all relative paths in a configuration
|
||||
// with a given base directory and replaces all globs with matching files.
|
||||
func resolveAndGlobFilepaths(baseDir string, utf *unitTestFile) error {
|
||||
for i, rf := range utf.RuleFiles {
|
||||
if rf != "" && !filepath.IsAbs(rf) {
|
||||
utf.RuleFiles[i] = filepath.Join(baseDir, rf)
|
||||
}
|
||||
}
|
||||
|
||||
var globbedFiles []string
|
||||
for _, rf := range utf.RuleFiles {
|
||||
m, err := filepath.Glob(rf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(m) == 0 {
|
||||
fmt.Fprintln(os.Stderr, " WARNING: no file match pattern", rf)
|
||||
}
|
||||
globbedFiles = append(globbedFiles, m...)
|
||||
}
|
||||
utf.RuleFiles = globbedFiles
|
||||
return nil
|
||||
}
|
||||
|
||||
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group) (checkErrs []error) {
|
||||
// set up vmstorage and http server for ingest and read queries
|
||||
setUp()
|
||||
// tear down vmstorage and clean the data dir
|
||||
defer tearDown()
|
||||
|
||||
err := unittest.WriteInputSeries(tg.InputSeries, tg.Interval, testStartTime, testPromWriteHTTPPath)
|
||||
if err != nil {
|
||||
return []error{err}
|
||||
}
|
||||
|
||||
q, err := datasource.Init(nil)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to init datasource: %v", err)}
|
||||
}
|
||||
rw, err := remotewrite.NewDebugClient()
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to init wr: %v", err)}
|
||||
}
|
||||
|
||||
alertEvalTimesMap := map[time.Duration]struct{}{}
|
||||
alertExpResultMap := map[time.Duration]map[string]map[string][]unittest.ExpAlert{}
|
||||
for _, at := range tg.AlertRuleTests {
|
||||
et := at.EvalTime.Duration()
|
||||
alertEvalTimesMap[et] = struct{}{}
|
||||
if _, ok := alertExpResultMap[et]; !ok {
|
||||
alertExpResultMap[et] = make(map[string]map[string][]unittest.ExpAlert)
|
||||
}
|
||||
if _, ok := alertExpResultMap[et][at.GroupName]; !ok {
|
||||
alertExpResultMap[et][at.GroupName] = make(map[string][]unittest.ExpAlert)
|
||||
}
|
||||
alertExpResultMap[et][at.GroupName][at.Alertname] = at.ExpAlerts
|
||||
}
|
||||
alertEvalTimes := make([]time.Duration, 0, len(alertEvalTimesMap))
|
||||
for k := range alertEvalTimesMap {
|
||||
alertEvalTimes = append(alertEvalTimes, k)
|
||||
}
|
||||
sort.Slice(alertEvalTimes, func(i, j int) bool {
|
||||
return alertEvalTimes[i] < alertEvalTimes[j]
|
||||
})
|
||||
|
||||
// sort group eval order according to the given "group_eval_order".
|
||||
sort.Slice(testGroups, func(i, j int) bool {
|
||||
return groupOrderMap[testGroups[i].Name] < groupOrderMap[testGroups[j].Name]
|
||||
})
|
||||
|
||||
// create groups with given rule
|
||||
var groups []*Group
|
||||
for _, group := range testGroups {
|
||||
ng := newGroup(group, q, *evaluationInterval, tg.ExternalLabels)
|
||||
groups = append(groups, ng)
|
||||
}
|
||||
|
||||
e := &executor{
|
||||
rw: rw,
|
||||
notifiers: func() []notifier.Notifier { return nil },
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
|
||||
evalIndex := 0
|
||||
maxEvalTime := testStartTime.Add(tg.maxEvalTime())
|
||||
for ts := testStartTime; ts.Before(maxEvalTime) || ts.Equal(maxEvalTime); ts = ts.Add(evalInterval) {
|
||||
for _, g := range groups {
|
||||
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
|
||||
errs := e.execConcurrently(context.Background(), g.Rules, ts, g.Concurrency, resolveDuration, g.Limit)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\nfailed to exec group: %q, time: %s, err: %w", g.Name,
|
||||
ts, err))
|
||||
}
|
||||
}
|
||||
// flush series after each group evaluation
|
||||
vmstorage.Storage.DebugFlush()
|
||||
}
|
||||
|
||||
// check alert_rule_test case at every eval time
|
||||
for evalIndex < len(alertEvalTimes) {
|
||||
if ts.Sub(testStartTime) > alertEvalTimes[evalIndex] ||
|
||||
alertEvalTimes[evalIndex] >= ts.Add(evalInterval).Sub(testStartTime) {
|
||||
break
|
||||
}
|
||||
gotAlertsMap := map[string]map[string]unittest.LabelsAndAnnotations{}
|
||||
for _, g := range groups {
|
||||
if *disableAlertGroupLabel {
|
||||
g.Name = ""
|
||||
}
|
||||
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name]; !ok {
|
||||
continue
|
||||
}
|
||||
if _, ok := gotAlertsMap[g.Name]; !ok {
|
||||
gotAlertsMap[g.Name] = make(map[string]unittest.LabelsAndAnnotations)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
ar, isAlertRule := rule.(*AlertingRule)
|
||||
if !isAlertRule {
|
||||
continue
|
||||
}
|
||||
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name][ar.Name]; ok {
|
||||
for _, got := range ar.alerts {
|
||||
if got.State != notifier.StateFiring {
|
||||
continue
|
||||
}
|
||||
laa := unittest.LabelAndAnnotation{
|
||||
Labels: datasource.ConvertToLabels(got.Labels),
|
||||
Annotations: datasource.ConvertToLabels(got.Annotations),
|
||||
}
|
||||
gotAlertsMap[g.Name][ar.Name] = append(gotAlertsMap[g.Name][ar.Name], laa)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
for groupname, gres := range alertExpResultMap[alertEvalTimes[evalIndex]] {
|
||||
for alertname, res := range gres {
|
||||
var expAlerts unittest.LabelsAndAnnotations
|
||||
for _, expAlert := range res {
|
||||
if expAlert.ExpLabels == nil {
|
||||
expAlert.ExpLabels = make(map[string]string)
|
||||
}
|
||||
// alertGroupNameLabel is added as additional labels when `disableAlertGroupLabel` is false
|
||||
if !*disableAlertGroupLabel {
|
||||
expAlert.ExpLabels[alertGroupNameLabel] = groupname
|
||||
}
|
||||
// alertNameLabel is added as additional labels in vmalert.
|
||||
expAlert.ExpLabels[alertNameLabel] = alertname
|
||||
expAlerts = append(expAlerts, unittest.LabelAndAnnotation{
|
||||
Labels: datasource.ConvertToLabels(expAlert.ExpLabels),
|
||||
Annotations: datasource.ConvertToLabels(expAlert.ExpAnnotations),
|
||||
})
|
||||
}
|
||||
sort.Sort(expAlerts)
|
||||
|
||||
gotAlerts := gotAlertsMap[groupname][alertname]
|
||||
sort.Sort(gotAlerts)
|
||||
if !reflect.DeepEqual(expAlerts, gotAlerts) {
|
||||
var testGroupName string
|
||||
if tg.TestGroupName != "" {
|
||||
testGroupName = fmt.Sprintf("testGroupName: %s,\n", tg.TestGroupName)
|
||||
}
|
||||
expString := unittest.IndentLines(expAlerts.String(), " ")
|
||||
gotString := unittest.IndentLines(gotAlerts.String(), " ")
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n%s groupname: %s, alertname: %s, time: %s, \n exp:%v, \n got:%v ",
|
||||
testGroupName, groupname, alertname, alertEvalTimes[evalIndex].String(), expString, gotString))
|
||||
}
|
||||
}
|
||||
}
|
||||
evalIndex++
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
checkErrs = append(checkErrs, unittest.CheckMetricsqlCase(tg.MetricsqlExprTests, q)...)
|
||||
return checkErrs
|
||||
}
|
||||
|
||||
// unitTestFile holds the contents of a single unit test file
|
||||
type unitTestFile struct {
|
||||
RuleFiles []string `yaml:"rule_files"`
|
||||
EvaluationInterval *promutils.Duration `yaml:"evaluation_interval"`
|
||||
GroupEvalOrder []string `yaml:"group_eval_order"`
|
||||
Tests []testGroup `yaml:"tests"`
|
||||
}
|
||||
|
||||
// testGroup is a group of input series and test cases associated with it
|
||||
type testGroup struct {
|
||||
Interval *promutils.Duration `yaml:"interval"`
|
||||
InputSeries []unittest.Series `yaml:"input_series"`
|
||||
AlertRuleTests []unittest.AlertTestCase `yaml:"alert_rule_test"`
|
||||
MetricsqlExprTests []unittest.MetricsqlTestCase `yaml:"metricsql_expr_test"`
|
||||
ExternalLabels map[string]string `yaml:"external_labels"`
|
||||
TestGroupName string `yaml:"name"`
|
||||
}
|
||||
|
||||
// maxEvalTime returns the max eval time among all alert_rule_test and metricsql_expr_test
|
||||
func (tg *testGroup) maxEvalTime() time.Duration {
|
||||
var maxd time.Duration
|
||||
for _, alert := range tg.AlertRuleTests {
|
||||
if alert.EvalTime.Duration() > maxd {
|
||||
maxd = alert.EvalTime.Duration()
|
||||
}
|
||||
}
|
||||
for _, met := range tg.MetricsqlExprTests {
|
||||
if met.EvalTime.Duration() > maxd {
|
||||
maxd = met.EvalTime.Duration()
|
||||
}
|
||||
}
|
||||
return maxd
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
// AlertTestCase holds alert_rule_test cases defined in test file
|
||||
type AlertTestCase struct {
|
||||
EvalTime *promutils.Duration `yaml:"eval_time"`
|
||||
GroupName string `yaml:"groupname"`
|
||||
Alertname string `yaml:"alertname"`
|
||||
ExpAlerts []ExpAlert `yaml:"exp_alerts"`
|
||||
}
|
||||
|
||||
// ExpAlert holds exp_alerts defined in test file
|
||||
type ExpAlert struct {
|
||||
ExpLabels map[string]string `yaml:"exp_labels"`
|
||||
ExpAnnotations map[string]string `yaml:"exp_annotations"`
|
||||
}
|
||||
@@ -1,182 +0,0 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
testutil "github.com/VictoriaMetrics/VictoriaMetrics/app/victoria-metrics/test"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// Series is one input_series entry of a test file.
type Series struct {
	// Series is the series selector, e.g. a metric name with labels.
	Series string `yaml:"series"`
	// Values is the space-separated value sequence in expanding notation.
	Values string `yaml:"values"`
}

// sequenceValue is a single item of a parsed value sequence; the item may be
// omitted, in which case Value carries no meaning.
type sequenceValue struct {
	Value   float64
	Omitted bool
}
|
||||
|
||||
func httpWrite(address string, r io.Reader) {
|
||||
resp, err := http.Post(address, "", r)
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to send to storage: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
// WriteInputSeries send input series to vmstorage and flush them
|
||||
func WriteInputSeries(input []Series, interval *promutils.Duration, startStamp time.Time, dst string) error {
|
||||
r := testutil.WriteRequest{}
|
||||
for _, data := range input {
|
||||
expr, err := metricsql.Parse(data.Series)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse series %s: %v", data.Series, err)
|
||||
}
|
||||
promvals, err := parseInputValue(data.Values, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse input series value %s: %v", data.Values, err)
|
||||
}
|
||||
metricExpr, ok := expr.(*metricsql.MetricExpr)
|
||||
if !ok {
|
||||
return fmt.Errorf("failed to parse series %s to metric expr: %v", data.Series, err)
|
||||
}
|
||||
samples := make([]testutil.Sample, 0, len(promvals))
|
||||
ts := startStamp
|
||||
for _, v := range promvals {
|
||||
if !v.Omitted {
|
||||
samples = append(samples, testutil.Sample{
|
||||
Timestamp: ts.UnixMilli(),
|
||||
Value: v.Value,
|
||||
})
|
||||
}
|
||||
ts = ts.Add(interval.Duration())
|
||||
}
|
||||
var ls []testutil.Label
|
||||
for _, filter := range metricExpr.LabelFilterss[0] {
|
||||
ls = append(ls, testutil.Label{Name: filter.Label, Value: filter.Value})
|
||||
}
|
||||
r.Timeseries = append(r.Timeseries, testutil.TimeSeries{Labels: ls, Samples: samples})
|
||||
}
|
||||
|
||||
data, err := testutil.Compress(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compress data: %v", err)
|
||||
}
|
||||
// write input series to vm
|
||||
httpWrite(dst, bytes.NewBuffer(data))
|
||||
vmstorage.Storage.DebugFlush()
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseInputValue support input like "1", "1+1x1 _ -4 3+20x1", see more examples in test.
|
||||
func parseInputValue(input string, origin bool) ([]sequenceValue, error) {
|
||||
var res []sequenceValue
|
||||
items := strings.Split(input, " ")
|
||||
reg := regexp.MustCompile(`\D?\d*\D?`)
|
||||
for _, item := range items {
|
||||
if item == "stale" {
|
||||
res = append(res, sequenceValue{Value: decimal.StaleNaN})
|
||||
continue
|
||||
}
|
||||
vals := reg.FindAllString(item, -1)
|
||||
switch len(vals) {
|
||||
case 1:
|
||||
if vals[0] == "_" {
|
||||
res = append(res, sequenceValue{Omitted: true})
|
||||
continue
|
||||
}
|
||||
v, err := strconv.ParseFloat(vals[0], 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, sequenceValue{Value: v})
|
||||
continue
|
||||
case 2:
|
||||
p1 := vals[0][:len(vals[0])-1]
|
||||
v2, err := strconv.ParseInt(vals[1], 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
option := vals[0][len(vals[0])-1]
|
||||
switch option {
|
||||
case '+':
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, sequenceValue{Value: v1 + float64(v2)})
|
||||
case 'x':
|
||||
for i := int64(0); i <= v2; i++ {
|
||||
if p1 == "_" {
|
||||
if i == 0 {
|
||||
i = 1
|
||||
}
|
||||
res = append(res, sequenceValue{Omitted: true})
|
||||
continue
|
||||
}
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !origin || v1 == 0 {
|
||||
res = append(res, sequenceValue{Value: v1 * float64(i)})
|
||||
continue
|
||||
}
|
||||
newVal := fmt.Sprintf("%s+0x%s", p1, vals[1])
|
||||
newRes, err := parseInputValue(newVal, false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, newRes...)
|
||||
break
|
||||
}
|
||||
|
||||
default:
|
||||
return nil, fmt.Errorf("got invalid operation %b", option)
|
||||
}
|
||||
case 3:
|
||||
r1, err := parseInputValue(fmt.Sprintf("%s%s", vals[1], vals[2]), false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
p1 := vals[0][:len(vals[0])-1]
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
option := vals[0][len(vals[0])-1]
|
||||
var isAdd bool
|
||||
if option == '+' {
|
||||
isAdd = true
|
||||
}
|
||||
for _, r := range r1 {
|
||||
if isAdd {
|
||||
res = append(res, sequenceValue{
|
||||
Value: r.Value + v1,
|
||||
})
|
||||
} else {
|
||||
res = append(res, sequenceValue{
|
||||
Value: v1 - r.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported input %s", input)
|
||||
}
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
@@ -1,93 +0,0 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
)
|
||||
|
||||
func TestParseInputValue(t *testing.T) {
|
||||
testCases := []struct {
|
||||
input string
|
||||
exp []sequenceValue
|
||||
failed bool
|
||||
}{
|
||||
{
|
||||
"",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
{
|
||||
"testfailed",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
// stale doesn't support operations
|
||||
{
|
||||
"stalex3",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
{
|
||||
"-4",
|
||||
[]sequenceValue{{Value: -4}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"_",
|
||||
[]sequenceValue{{Omitted: true}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"stale",
|
||||
[]sequenceValue{{Value: decimal.StaleNaN}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"-4x1",
|
||||
[]sequenceValue{{Value: -4}, {Value: -4}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"_x1",
|
||||
[]sequenceValue{{Omitted: true}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"1+1x4",
|
||||
[]sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 4}, {Value: 5}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"2-1x4",
|
||||
[]sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"1+1x1 _ -4 stale 3+20x1",
|
||||
[]sequenceValue{{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4}, {Value: decimal.StaleNaN}, {Value: 3}, {Value: 23}},
|
||||
false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
output, err := parseInputValue(tc.input, true)
|
||||
if err != nil != tc.failed {
|
||||
t.Fatalf("failed to parse %s, expect %t, got %t", tc.input, tc.failed, err != nil)
|
||||
}
|
||||
if len(tc.exp) != len(output) {
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
for i := 0; i < len(tc.exp); i++ {
|
||||
if tc.exp[i].Omitted != output[i].Omitted {
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
if tc.exp[i].Value != output[i].Value {
|
||||
if decimal.IsStaleNaN(tc.exp[i].Value) && decimal.IsStaleNaN(output[i].Value) {
|
||||
continue
|
||||
}
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,92 +0,0 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// MetricsqlTestCase holds metricsql_expr_test cases defined in test file
|
||||
type MetricsqlTestCase struct {
|
||||
Expr string `yaml:"expr"`
|
||||
EvalTime *promutils.Duration `yaml:"eval_time"`
|
||||
ExpSamples []expSample `yaml:"exp_samples"`
|
||||
}
|
||||
|
||||
type expSample struct {
|
||||
Labels string `yaml:"labels"`
|
||||
Value float64 `yaml:"value"`
|
||||
}
|
||||
|
||||
// CheckMetricsqlCase will check metricsql_expr_test cases
|
||||
func CheckMetricsqlCase(cases []MetricsqlTestCase, q datasource.QuerierBuilder) (checkErrs []error) {
|
||||
queries := q.BuildWithParams(datasource.QuerierParams{QueryParams: url.Values{"nocache": {"1"}, "latency_offset": {"1ms"}}, DataSourceType: "prometheus"})
|
||||
Outer:
|
||||
for _, mt := range cases {
|
||||
result, _, err := queries.Query(context.Background(), mt.Expr, mt.EvalTime.ParseTime())
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf(" expr: %q, time: %s, err: %w", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), err))
|
||||
continue
|
||||
}
|
||||
var gotSamples []parsedSample
|
||||
for _, s := range result.Data {
|
||||
sort.Slice(s.Labels, func(i, j int) bool {
|
||||
return s.Labels[i].Name < s.Labels[j].Name
|
||||
})
|
||||
gotSamples = append(gotSamples, parsedSample{
|
||||
Labels: s.Labels,
|
||||
Value: s.Values[0],
|
||||
})
|
||||
}
|
||||
var expSamples []parsedSample
|
||||
for _, s := range mt.ExpSamples {
|
||||
expLb := datasource.Labels{}
|
||||
if s.Labels != "" {
|
||||
metricsqlExpr, err := metricsql.Parse(s.Labels)
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), fmt.Errorf("failed to parse labels %q: %w", s.Labels, err)))
|
||||
continue Outer
|
||||
}
|
||||
metricsqlMetricExpr, ok := metricsqlExpr.(*metricsql.MetricExpr)
|
||||
if !ok {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), fmt.Errorf("got unsupported metricsql type")))
|
||||
continue Outer
|
||||
}
|
||||
for _, l := range metricsqlMetricExpr.LabelFilterss[0] {
|
||||
expLb = append(expLb, datasource.Label{
|
||||
Name: l.Label,
|
||||
Value: l.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
sort.Slice(expLb, func(i, j int) bool {
|
||||
return expLb[i].Name < expLb[j].Name
|
||||
})
|
||||
expSamples = append(expSamples, parsedSample{
|
||||
Labels: expLb,
|
||||
Value: s.Value,
|
||||
})
|
||||
}
|
||||
sort.Slice(expSamples, func(i, j int) bool {
|
||||
return datasource.LabelCompare(expSamples[i].Labels, expSamples[j].Labels) <= 0
|
||||
})
|
||||
sort.Slice(gotSamples, func(i, j int) bool {
|
||||
return datasource.LabelCompare(gotSamples[i].Labels, gotSamples[j].Labels) <= 0
|
||||
})
|
||||
if !reflect.DeepEqual(expSamples, gotSamples) {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s,\n exp: %v\n got: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), parsedSamplesString(expSamples), parsedSamplesString(gotSamples)))
|
||||
}
|
||||
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="vmagent2", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent2
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
41
app/vmalert/unittest/testdata/failed-test.yaml
vendored
41
app/vmalert/unittest/testdata/failed-test.yaml
vendored
@@ -1,41 +0,0 @@
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
name: "Failing test"
|
||||
input_series:
|
||||
- series: test
|
||||
values: "0"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: test
|
||||
eval_time: 0m
|
||||
exp_samples:
|
||||
- value: 0
|
||||
labels: test
|
||||
|
||||
# will fail because there is no "Test" group and rule defined
|
||||
alert_rule_test:
|
||||
- eval_time: 0m
|
||||
groupname: Test
|
||||
alertname: Test
|
||||
exp_alerts:
|
||||
- exp_labels: {}
|
||||
|
||||
- interval: 1m
|
||||
name: Failing alert test
|
||||
input_series:
|
||||
- series: 'up{job="test"}'
|
||||
values: 0x10
|
||||
|
||||
alert_rule_test:
|
||||
# will fail because the rule is firing
|
||||
- eval_time: 5m
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
# will fail because groupname is missing
|
||||
- eval_time: 5m
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts: []
|
||||
30
app/vmalert/unittest/testdata/long-period.yaml
vendored
30
app/vmalert/unittest/testdata/long-period.yaml
vendored
@@ -1,30 +0,0 @@
|
||||
# can be executed successfully but will take more than 1 minute
|
||||
# not included in unit test now
|
||||
evaluation_interval: 100d
|
||||
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
tests:
|
||||
- interval: 1d
|
||||
input_series:
|
||||
- series: test
|
||||
# Max time in time.Duration is 106751d from 1970 (2^63/10^9), i.e. 2262.
|
||||
# But VictoriaMetrics supports maxTimestamp value +2 days from now. see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/827.
|
||||
# We input series to 2024-01-01T00:00:00 here.
|
||||
values: "0+1x19723"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: timestamp(test)
|
||||
eval_time: 0m
|
||||
exp_samples:
|
||||
- value: 0
|
||||
- expr: test
|
||||
eval_time: 100d
|
||||
exp_samples:
|
||||
- labels: test
|
||||
value: 100
|
||||
- expr: timestamp(test)
|
||||
eval_time: 19000d
|
||||
exp_samples:
|
||||
- value: 1641600000 # 19000d -> seconds.
|
||||
39
app/vmalert/unittest/testdata/rules.yaml
vendored
39
app/vmalert/unittest/testdata/rules.yaml
vendored
@@ -1,39 +0,0 @@
|
||||
groups:
|
||||
- name: group1
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- alert: AlwaysFiring
|
||||
expr: 1
|
||||
- alert: SameAlertNameWithDifferentGroup
|
||||
expr: absent(test)
|
||||
for: 1m
|
||||
|
||||
- name: group2
|
||||
rules:
|
||||
- record: t1
|
||||
expr: test
|
||||
- record: job:test:count_over_time1m
|
||||
expr: sum without(instance) (count_over_time(test[1m]))
|
||||
- record: suquery_interval_test
|
||||
expr: count_over_time(up[5m:])
|
||||
|
||||
- alert: SameAlertNameWithDifferentGroup
|
||||
expr: absent(test)
|
||||
for: 5m
|
||||
|
||||
- name: group3
|
||||
rules:
|
||||
- record: t2
|
||||
expr: t1
|
||||
|
||||
- name: group4
|
||||
rules:
|
||||
- record: t3
|
||||
expr: t1
|
||||
99
app/vmalert/unittest/testdata/test1.yaml
vendored
99
app/vmalert/unittest/testdata/test1.yaml
vendored
@@ -1,99 +0,0 @@
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
group_eval_order: ["group4", "group2", "group3"]
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
name: "basic test"
|
||||
input_series:
|
||||
- series: "test"
|
||||
values: "_x5 1x5 _ stale"
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 1m
|
||||
groupname: group1
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts:
|
||||
- {}
|
||||
- eval_time: 1m
|
||||
groupname: group2
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts: []
|
||||
- eval_time: 6m
|
||||
groupname: group1
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts: []
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: test
|
||||
eval_time: 11m
|
||||
exp_samples:
|
||||
- labels: '{__name__="test"}'
|
||||
value: 1
|
||||
- expr: test
|
||||
eval_time: 12m
|
||||
exp_samples: []
|
||||
|
||||
- interval: 1m
|
||||
name: "basic test2"
|
||||
input_series:
|
||||
- series: 'up{job="vmagent1", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
- series: "test"
|
||||
values: "0+1x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: count(ALERTS) by (alertgroup, alertname, alertstate)
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{alertgroup="group1", alertname="AlwaysFiring", alertstate="firing"}'
|
||||
value: 1
|
||||
- labels: '{alertgroup="group1", alertname="InstanceDown", alertstate="pending"}'
|
||||
value: 1
|
||||
- expr: t1
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- value: 4
|
||||
labels: '{__name__="t1", datacenter="dc-123"}'
|
||||
- expr: t2
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- value: 4
|
||||
labels: '{__name__="t2", datacenter="dc-123"}'
|
||||
- expr: t3
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
# t3 is 3 instead of 4 because group4 (which records t3) is evaluated before group2 (which records t1)
|
||||
- value: 3
|
||||
labels: '{__name__="t3", datacenter="dc-123"}'
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 10m
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent1
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent1 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: alerts
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
46
app/vmalert/unittest/testdata/test2.yaml
vendored
46
app/vmalert/unittest/testdata/test2.yaml
vendored
@@ -1,46 +0,0 @@
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="vmagent2", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent2
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
@@ -1,83 +0,0 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
)
|
||||
|
||||
// parsedSample is a sample with parsed Labels
|
||||
type parsedSample struct {
|
||||
Labels datasource.Labels
|
||||
Value float64
|
||||
}
|
||||
|
||||
func (ps *parsedSample) String() string {
|
||||
return ps.Labels.String() + " " + strconv.FormatFloat(ps.Value, 'E', -1, 64)
|
||||
}
|
||||
|
||||
func parsedSamplesString(pss []parsedSample) string {
|
||||
if len(pss) == 0 {
|
||||
return "nil"
|
||||
}
|
||||
s := pss[0].String()
|
||||
for _, ps := range pss[1:] {
|
||||
s += ", " + ps.String()
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// LabelAndAnnotation holds labels and annotations
|
||||
type LabelAndAnnotation struct {
|
||||
Labels datasource.Labels
|
||||
Annotations datasource.Labels
|
||||
}
|
||||
|
||||
func (la *LabelAndAnnotation) String() string {
|
||||
return "Labels:" + la.Labels.String() + "\nAnnotations:" + la.Annotations.String()
|
||||
}
|
||||
|
||||
// LabelsAndAnnotations is collection of LabelAndAnnotation
|
||||
type LabelsAndAnnotations []LabelAndAnnotation
|
||||
|
||||
func (la LabelsAndAnnotations) Len() int { return len(la) }
|
||||
|
||||
func (la LabelsAndAnnotations) Swap(i, j int) { la[i], la[j] = la[j], la[i] }
|
||||
func (la LabelsAndAnnotations) Less(i, j int) bool {
|
||||
diff := datasource.LabelCompare(la[i].Labels, la[j].Labels)
|
||||
if diff != 0 {
|
||||
return diff < 0
|
||||
}
|
||||
return datasource.LabelCompare(la[i].Annotations, la[j].Annotations) < 0
|
||||
}
|
||||
|
||||
func (la LabelsAndAnnotations) String() string {
|
||||
if len(la) == 0 {
|
||||
return "[]"
|
||||
}
|
||||
s := "[\n0:" + IndentLines("\n"+la[0].String(), " ")
|
||||
for i, l := range la[1:] {
|
||||
s += ",\n" + fmt.Sprintf("%d", i+1) + ":" + IndentLines("\n"+l.String(), " ")
|
||||
}
|
||||
s += "\n]"
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// IndentLines inserts indent at the start of every line of lines except the
// first one. Line breaks are kept as they are, so input ending in "\n"
// produces a final line that consists of indent only.
func IndentLines(lines, indent string) string {
	// Putting indent after each newline is the same as prefixing every
	// line that follows a newline, i.e. every line but the first.
	return strings.ReplaceAll(lines, "\n", "\n"+indent)
}
|
||||
@@ -138,27 +138,8 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
return true
|
||||
|
||||
default:
|
||||
// Support of deprecated links:
|
||||
// * /api/v1/<groupID>/<alertID>/status
|
||||
// * <groupID>/<alertID>/status
|
||||
// TODO: to remove in next versions
|
||||
|
||||
if !strings.HasSuffix(r.URL.Path, "/status") {
|
||||
httpserver.Errorf(w, r, "unsupported path requested: %q ", r.URL.Path)
|
||||
return false
|
||||
}
|
||||
alert, err := rh.alertByPath(strings.TrimPrefix(r.URL.Path, "/api/v1/"))
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
|
||||
redirectURL := alert.WebLink()
|
||||
if strings.HasPrefix(r.URL.Path, "/api/v1/") {
|
||||
redirectURL = alert.APILink()
|
||||
}
|
||||
httpserver.Redirect(w, "/"+redirectURL)
|
||||
return true
|
||||
httpserver.Errorf(w, r, "unsupported path requested: %q ", r.URL.Path)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -302,41 +283,6 @@ func (rh *requestHandler) listAlerts() ([]byte, error) {
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (rh *requestHandler) alertByPath(path string) (*APIAlert, error) {
|
||||
if strings.HasPrefix(path, "/vmalert") {
|
||||
path = strings.TrimLeft(path, "/vmalert")
|
||||
}
|
||||
parts := strings.SplitN(strings.TrimLeft(path, "/"), "/", -1)
|
||||
if len(parts) != 3 {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`path %q cointains /status suffix but doesn't match pattern "/groupID/alertID/status"`, path),
|
||||
StatusCode: http.StatusBadRequest,
|
||||
}
|
||||
}
|
||||
groupID, err := uint64FromPath(parts[0])
|
||||
if err != nil {
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %w`, err))
|
||||
}
|
||||
alertID, err := uint64FromPath(parts[1])
|
||||
if err != nil {
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %w`, err))
|
||||
}
|
||||
resp, err := rh.m.AlertAPI(groupID, alertID)
|
||||
if err != nil {
|
||||
return nil, errResponse(err, http.StatusNotFound)
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
// uint64FromPath parses a path segment as a base-10 unsigned 64-bit integer,
// ignoring any trailing slashes.
//
// It returns the error from strconv.ParseUint when the segment is empty,
// is not a decimal number, or does not fit into 64 bits.
func uint64FromPath(path string) (uint64, error) {
	s := strings.TrimRight(path, "/")
	// bitSize must be 64 to match the return type; bitSize 0 would limit
	// the accepted range to the platform's uint, which is 32 bits wide on
	// 32-bit builds.
	return strconv.ParseUint(s, 10, 64)
}
|
||||
|
||||
func badRequest(err error) *httpserver.ErrorWithStatusCode {
|
||||
return errResponse(err, http.StatusBadRequest)
|
||||
}
|
||||
|
||||
func errResponse(err error, sc int) *httpserver.ErrorWithStatusCode {
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: err,
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
|
||||
|
||||
{% func Welcome(r *http.Request) %}
|
||||
{%= tpl.Header(r, navItems, "vmalert", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "vmalert", getLastConfigError()) %}
|
||||
<p>
|
||||
API:<br>
|
||||
{% for _, p := range apiLinks %}
|
||||
@@ -40,7 +40,7 @@ btn-primary
|
||||
|
||||
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "Groups", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
|
||||
{%code
|
||||
filter := r.URL.Query().Get("filter")
|
||||
rOk := make(map[string]int)
|
||||
@@ -168,7 +168,7 @@ btn-primary
|
||||
|
||||
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "Alerts", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
|
||||
{% if len(groupAlerts) > 0 %}
|
||||
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
||||
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
||||
@@ -255,7 +255,7 @@ btn-primary
|
||||
{% endfunc %}
|
||||
|
||||
{% func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Target) %}
|
||||
{%= tpl.Header(r, navItems, "Notifiers", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "Notifiers", getLastConfigError()) %}
|
||||
{% if len(targets) > 0 %}
|
||||
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
||||
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
||||
@@ -312,7 +312,7 @@ btn-primary
|
||||
|
||||
{% func Alert(r *http.Request, alert *APIAlert) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
|
||||
{%code
|
||||
var labelKeys []string
|
||||
for k := range alert.Labels {
|
||||
@@ -399,7 +399,7 @@ btn-primary
|
||||
|
||||
{% func RuleDetails(r *http.Request, rule APIRule) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
|
||||
{%code
|
||||
var labelKeys []string
|
||||
for k := range rule.Labels {
|
||||
|
||||
@@ -34,7 +34,7 @@ func StreamWelcome(qw422016 *qt422016.Writer, r *http.Request) {
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:15
|
||||
tpl.StreamHeader(qw422016, r, navItems, "vmalert", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "vmalert", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:15
|
||||
qw422016.N().S(`
|
||||
<p>
|
||||
@@ -207,7 +207,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:43
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Groups", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Groups", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:43
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
@@ -647,7 +647,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:171
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Alerts", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Alerts", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:171
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
@@ -922,7 +922,7 @@ func StreamListTargets(qw422016 *qt422016.Writer, r *http.Request, targets map[n
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:258
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:258
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
@@ -1102,7 +1102,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:315
|
||||
tpl.StreamHeader(qw422016, r, navItems, "", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:315
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
@@ -1311,7 +1311,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:402
|
||||
tpl.StreamHeader(qw422016, r, navItems, "", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:402
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
||||
@@ -145,23 +145,6 @@ func TestHandler(t *testing.T) {
|
||||
t.Errorf("expected 1 group got %d", length)
|
||||
}
|
||||
})
|
||||
|
||||
// check deprecated links support
|
||||
// TODO: remove as soon as deprecated links removed
|
||||
t.Run("/api/v1/0/0/status", func(t *testing.T) {
|
||||
alert := &APIAlert{}
|
||||
getResp(ts.URL+"/api/v1/0/0/status", alert, 200)
|
||||
expAlert := ar.newAlertAPI(*ar.alerts[0])
|
||||
if !reflect.DeepEqual(alert, expAlert) {
|
||||
t.Errorf("expected %v is equal to %v", alert, expAlert)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/0/1/status", func(t *testing.T) {
|
||||
getResp(ts.URL+"/api/v1/0/1/status", nil, 404)
|
||||
})
|
||||
t.Run("/api/v1/1/0/status", func(t *testing.T) {
|
||||
getResp(ts.URL+"/api/v1/1/0/status", nil, 404)
|
||||
})
|
||||
}
|
||||
|
||||
func TestEmptyResponse(t *testing.T) {
|
||||
|
||||
@@ -351,6 +351,8 @@ See the docs at https://docs.victoriametrics.com/vmauth.html .
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
-failTimeout duration
|
||||
Sets a delay period for load balancing to skip a malfunctioning backend. (defaults 3s)
|
||||
-flagsAuthKey string
|
||||
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-fs.disableMmap
|
||||
|
||||
@@ -134,7 +134,7 @@ func (bu *backendURL) isBroken() bool {
|
||||
}
|
||||
|
||||
func (bu *backendURL) setBroken() {
|
||||
deadline := fasttime.UnixTimestamp() + 3
|
||||
deadline := fasttime.UnixTimestamp() + uint64((*failTimeout).Seconds())
|
||||
atomic.StoreUint64(&bu.brokenDeadline, deadline)
|
||||
}
|
||||
|
||||
|
||||
@@ -41,6 +41,7 @@ var (
|
||||
reloadAuthKey = flag.String("reloadAuthKey", "", "Auth key for /-/reload http endpoint. It must be passed as authKey=...")
|
||||
logInvalidAuthTokens = flag.Bool("logInvalidAuthTokens", false, "Whether to log requests with invalid auth tokens. "+
|
||||
`Such requests are always counted at vmauth_http_request_errors_total{reason="invalid_auth_token"} metric, which is exposed at /metrics page`)
|
||||
failTimeout = flag.Duration("failTimeout", 3*time.Second, "Sets a delay period for load balancing to skip a malfunctioning backend.")
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
@@ -325,8 +325,9 @@ const (
|
||||
vmNativeFilterTimeEnd = "vm-native-filter-time-end"
|
||||
vmNativeStepInterval = "vm-native-step-interval"
|
||||
|
||||
vmNativeDisableHTTPKeepAlive = "vm-native-disable-http-keep-alive"
|
||||
vmNativeDisableRetries = "vm-native-disable-retries"
|
||||
vmNativeDisableBinaryProtocol = "vm-native-disable-binary-protocol"
|
||||
vmNativeDisableHTTPKeepAlive = "vm-native-disable-http-keep-alive"
|
||||
vmNativeDisableRetries = "vm-native-disable-retries"
|
||||
|
||||
vmNativeSrcAddr = "vm-native-src-addr"
|
||||
vmNativeSrcUser = "vm-native-src-user"
|
||||
@@ -450,6 +451,14 @@ var (
|
||||
Usage: "Defines whether to disable retries with backoff policy for migration process",
|
||||
Value: false,
|
||||
},
|
||||
&cli.BoolFlag{
	Name: vmNativeDisableBinaryProtocol,
	// Every fragment but the last ends with a space so that the
	// concatenated sentences do not run together in the help output.
	Usage: "Whether to use https://docs.victoriametrics.com/#how-to-export-data-in-json-line-format " +
		"instead of https://docs.victoriametrics.com/#how-to-export-data-in-native-format API. " +
		"Binary export/import API protocol implies less network and resource usage, as it transfers compressed binary data blocks. " +
		"Non-binary export/import API is less efficient, but supports deduplication if it is configured on vm-native-src-addr side.",
	Value: false,
},
|
||||
}
|
||||
)
|
||||
|
||||
@@ -468,6 +477,7 @@ const (
|
||||
remoteReadHTTPTimeout = "remote-read-http-timeout"
|
||||
remoteReadHeaders = "remote-read-headers"
|
||||
remoteReadInsecureSkipVerify = "remote-read-insecure-skip-verify"
|
||||
remoteReadDisablePathAppend = "remote-read-disable-path-append"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -544,6 +554,11 @@ var (
|
||||
Usage: "Whether to skip TLS certificate verification when connecting to the remote read address",
|
||||
Value: false,
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: remoteReadDisablePathAppend,
|
||||
Usage: "Whether to disable automatic appending of the path to the remote storage.",
|
||||
Value: true,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -133,6 +133,7 @@ func main() {
|
||||
LabelName: c.String(remoteReadFilterLabel),
|
||||
LabelValue: c.String(remoteReadFilterLabelValue),
|
||||
InsecureSkipVerify: c.Bool(remoteReadInsecureSkipVerify),
|
||||
DisablePathAppend: c.Bool(remoteReadDisablePathAppend),
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("error create remote read client: %s", err)
|
||||
@@ -254,6 +255,7 @@ func main() {
|
||||
cc: c.Int(vmConcurrency),
|
||||
disableRetries: c.Bool(vmNativeDisableRetries),
|
||||
isSilent: c.Bool(globalSilent),
|
||||
isNative: !c.Bool(vmNativeDisableBinaryProtocol),
|
||||
}
|
||||
return p.run(ctx)
|
||||
},
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -23,7 +24,6 @@ import (
|
||||
const (
|
||||
defaultReadTimeout = 5 * time.Minute
|
||||
remoteReadPath = "/api/v1/read"
|
||||
healthPath = "/-/healthy"
|
||||
)
|
||||
|
||||
// StreamCallback is a callback function for processing time series
|
||||
@@ -32,19 +32,22 @@ type StreamCallback func(series *vm.TimeSeries) error
|
||||
// Client is an HTTP client for reading
|
||||
// time series via remote read protocol.
|
||||
type Client struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
user string
|
||||
password string
|
||||
useStream bool
|
||||
headers []keyValue
|
||||
matchers []*prompb.LabelMatcher
|
||||
addr string
|
||||
disablePathAppend bool
|
||||
c *http.Client
|
||||
user string
|
||||
password string
|
||||
useStream bool
|
||||
headers []keyValue
|
||||
matchers []*prompb.LabelMatcher
|
||||
}
|
||||
|
||||
// Config is config for remote read.
|
||||
type Config struct {
|
||||
// Addr of remote storage
|
||||
Addr string
|
||||
// DisablePathAppend disable automatic appending of the remote read path
|
||||
DisablePathAppend bool
|
||||
// Timeout defines timeout for HTTP requests
|
||||
// made by remote read client
|
||||
Timeout time.Duration
|
||||
@@ -105,13 +108,15 @@ func NewClient(cfg Config) (*Client, error) {
|
||||
Timeout: cfg.Timeout,
|
||||
Transport: utils.Transport(cfg.Addr, cfg.InsecureSkipVerify),
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/"),
|
||||
user: cfg.Username,
|
||||
password: cfg.Password,
|
||||
useStream: cfg.UseStream,
|
||||
headers: headers,
|
||||
matchers: []*prompb.LabelMatcher{m},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/"),
|
||||
disablePathAppend: cfg.DisablePathAppend,
|
||||
user: cfg.Username,
|
||||
password: cfg.Password,
|
||||
useStream: cfg.UseStream,
|
||||
headers: headers,
|
||||
matchers: []*prompb.LabelMatcher{m},
|
||||
}
|
||||
|
||||
return c, nil
|
||||
}
|
||||
|
||||
@@ -154,27 +159,18 @@ func (c *Client) do(req *http.Request) (*http.Response, error) {
|
||||
return c.c.Do(req)
|
||||
}
|
||||
|
||||
// Ping checks the health of the read source
|
||||
func (c *Client) Ping() error {
|
||||
url := c.addr + healthPath
|
||||
req, err := http.NewRequest(http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot create request to %q: %s", url, err)
|
||||
}
|
||||
resp, err := c.do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("bad status code: %d", resp.StatusCode)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) fetch(ctx context.Context, data []byte, streamCb StreamCallback) error {
|
||||
r := bytes.NewReader(data)
|
||||
url := c.addr + remoteReadPath
|
||||
req, err := http.NewRequest(http.MethodPost, url, r)
|
||||
// by default, we are using a common remote read path
|
||||
u, err := url.JoinPath(c.addr, remoteReadPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error create url from addr %s and default remote read path %s", c.addr, remoteReadPath)
|
||||
}
|
||||
// we should use full address from the remote-read-src-addr flag
|
||||
if c.disablePathAppend {
|
||||
u = c.addr
|
||||
}
|
||||
req, err := http.NewRequest(http.MethodPost, u, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
@@ -10,6 +10,8 @@ const (
|
||||
StepMonth string = "month"
|
||||
// StepDay represents a one day interval
|
||||
StepDay string = "day"
|
||||
// StepWeek represents a one week interval
|
||||
StepWeek string = "week"
|
||||
// StepHour represents a one hour interval
|
||||
StepHour string = "hour"
|
||||
// StepMinute represents a one minute interval
|
||||
@@ -40,6 +42,10 @@ func SplitDateRange(start, end time.Time, step string) ([][]time.Time, error) {
|
||||
nextStep = func(t time.Time) (time.Time, time.Time) {
|
||||
return t, t.AddDate(0, 0, 1)
|
||||
}
|
||||
case StepWeek:
|
||||
nextStep = func(t time.Time) (time.Time, time.Time) {
|
||||
return t, t.Add(7 * 24 * time.Hour)
|
||||
}
|
||||
case StepHour:
|
||||
nextStep = func(t time.Time) (time.Time, time.Time) {
|
||||
return t, t.Add(time.Hour * 1)
|
||||
|
||||
@@ -170,6 +170,82 @@ func Test_splitDateRange(t *testing.T) {
|
||||
},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "week chunking with not full week",
|
||||
args: args{
|
||||
start: "2023-07-30T00:00:00Z",
|
||||
end: "2023-08-05T23:59:59.999999999Z",
|
||||
granularity: StepWeek,
|
||||
},
|
||||
want: []testTimeRange{
|
||||
{
|
||||
"2023-07-30T00:00:00Z",
|
||||
"2023-08-05T23:59:59.999999999Z",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "week chunking with start of the week and end of the week",
|
||||
args: args{
|
||||
start: "2023-07-30T00:00:00Z",
|
||||
end: "2023-08-06T00:00:00Z",
|
||||
granularity: StepWeek,
|
||||
},
|
||||
want: []testTimeRange{
|
||||
{
|
||||
"2023-07-30T00:00:00Z",
|
||||
"2023-08-06T00:00:00Z",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "week chunking with next one day week",
|
||||
args: args{
|
||||
start: "2023-07-30T00:00:00Z",
|
||||
end: "2023-08-07T01:12:00Z",
|
||||
granularity: StepWeek,
|
||||
},
|
||||
want: []testTimeRange{
|
||||
{
|
||||
"2023-07-30T00:00:00Z",
|
||||
"2023-08-06T00:00:00Z",
|
||||
},
|
||||
{
|
||||
"2023-08-06T00:00:00Z",
|
||||
"2023-08-07T01:12:00Z",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "week chunking with month and not full week representation",
|
||||
args: args{
|
||||
start: "2023-07-30T00:00:00Z",
|
||||
end: "2023-09-01T01:12:00Z",
|
||||
granularity: StepWeek,
|
||||
},
|
||||
want: []testTimeRange{
|
||||
{
|
||||
"2023-07-30T00:00:00Z",
|
||||
"2023-08-06T00:00:00Z",
|
||||
},
|
||||
{
|
||||
"2023-08-06T00:00:00Z",
|
||||
"2023-08-13T00:00:00Z",
|
||||
},
|
||||
{
|
||||
"2023-08-13T00:00:00Z",
|
||||
"2023-08-20T00:00:00Z",
|
||||
},
|
||||
{
|
||||
"2023-08-20T00:00:00Z",
|
||||
"2023-08-27T00:00:00Z",
|
||||
},
|
||||
{
|
||||
"2023-08-27T00:00:00Z",
|
||||
"2023-09-01T01:12:00Z",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
|
||||
@@ -34,11 +34,12 @@ type vmNativeProcessor struct {
|
||||
cc int
|
||||
disableRetries bool
|
||||
isSilent bool
|
||||
isNative bool
|
||||
}
|
||||
|
||||
const (
|
||||
nativeExportAddr = "api/v1/export/native"
|
||||
nativeImportAddr = "api/v1/import/native"
|
||||
nativeExportAddr = "api/v1/export"
|
||||
nativeImportAddr = "api/v1/import"
|
||||
nativeWithBackoffTpl = `{{ blue "%s:" }} {{ counters . }} {{ bar . "[" "█" (cycle . "█") "▒" "]" }} {{ percent . }}`
|
||||
nativeSingleProcessTpl = `Total: {{counters . }} {{ cycle . "↖" "↗" "↘" "↙" }} Speed: {{speed . }} {{string . "suffix"}}`
|
||||
)
|
||||
@@ -159,9 +160,14 @@ func (p *vmNativeProcessor) runSingle(ctx context.Context, f native.Filter, srcU
|
||||
|
||||
func (p *vmNativeProcessor) runBackfilling(ctx context.Context, tenantID string, ranges [][]time.Time, silent bool) error {
|
||||
exportAddr := nativeExportAddr
|
||||
importAddr := nativeImportAddr
|
||||
if p.isNative {
|
||||
exportAddr += "/native"
|
||||
importAddr += "/native"
|
||||
}
|
||||
srcURL := fmt.Sprintf("%s/%s", p.src.Addr, exportAddr)
|
||||
|
||||
importAddr, err := vm.AddExtraLabelsToImportPath(nativeImportAddr, p.dst.ExtraLabels)
|
||||
importAddr, err := vm.AddExtraLabelsToImportPath(importAddr, p.dst.ExtraLabels)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to add labels to import path: %s", err)
|
||||
}
|
||||
|
||||
@@ -228,6 +228,7 @@ func Test_vmNativeProcessor_run(t *testing.T) {
|
||||
interCluster: tt.fields.interCluster,
|
||||
cc: tt.fields.cc,
|
||||
isSilent: tt.args.silent,
|
||||
isNative: true,
|
||||
}
|
||||
|
||||
if err := p.run(tt.args.ctx); (err != nil) != tt.wantErr {
|
||||
|
||||
@@ -33,7 +33,7 @@ var (
|
||||
)
|
||||
|
||||
var (
|
||||
saCfgReloaderStopCh chan struct{}
|
||||
saCfgReloaderStopCh = make(chan struct{})
|
||||
saCfgReloaderWG sync.WaitGroup
|
||||
|
||||
saCfgReloads = metrics.NewCounter(`vminsert_streamagg_config_reloads_total`)
|
||||
@@ -62,7 +62,6 @@ func CheckStreamAggrConfig() error {
|
||||
//
|
||||
// MustStopStreamAggr must be called when stream aggr is no longer needed.
|
||||
func InitStreamAggr() {
|
||||
saCfgReloaderStopCh = make(chan struct{})
|
||||
if *streamAggrConfig == "" {
|
||||
return
|
||||
}
|
||||
|
||||
@@ -5826,6 +5826,17 @@ func TestExecSuccess(t *testing.T) {
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`share_eq_over_time`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `share_eq_over_time(round(5*rand(0))[200s:10s], 1)`
|
||||
r := netstorage.Result{
|
||||
MetricName: metricNameExpected,
|
||||
Values: []float64{0.1, 0.2, 0.25, 0.1, 0.3, 0.3},
|
||||
Timestamps: timestampsExpected,
|
||||
}
|
||||
resultExpected := []netstorage.Result{r}
|
||||
f(q, resultExpected)
|
||||
})
|
||||
t.Run(`count_gt_over_time`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
q := `count_gt_over_time(rand(0)[200s:10s], 0.7)`
|
||||
|
||||
@@ -78,6 +78,7 @@ var rollupFuncs = map[string]newRollupFunc{
|
||||
"scrape_interval": newRollupFuncOneArg(rollupScrapeInterval),
|
||||
"share_gt_over_time": newRollupShareGT,
|
||||
"share_le_over_time": newRollupShareLE,
|
||||
"share_eq_over_time": newRollupShareEQ,
|
||||
"stale_samples_over_time": newRollupFuncOneArg(rollupStaleSamples),
|
||||
"stddev_over_time": newRollupFuncOneArg(rollupStddev),
|
||||
"stdvar_over_time": newRollupFuncOneArg(rollupStdvar),
|
||||
@@ -1106,6 +1107,10 @@ func countFilterGT(values []float64, gt float64) int {
|
||||
return n
|
||||
}
|
||||
|
||||
func newRollupShareEQ(args []interface{}) (rollupFunc, error) {
|
||||
return newRollupShareFilter(args, countFilterEQ)
|
||||
}
|
||||
|
||||
func countFilterEQ(values []float64, eq float64) int {
|
||||
n := 0
|
||||
for _, v := range values {
|
||||
|
||||
@@ -261,6 +261,25 @@ func TestRollupShareGTOverTime(t *testing.T) {
|
||||
f(1000, 0)
|
||||
}
|
||||
|
||||
func TestRollupShareEQOverTime(t *testing.T) {
|
||||
f := func(eq, vExpected float64) {
|
||||
t.Helper()
|
||||
eqs := []*timeseries{{
|
||||
Values: []float64{eq},
|
||||
Timestamps: []int64{123},
|
||||
}}
|
||||
var me metricsql.MetricExpr
|
||||
args := []interface{}{&metricsql.RollupExpr{Expr: &me}, eqs}
|
||||
testRollupFunc(t, "share_eq_over_time", args, &me, vExpected)
|
||||
}
|
||||
|
||||
f(-123, 0)
|
||||
f(34, 0.3333333333333333)
|
||||
f(44, 0.16666666666666666)
|
||||
f(123, 0.08333333333333333)
|
||||
f(1000, 0)
|
||||
}
|
||||
|
||||
func TestRollupCountLEOverTime(t *testing.T) {
|
||||
f := func(le, vExpected float64) {
|
||||
t.Helper()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM golang:1.20.6 as build-web-stage
|
||||
FROM golang:1.20.7 as build-web-stage
|
||||
COPY build /build
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
@@ -24,6 +24,7 @@ export interface TracingData {
|
||||
export interface QueryStats {
|
||||
seriesFetched?: string;
|
||||
resultLength?: number;
|
||||
isPartial?: boolean;
|
||||
}
|
||||
|
||||
export interface Logs {
|
||||
|
||||
@@ -7,6 +7,7 @@ import "./style.scss";
|
||||
import { QueryStats } from "../../../api/types";
|
||||
import Tooltip from "../../Main/Tooltip/Tooltip";
|
||||
import { WarningIcon } from "../../Main/Icons";
|
||||
import { partialWarning, seriesFetchedWarning } from "./warningText";
|
||||
|
||||
export interface QueryEditorProps {
|
||||
onChange: (query: string) => void;
|
||||
@@ -39,7 +40,17 @@ const QueryEditor: FC<QueryEditorProps> = ({
|
||||
|
||||
const [openAutocomplete, setOpenAutocomplete] = useState(false);
|
||||
const autocompleteAnchorEl = useRef<HTMLDivElement>(null);
|
||||
const showSeriesFetchedWarning = stats?.seriesFetched === "0" && !stats.resultLength;
|
||||
|
||||
const warnings = [
|
||||
{
|
||||
show: stats?.seriesFetched === "0" && !stats.resultLength,
|
||||
text: seriesFetchedWarning
|
||||
},
|
||||
{
|
||||
show: stats?.isPartial,
|
||||
text: partialWarning
|
||||
}
|
||||
].filter((warning) => warning.show);
|
||||
|
||||
const handleSelect = (val: string) => {
|
||||
onChange(val);
|
||||
@@ -108,17 +119,14 @@ const QueryEditor: FC<QueryEditorProps> = ({
|
||||
onFoundOptions={handleChangeFoundOptions}
|
||||
/>
|
||||
)}
|
||||
{showSeriesFetchedWarning && (
|
||||
{!!warnings.length && (
|
||||
<div className="vm-query-editor-warning">
|
||||
<Tooltip
|
||||
placement="bottom-right"
|
||||
title={(
|
||||
<span className="vm-query-editor-warning__tooltip">
|
||||
{`No match!
|
||||
This query hasn't selected any time series from database.
|
||||
Either the requested metrics are missing in the database,
|
||||
or there is a typo in series selector.`}
|
||||
</span>
|
||||
<div className="vm-query-editor-warning__tooltip">
|
||||
{warnings.map((warning, index) => <p key={index}>{warning.text}</p>)}
|
||||
</div>
|
||||
)}
|
||||
>
|
||||
<WarningIcon/>
|
||||
|
||||
@@ -22,6 +22,14 @@
|
||||
|
||||
&__tooltip {
|
||||
white-space: pre-line;
|
||||
|
||||
p {
|
||||
margin-bottom: $padding-small;
|
||||
|
||||
&:last-child {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
export const seriesFetchedWarning = `No match!
|
||||
This query hasn't selected any time series from database.
|
||||
Either the requested metrics are missing in the database,
|
||||
or there is a typo in series selector.`;
|
||||
|
||||
export const partialWarning = `The shown results are marked as PARTIAL.
|
||||
The result is marked as partial if one or more vmstorage nodes failed to respond to the query.`;
|
||||
@@ -103,6 +103,7 @@ export const useFetchQuery = ({
|
||||
if (response.ok) {
|
||||
setQueryStats(prev => [...prev, {
|
||||
...resp?.stats,
|
||||
isPartial: resp?.isPartial,
|
||||
resultLength: resp.data.result.length,
|
||||
}]);
|
||||
setQueryErrors(prev => [...prev, ""]);
|
||||
|
||||
@@ -98,7 +98,7 @@
|
||||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "sum(vl_rows)",
|
||||
"expr": "sum(vl_storage_rows)",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
@@ -776,4 +776,4 @@
|
||||
"uid": "OqPIZTX4z",
|
||||
"version": 4,
|
||||
"weekStart": ""
|
||||
}
|
||||
}
|
||||
|
||||
@@ -624,7 +624,7 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"exemplar": true,
|
||||
"expr": "sum(vm_data_size_bytes{job=~\"$job_storage\"}) / sum(vm_rows{job=~\"$job_storage\"})",
|
||||
"expr": "sum(vm_data_size_bytes{job=~\"$job_storage\"}) / sum(vm_rows{job=~\"$job_storage\", type!~\"indexdb.*\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
|
||||
@@ -4434,7 +4434,7 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) \n/ sum(vm_rows{job=~\"$job\", instance=~\"$instance\"})",
|
||||
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) \n/ sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})",
|
||||
"format": "time_series",
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "9.2.6"
|
||||
"version": "9.2.7"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
@@ -182,7 +182,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -249,7 +249,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -310,7 +310,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -379,7 +379,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -451,7 +451,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -515,7 +515,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -606,7 +606,7 @@
|
||||
},
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -2372,8 +2372,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2475,8 +2474,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2581,8 +2579,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "transparent",
|
||||
"value": null
|
||||
"color": "transparent"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2687,8 +2684,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "transparent",
|
||||
"value": null
|
||||
"color": "transparent"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2792,8 +2788,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2898,8 +2893,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -3009,8 +3003,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -3113,8 +3106,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -3188,8 +3180,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4054,7 +4045,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4070,7 +4062,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 46
|
||||
"y": 38
|
||||
},
|
||||
"id": 73,
|
||||
"links": [],
|
||||
@@ -4170,7 +4162,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4186,7 +4179,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 46
|
||||
"y": 38
|
||||
},
|
||||
"id": 77,
|
||||
"links": [],
|
||||
@@ -4233,6 +4226,123 @@
|
||||
],
|
||||
"title": "Error rate ($instance)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"description": "Shows how many concurrent inserts are taking place.\n\nIf the number of concurrent inserts is hitting the `limit` or is close to the `limit` constantly - it might be a sign of a resource shortage.\n\n If vmagent's CPU usage and remote write connection saturation are at normal level, it might be that `-maxConcurrentInserts` cmd-line flag needs to be increased.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"links": [],
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 46
|
||||
},
|
||||
"id": 130,
|
||||
"links": [],
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"lastNotNull",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": true,
|
||||
"expr": "max_over_time(vm_concurrent_insert_current{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])",
|
||||
"interval": "",
|
||||
"legendFormat": "{{instance}} ({{job}})",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": true,
|
||||
"expr": "min(vm_concurrent_insert_capacity{job=~\"$job\", instance=~\"$instance\"}) by(job)",
|
||||
"interval": "",
|
||||
"legendFormat": "limit ({{job}})",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Concurrent inserts ($instance)",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
@@ -4310,7 +4420,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4326,7 +4437,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 47
|
||||
"y": 55
|
||||
},
|
||||
"id": 60,
|
||||
"options": {
|
||||
@@ -4412,7 +4523,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4428,7 +4540,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 47
|
||||
"y": 55
|
||||
},
|
||||
"id": 66,
|
||||
"options": {
|
||||
@@ -4514,7 +4626,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4530,7 +4643,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 55
|
||||
"y": 63
|
||||
},
|
||||
"id": 61,
|
||||
"options": {
|
||||
@@ -4616,7 +4729,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4632,7 +4746,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 55
|
||||
"y": 63
|
||||
},
|
||||
"id": 65,
|
||||
"options": {
|
||||
@@ -4717,7 +4831,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "transparent"
|
||||
"color": "transparent",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4733,7 +4848,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 63
|
||||
"y": 71
|
||||
},
|
||||
"id": 88,
|
||||
"options": {
|
||||
@@ -4815,7 +4930,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "transparent"
|
||||
"color": "transparent",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4831,7 +4947,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 63
|
||||
"y": 71
|
||||
},
|
||||
"id": 84,
|
||||
"options": {
|
||||
@@ -4916,7 +5032,8 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "transparent"
|
||||
"color": "transparent",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -4932,7 +5049,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 71
|
||||
"y": 79
|
||||
},
|
||||
"id": 90,
|
||||
"options": {
|
||||
@@ -5605,8 +5722,8 @@
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "VictoriaMetrics",
|
||||
"value": "VictoriaMetrics"
|
||||
"text": "VictoriaMetrics - cluster",
|
||||
"value": "VictoriaMetrics - cluster"
|
||||
},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "9.2.6"
|
||||
"version": "9.2.7"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
@@ -204,7 +204,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -264,7 +264,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -324,7 +324,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -388,7 +388,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -452,7 +452,7 @@
|
||||
"text": {},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -546,7 +546,7 @@
|
||||
},
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -1182,7 +1182,7 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"pluginVersion": "9.2.7",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
@@ -1243,6 +1243,230 @@
|
||||
},
|
||||
"id": 43,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"description": "The percentage of used RSS memory\n\nIf you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"links": [],
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 33
|
||||
},
|
||||
"id": 37,
|
||||
"links": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
"title": "Profiling",
|
||||
"url": "https://docs.victoriametrics.com/vmagent.html#profiling"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"lastNotNull",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last *",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"}\n) by(job)",
|
||||
"interval": "",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Memory usage % ($instance)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"description": "Amount of used RSS memory\n\nIf you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"links": [],
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 33
|
||||
},
|
||||
"id": 57,
|
||||
"links": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
"title": "Profiling",
|
||||
"url": "https://docs.victoriametrics.com/vmagent.html#profiling"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"lastNotNull",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last *",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n) by(job)",
|
||||
"interval": "",
|
||||
"legendFormat": "{{job}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Memory usage ($instance)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@@ -1308,7 +1532,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 35
|
||||
"y": 41
|
||||
},
|
||||
"id": 35,
|
||||
"links": [
|
||||
@@ -1362,7 +1586,7 @@
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"description": "Amount of used memory\n\nResident memory shows share which can be freed by OS when needed.\n\nAnonymous shows share for memory allocated by the process itself. This share cannot be freed by the OS, so it must be taken into account by OOM killer.\n\nIf you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.",
|
||||
"description": "Shows the max number of CPU cores used by a `job` and the corresponding limit.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@@ -1414,7 +1638,7 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
@@ -1422,9 +1646,9 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 35
|
||||
"y": 41
|
||||
},
|
||||
"id": 37,
|
||||
"id": 56,
|
||||
"links": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
@@ -1447,7 +1671,7 @@
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "9.2.6",
|
||||
@@ -1459,14 +1683,32 @@
|
||||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"}\n) by(job)",
|
||||
"expr": "max(rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job)",
|
||||
"format": "time_series",
|
||||
"interval": "",
|
||||
"legendFormat": "__auto",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "{{job}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "min(process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}) by(job)",
|
||||
"format": "time_series",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "limit ({{job}})",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Memory usage % ($instance)",
|
||||
"title": "CPU usage ($instance)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
@@ -1535,7 +1777,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 43
|
||||
"y": 49
|
||||
},
|
||||
"id": 39,
|
||||
"links": [],
|
||||
@@ -1641,7 +1883,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 43
|
||||
"y": 49
|
||||
},
|
||||
"id": 41,
|
||||
"links": [],
|
||||
@@ -1754,8 +1996,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -1857,8 +2098,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -1960,8 +2200,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2064,8 +2303,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2164,8 +2402,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2292,8 +2529,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2395,8 +2631,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2497,8 +2732,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2620,8 +2854,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2713,8 +2946,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@@ -2776,9 +3008,9 @@
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "VictoriaMetrics",
|
||||
"value": "VictoriaMetrics"
|
||||
"selected": false,
|
||||
"text": "VictoriaMetrics - cluster",
|
||||
"value": "VictoriaMetrics - cluster"
|
||||
},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
@@ -2862,7 +3094,7 @@
|
||||
},
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"selected": false,
|
||||
"text": "5",
|
||||
"value": "5"
|
||||
},
|
||||
|
||||
@@ -9,22 +9,24 @@ ROOT_IMAGE ?= alpine:3.18.2
|
||||
# TODO: sync it with ROOT_IMAGE when it will be fixed in the new alpine releases
|
||||
CERTS_IMAGE := alpine:3.17.3
|
||||
|
||||
GO_BUILDER_IMAGE := golang:1.20.6-alpine
|
||||
GO_BUILDER_IMAGE := golang:1.20.7-alpine
|
||||
BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr :/ __)-1
|
||||
BASE_IMAGE := local/base:1.1.4-$(shell echo $(ROOT_IMAGE) | tr :/ __)-$(shell echo $(CERTS_IMAGE) | tr :/ __)
|
||||
DOCKER_BUILD ?= docker build
|
||||
DOCKER_COMPOSE ?= docker compose
|
||||
DOCKER_IMAGE_LS ?= docker image ls --format '{{.Repository}}:{{.Tag}}'
|
||||
|
||||
package-base:
|
||||
(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(BASE_IMAGE)$$') \
|
||||
|| docker build \
|
||||
($(DOCKER_IMAGE_LS) | grep -q '$(BASE_IMAGE)$$') \
|
||||
|| $(DOCKER_BUILD) \
|
||||
--build-arg root_image=$(ROOT_IMAGE) \
|
||||
--build-arg certs_image=$(CERTS_IMAGE) \
|
||||
--tag $(BASE_IMAGE) \
|
||||
deployment/docker/base
|
||||
|
||||
package-builder:
|
||||
(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(BUILDER_IMAGE)$$') \
|
||||
|| docker build \
|
||||
($(DOCKER_IMAGE_LS) | grep -q '$(BUILDER_IMAGE)$$') \
|
||||
|| $(DOCKER_BUILD) \
|
||||
--build-arg go_builder_image=$(GO_BUILDER_IMAGE) \
|
||||
--tag $(BUILDER_IMAGE) \
|
||||
deployment/docker/builder
|
||||
@@ -60,9 +62,9 @@ app-via-docker-windows: package-builder
|
||||
-o bin/$(APP_NAME)-windows$(APP_SUFFIX)-prod.exe $(PKG_PREFIX)/app/$(APP_NAME)
|
||||
|
||||
package-via-docker: package-base
|
||||
(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE)$$') || (\
|
||||
($(DOCKER_IMAGE_LS) | grep -q '$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE)$$') || (\
|
||||
$(MAKE) app-via-docker && \
|
||||
docker build \
|
||||
$(DOCKER_BUILD) \
|
||||
--build-arg src_binary=$(APP_NAME)$(APP_SUFFIX)-prod \
|
||||
--build-arg base_image=$(BASE_IMAGE) \
|
||||
--tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE) \
|
||||
@@ -170,7 +172,7 @@ package-via-docker-386:
|
||||
GOARCH=386 $(MAKE) package-via-docker-goarch-nocgo
|
||||
|
||||
remove-docker-images:
|
||||
docker image ls --format '{{.Repository}}\t{{.ID}}' | awk '{print $$2}' | xargs docker image rm -f
|
||||
docker image ls --format '{{.ID}}' | xargs docker image rm -f
|
||||
|
||||
docker-single-up:
|
||||
$(DOCKER_COMPOSE) -f deployment/docker/docker-compose.yml up -d
|
||||
|
||||
@@ -109,3 +109,22 @@ Grafana is provisioned by default with following entities:
|
||||
* `VictoriaMetrics - vmalert` dashboard
|
||||
|
||||
Remember to pick `VictoriaMetrics - cluster` datasource when viewing `VictoriaMetrics - cluster` dashboard.
|
||||
|
||||
## Alerts
|
||||
|
||||
See below a list of recommended alerting rules for various VictoriaMetrics components for running in production.
|
||||
Some of the alerting rules thresholds are just recommendations and could require an adjustment. The list
|
||||
of alerting rules is the following:
|
||||
* [alerts-health.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml):
|
||||
alerting rules related to all VictoriaMetrics components for tracking their "health" state;
|
||||
* [alerts.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml):
|
||||
alerting rules related to [single-server VictoriaMetrics](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html) installation;
|
||||
* [alerts-cluster.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml):
|
||||
alerting rules related to [cluster version of VictoriaMetrics](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html);
|
||||
* [alerts-vmagent.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmagent.yml):
|
||||
alerting rules related to [vmagent](https://docs.victoriametrics.com/vmagent.html) component;
|
||||
* [alerts-vmalert.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmalert.yml):
|
||||
alerting rules related to [vmalert](https://docs.victoriametrics.com/vmalert.html) component.
|
||||
|
||||
Please, also see [how to monitor](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#monitoring)
|
||||
VictoriaMetrics installations.
|
||||
|
||||
@@ -80,18 +80,6 @@ groups:
|
||||
description: "RPC errors are interconnection errors between cluster components.\n
|
||||
Possible reasons for errors are misconfiguration, overload, network blips or unreachable components."
|
||||
|
||||
- alert: ConcurrentFlushesHitTheLimit
|
||||
expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
show_at: dashboard
|
||||
annotations:
|
||||
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=133&var-instance={{ $labels.instance }}"
|
||||
summary: "vmstorage on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit"
|
||||
description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n
|
||||
When vmstorage constantly hits the limit it means that storage is overloaded and requires more CPU."
|
||||
|
||||
- alert: RowsRejectedOnIngestion
|
||||
expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
|
||||
for: 15m
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
# File contains default list of alerts for VM components.
|
||||
# File contains default list of alerts for various VM components.
|
||||
# The following alerts are recommended for use for any VM installation.
|
||||
# The alerts below are just recommendations and may require some updates
|
||||
# and threshold calibration according to every specific setup.
|
||||
groups:
|
||||
@@ -73,3 +74,16 @@ groups:
|
||||
description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).\n
|
||||
Make sure you're running VictoriaMetrics of v1.85.3 or higher.\n
|
||||
Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502"
|
||||
|
||||
- alert: ConcurrentInsertsHitTheLimit
|
||||
expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit"
|
||||
description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n
|
||||
Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.
|
||||
In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
|
||||
making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then
|
||||
it might be worth adjusting `-maxConcurrentInserts` cmd-line flag."
|
||||
|
||||
@@ -60,18 +60,6 @@ groups:
|
||||
description: "Requests to path {{ $labels.path }} are receiving errors.
|
||||
Please verify if clients are sending correct requests."
|
||||
|
||||
- alert: ConcurrentFlushesHitTheLimit
|
||||
expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
show_at: dashboard
|
||||
annotations:
|
||||
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=59&var-instance={{ $labels.instance }}"
|
||||
summary: "VictoriaMetrics on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit"
|
||||
description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n
|
||||
When VictoriaMetrics constantly hits the limit it means that storage is overloaded and requires more CPU."
|
||||
|
||||
- alert: RowsRejectedOnIngestion
|
||||
expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
|
||||
for: 15m
|
||||
|
||||
@@ -2,7 +2,7 @@ version: '3.5'
|
||||
services:
|
||||
vmagent:
|
||||
container_name: vmagent
|
||||
image: victoriametrics/vmagent:v1.92.0
|
||||
image: victoriametrics/vmagent:v1.92.1
|
||||
depends_on:
|
||||
- "vminsert"
|
||||
ports:
|
||||
@@ -32,7 +32,7 @@ services:
|
||||
|
||||
vmstorage-1:
|
||||
container_name: vmstorage-1
|
||||
image: victoriametrics/vmstorage:v1.92.0-cluster
|
||||
image: victoriametrics/vmstorage:v1.92.1-cluster
|
||||
ports:
|
||||
- 8482
|
||||
- 8400
|
||||
@@ -44,7 +44,7 @@ services:
|
||||
restart: always
|
||||
vmstorage-2:
|
||||
container_name: vmstorage-2
|
||||
image: victoriametrics/vmstorage:v1.92.0-cluster
|
||||
image: victoriametrics/vmstorage:v1.92.1-cluster
|
||||
ports:
|
||||
- 8482
|
||||
- 8400
|
||||
@@ -56,7 +56,7 @@ services:
|
||||
restart: always
|
||||
vminsert:
|
||||
container_name: vminsert
|
||||
image: victoriametrics/vminsert:v1.92.0-cluster
|
||||
image: victoriametrics/vminsert:v1.92.1-cluster
|
||||
depends_on:
|
||||
- "vmstorage-1"
|
||||
- "vmstorage-2"
|
||||
@@ -68,7 +68,7 @@ services:
|
||||
restart: always
|
||||
vmselect:
|
||||
container_name: vmselect
|
||||
image: victoriametrics/vmselect:v1.92.0-cluster
|
||||
image: victoriametrics/vmselect:v1.92.1-cluster
|
||||
depends_on:
|
||||
- "vmstorage-1"
|
||||
- "vmstorage-2"
|
||||
@@ -82,7 +82,7 @@ services:
|
||||
|
||||
vmalert:
|
||||
container_name: vmalert
|
||||
image: victoriametrics/vmalert:v1.92.0
|
||||
image: victoriametrics/vmalert:v1.92.1
|
||||
depends_on:
|
||||
- "vmselect"
|
||||
ports:
|
||||
|
||||
@@ -2,7 +2,7 @@ version: "3.5"
|
||||
services:
|
||||
vmagent:
|
||||
container_name: vmagent
|
||||
image: victoriametrics/vmagent:v1.92.0
|
||||
image: victoriametrics/vmagent:v1.92.1
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
@@ -18,7 +18,7 @@ services:
|
||||
restart: always
|
||||
victoriametrics:
|
||||
container_name: victoriametrics
|
||||
image: victoriametrics/victoria-metrics:v1.92.0
|
||||
image: victoriametrics/victoria-metrics:v1.92.1
|
||||
ports:
|
||||
- 8428:8428
|
||||
- 8089:8089
|
||||
@@ -56,7 +56,7 @@ services:
|
||||
restart: always
|
||||
vmalert:
|
||||
container_name: vmalert
|
||||
image: victoriametrics/vmalert:v1.92.0
|
||||
image: victoriametrics/vmalert:v1.92.1
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
- "alertmanager"
|
||||
|
||||
@@ -6,7 +6,7 @@ positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://vlogs:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app,pid
|
||||
- url: http://vlogs:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app
|
||||
tenant_id: "0:0"
|
||||
|
||||
scrape_configs:
|
||||
|
||||
@@ -20,11 +20,6 @@ services:
|
||||
condition: service_healthy
|
||||
victoriametrics:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8686/health"]
|
||||
interval: 1s
|
||||
timeout: 1s
|
||||
retries: 10
|
||||
|
||||
# Run `make package-victoria-logs` to build victoria-logs image
|
||||
victorialogs:
|
||||
|
||||
@@ -105,7 +105,7 @@ services:
|
||||
- '--config=/config.yml'
|
||||
|
||||
vmsingle:
|
||||
image: victoriametrics/victoria-metrics:v1.92.0
|
||||
image: victoriametrics/victoria-metrics:v1.92.1
|
||||
ports:
|
||||
- '8428:8428'
|
||||
command:
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
4. Set variables `DIGITALOCEAN_API_TOKEN` and `VM_VERSION` for the `packer` environment and run `make` as in the example below:
|
||||
|
||||
```console
|
||||
make release-victoria-metrics-digitalocean-oneclick-droplet DIGITALOCEAN_API_TOKEN="dop_v23_2e46f4759ceeeba0d0248" VM_VERSION="1.92.0"
|
||||
make release-victoria-metrics-digitalocean-oneclick-droplet DIGITALOCEAN_API_TOKEN="dop_v23_2e46f4759ceeeba0d0248" VM_VERSION="1.92.1"
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -19,8 +19,8 @@ On the server:
|
||||
* VictoriaMetrics is running on ports: 8428, 8089, 4242, 2003 and they are bound to the local interface.
|
||||
|
||||
********************************************************************************
|
||||
# This image includes 1.92.0 version of VictoriaMetrics.
|
||||
# See Release notes https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.0
|
||||
# This image includes 1.92.1 version of VictoriaMetrics.
|
||||
# See Release notes https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.1
|
||||
|
||||
# Welcome to VictoriaMetrics droplet!
|
||||
|
||||
|
||||
@@ -24,6 +24,32 @@ The following `tip` changes can be tested by building VictoriaMetrics components
|
||||
|
||||
## tip
|
||||
|
||||
* SECURITY: upgrade Go builder from Go1.20.6 to Go1.20.7. The update includes a security fix to the crypto/tls package, as well as bug fixes to the assembler and the compiler. See [the list of issues addressed in Go1.20.7](https://github.com/golang/go/issues?q=milestone%3AGo1.20.7+label%3ACherryPickApproved).
|
||||
|
||||
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `share_eq_over_time(m[d], eq)` function for calculating the share (in the range `[0...1]`) of raw samples on the given lookbehind window `d`, which are equal to `eq`. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4441). Thanks to @Damon07 for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4725).
|
||||
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): allow configuring deadline for a backend to be excluded from the rotation on errors via `-failTimeout` cmd-line flag. This feature could be useful when it is expected for backends to be not available for significant periods of time. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4415) for details. Thanks to @SunKyu for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4416).
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): remove deprecated in [v1.61.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.61.0) `-rule.configCheckInterval` command-line flag. Use `-configCheckInterval` command-line flag instead.
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): remove support of deprecated web links of `/api/v1/<groupID>/<alertID>/status` form in favour of `/api/v1/alerts?group_id=<>&alert_id=<>` links. Links of `/api/v1/<groupID>/<alertID>/status` form were deprecated in v1.79.0. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2825) for details.
|
||||
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): allow disabling binary export API protocol via `-vm-native-disable-binary-protocol` cmd-line flag when [migrating data from VictoriaMetrics](https://docs.victoriametrics.com/vmctl.html#migrating-data-from-victoriametrics). Disabling binary protocol can be useful for deduplication of the exported data before ingestion. For this, deduplication needs [to be configured](https://docs.victoriametrics.com/#deduplication) at `-vm-native-src-addr` side and `-vm-native-disable-binary-protocol` should be set on vmctl side.
|
||||
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): add support of `week` step for [time-based chunking migration](https://docs.victoriametrics.com/vmctl.html#using-time-based-chunking-of-migration). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4738).
|
||||
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): do not add `/api/v1/read` suffix to remote read storage address defined by `--remote-read-src-addr` if a `--remote-read-disable-path-append` command-line flag is set. It allows overriding the path for remote-read API via `--remote-read-src-addr`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4655).
|
||||
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add warning in query field of vmui for partial data responses. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4721).
|
||||
* FEATURE: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): add `Concurrent inserts` panel to vmagent's dashboard. The new panel is supposed to show whether the number of concurrent inserts processed by vmagent isn't reaching the limit.
|
||||
* FEATURE: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): add panels for absolute Mem and CPU usage by vmalert. See related issue [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4627).
|
||||
* FEATURE: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): correctly calculate `Bytes per point` value for single-server and cluster VM dashboards. Before, the calculation mistakenly accounted for the number of entries in indexdb in denominator, which could have shown lower values than expected.
|
||||
* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components.
|
||||
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680).
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly set `vmalert_config_last_reload_successful` value on configuration updates or rollbacks. The bug was introduced in [v1.92.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.0) in [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4543).
|
||||
* BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704).
|
||||
|
||||
|
||||
## [v1.92.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.1)
|
||||
|
||||
Released at 2023-07-28
|
||||
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): revert unit test feature for alerting and recording rules introduced in [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4596). See the following [change](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4734).
|
||||
|
||||
## [v1.92.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.0)
|
||||
|
||||
Released at 2023-07-27
|
||||
@@ -40,6 +66,8 @@ The previous behavior can be restored in the following ways:
|
||||
- by passing `-streamAggr.dropInput` command-line flag to single-node VictoriaMetrics;
|
||||
- by passing `-remoteWrite.streamAggr.dropInput` command-line flag per each configured `-remoteWrite.streamAggr.config` at `vmagent`.
|
||||
|
||||
---
|
||||
|
||||
* SECURITY: upgrade base docker image (alpine) from 3.18.0 to 3.18.2. See [alpine 3.18.2 release notes](https://alpinelinux.org/posts/Alpine-3.15.9-3.16.6-3.17.4-3.18.2-released.html).
|
||||
* SECURITY: upgrade Go builder from Go1.20.5 to Go1.20.6. See [the list of issues addressed in Go1.20.6](https://github.com/golang/go/issues?q=milestone%3AGo1.20.6+label%3ACherryPickApproved).
|
||||
|
||||
@@ -2117,7 +2145,7 @@ in front of VictoriaMetrics. [Contact us](mailto:sales@victoriametrics.com) if y
|
||||
|
||||
Released at 2021-01-13
|
||||
|
||||
* FEATURE: provide a sample list of alerting rules for VictoriaMetrics components. It is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml).
|
||||
* FEATURE: provide a sample list of alerting rules for VictoriaMetrics components. It is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts).
|
||||
* FEATURE: disable final merge for data for the previous month at the beginning of new month, since it may result in high disk IO and CPU usage. Final merge can be enabled by setting `-finalMergeDelay` command-line flag to positive duration.
|
||||
* FEATURE: add `tfirst_over_time(m[d])` and `tlast_over_time(m[d])` functions to [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) for returning timestamps for the first and the last data point in `m` over `d` duration.
|
||||
* FEATURE: add ability to pass multiple labels to `sort_by_label()` and `sort_by_label_desc()` functions. See <https://github.com/VictoriaMetrics/VictoriaMetrics/issues/992> .
|
||||
|
||||
@@ -30,6 +30,7 @@ where you can chat with VictoriaMetrics users to get additional references, revi
|
||||
- [Groove X](#groove-x)
|
||||
- [Idealo.de](#idealode)
|
||||
- [MHI Vestas Offshore Wind](#mhi-vestas-offshore-wind)
|
||||
- [Naver](#naver)
|
||||
- [Percona](#percona)
|
||||
- [Razorpay](#razorpay)
|
||||
- [Roblox](#roblox)
|
||||
@@ -420,6 +421,20 @@ Numbers with current, limited roll out:
|
||||
- Data size on disk: 800 GiB
|
||||
- Retention period: 3 years
|
||||
|
||||
## Naver
|
||||
|
||||
See [our](https://www.navercorp.com/en/) video ["Time Series in the Multiverse of Madness" (in Korean)](https://www.youtube.com/watch?v=OUyXPgVcdw4) about the comparison of Time Series Databases and why we have chosen VictoriaMetrics.
|
||||
We also covered the internals of the VictoriaMetrics data model and Cluster.
|
||||
The key areas:
|
||||
|
||||
* Explanation of the importance and role of Monitoring for NaverCorp
|
||||
* History overview of Time Series Databases
|
||||
* VictoriaMetrics Data model - read and write paths, index structure, compression and the crucial role of the churn rate
|
||||
* Time series in the Multiverse of Madness
|
||||
* HA and Fault Tolerance - write without data loss, read for no downtime, management of Multiverse
|
||||
|
||||
[Slides](https://deview.kr/data/deview/session/attach/%5B2B4%5DVictoriaMetrics_%E1%84%89%E1%85%B5%E1%84%80%E1%85%A8%E1%84%8B%E1%85%A7%E1%86%AF_%E1%84%83%E1%85%A6%E1%84%8B%E1%85%B5%E1%84%90%E1%85%A5_%E1%84%83%E1%85%A2%E1%84%92%E1%85%A9%E1%86%AB%E1%84%83%E1%85%A9%E1%86%AB%E1%84%8B%E1%85%B4_%E1%84%86%E1%85%A5%E1%86%AF%E1%84%90%E1%85%B5%E1%84%87%E1%85%A5%E1%84%89%E1%85%B3_Kor+Eng.pdf) in English and Korean
|
||||
|
||||
## Percona
|
||||
|
||||
[Percona](https://www.percona.com/) is a leader in providing best-of-breed enterprise-class support, consulting, managed services, training and software for MySQL®, MariaDB®, MongoDB®, PostgreSQL® and other open source databases in on-premises and cloud environments.
|
||||
|
||||
@@ -294,7 +294,7 @@ or Prometheus to scrape `/metrics` pages from all the cluster components, so the
|
||||
with [the official Grafana dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11176-victoriametrics-cluster/)
|
||||
or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11831). Graphs on these dashboards contain useful hints - hover the `i` icon at the top left corner of each graph in order to read it.
|
||||
|
||||
It is recommended setting up alerts in [vmalert](https://docs.victoriametrics.com/vmalert.html) or in Prometheus from [this config](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/deployment/docker/alerts.yml).
|
||||
It is recommended setting up alerts in [vmalert](https://docs.victoriametrics.com/vmalert.html) or in Prometheus from [this list](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts).
|
||||
See more details in the article [VictoriaMetrics Monitoring](https://victoriametrics.com/blog/victoriametrics-monitoring/).
|
||||
|
||||
## Cardinality limiter
|
||||
@@ -340,7 +340,7 @@ Check practical examples of VictoriaMetrics API [here](https://docs.victoriametr
|
||||
- `prometheus/api/v1/import/native` - for importing data obtained via `api/v1/export/native` on `vmselect` (see below).
|
||||
- `prometheus/api/v1/import/csv` - for importing arbitrary CSV data. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-csv-data) for details.
|
||||
- `prometheus/api/v1/import/prometheus` - for importing data in [Prometheus text exposition format](https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md#text-based-format) and in [OpenMetrics format](https://github.com/OpenObservability/OpenMetrics/blob/master/specification/OpenMetrics.md). This endpoint also supports [Pushgateway protocol](https://github.com/prometheus/pushgateway#url). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-prometheus-exposition-format) for details.
|
||||
- `opentemetry/api/v1/push` - for ingesting data via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#sending-data-via-opentelemetry).
|
||||
- `opentelemetry/api/v1/push` - for ingesting data via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#sending-data-via-opentelemetry).
|
||||
- `datadog/api/v1/series` - for ingesting data with [DataDog submit metrics API](https://docs.datadoghq.com/api/latest/metrics/#submit-metrics). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-datadog-agent) for details.
|
||||
- `influx/write` and `influx/api/v2/write` - for ingesting data with [InfluxDB line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for details.
|
||||
- `opentsdb/api/put` - for accepting [OpenTSDB HTTP /api/put requests](http://opentsdb.net/docs/build/html/api_http/put.html). This handler is disabled by default. It is exposed on a distinct TCP address set via `-opentsdbHTTPListenAddr` command-line flag. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#sending-opentsdb-data-via-http-apiput-requests) for details.
|
||||
|
||||
@@ -744,6 +744,14 @@ Metric names are stripped from the resulting rollups. Add [keep_metric_names](#k
|
||||
|
||||
See also [share_gt_over_time](#share_gt_over_time).
|
||||
|
||||
#### share_eq_over_time
|
||||
|
||||
`share_eq_over_time(series_selector[d], eq)` is a [rollup function](#rollup-functions), which returns share (in the range `[0...1]`) of raw samples
|
||||
on the given lookbehind window `d`, which are equal to `eq`. It is calculated independently per each time series returned
|
||||
from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.html#filtering).
|
||||
|
||||
Metric names are stripped from the resulting rollups. Add [keep_metric_names](#keep_metric_names) modifier in order to keep metric names.
|
||||
|
||||
#### stale_samples_over_time
|
||||
|
||||
`stale_samples_over_time(series_selector[d])` is a [rollup function](#rollup-functions), which calculates the number
|
||||
|
||||
@@ -145,8 +145,7 @@ VictoriaMetric team prepared a list of [Grafana dashboards](https://grafana.com/
|
||||
for the main components. Each dashboard contains a lot of useful information and tips. It is recommended
|
||||
to have these dashboards installed and up to date.
|
||||
|
||||
The list of alerts for [single](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml)
|
||||
and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/deployment/docker/alerts.yml)
|
||||
Using the [recommended alerting rules](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts)
|
||||
versions would also help to identify and notify about issues with the system.
|
||||
|
||||
The rule of thumb is to have a separate installation of VictoriaMetrics or any other monitoring system
|
||||
|
||||
@@ -120,6 +120,7 @@ Case studies:
|
||||
* [Groove X](https://docs.victoriametrics.com/CaseStudies.html#groove-x)
|
||||
* [Idealo.de](https://docs.victoriametrics.com/CaseStudies.html#idealode)
|
||||
* [MHI Vestas Offshore Wind](https://docs.victoriametrics.com/CaseStudies.html#mhi-vestas-offshore-wind)
|
||||
* [Naver](https://docs.victoriametrics.com/CaseStudies.html#naver)
|
||||
* [Razorpay](https://docs.victoriametrics.com/CaseStudies.html#razorpay)
|
||||
* [Percona](https://docs.victoriametrics.com/CaseStudies.html#percona)
|
||||
* [Roblox](https://docs.victoriametrics.com/CaseStudies.html#roblox)
|
||||
@@ -1363,7 +1364,7 @@ VictoriaMetrics also may scrape Prometheus targets - see [these docs](#how-to-sc
|
||||
|
||||
## Sending data via OpenTelemetry
|
||||
|
||||
VictoriaMetrics supports data ingestion via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md) at `/opentemetry/api/v1/push` path.
|
||||
VictoriaMetrics supports data ingestion via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md) at `/opentelemetry/api/v1/push` path.
|
||||
|
||||
VictoriaMetrics expects `protobuf`-encoded requests at `/opentelemetry/api/v1/push`.
|
||||
Set HTTP request header `Content-Encoding: gzip` when sending gzip-compressed data to `/opentelemetry/api/v1/push`.
|
||||
@@ -1784,7 +1785,7 @@ created by community.
|
||||
|
||||
Graphs on the dashboards contain useful hints - hover the `i` icon in the top left corner of each graph to read it.
|
||||
|
||||
We recommend setting up [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml)
|
||||
We recommend setting up [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts)
|
||||
via [vmalert](https://docs.victoriametrics.com/vmalert.html) or via Prometheus.
|
||||
|
||||
VictoriaMetrics exposes currently running queries and their execution times at `/api/v1/status/active_queries` page.
|
||||
|
||||
@@ -73,7 +73,7 @@ and make it merged. See example in this [commit](https://github.com/VictoriaMetr
|
||||
* linux/ppc64le
|
||||
* linux/386
|
||||
This step can be run manually with the command `make publish` from the needed git tag.
|
||||
1. Push the tags `v1.xx.y` and `v1.xx.y-cluster` created at step 2 to public GitHub repository at https://github.com/VictoriaMetrics/VictoriaMetrics.
|
||||
1. Push the tags `v1.xx.y` and `v1.xx.y-cluster` created at step 8 to public GitHub repository at https://github.com/VictoriaMetrics/VictoriaMetrics.
|
||||
Push the tags `v1.xx.y`, `v1.xx.y-cluster`, `v1.xx.y-enterprise` and `v1.xx.y-enterprise-cluster` to the corresponding
|
||||
branches in private repository.
|
||||
**Important note:** do not push enterprise tags to public GitHub repository - they must be pushed only to private repository.
|
||||
@@ -81,7 +81,7 @@ and make it merged. See example in this [commit](https://github.com/VictoriaMetr
|
||||
a) Create draft GitHub release with the name `TAG`. This step can be run manually
|
||||
with the command `TAG=v1.xx.y make github-create-release`.
|
||||
The release id is stored at `/tmp/vm-github-release` file.
|
||||
b) Upload all the binaries and checksums created at step `3a` to that release.
|
||||
b) Upload all the binaries and checksums created at step `9a` to that release.
|
||||
This step can be run manually with the command `make github-upload-assets`.
|
||||
It is expected that the needed release id is stored at `/tmp/vm-github-release` file,
|
||||
which must be created at the step `a`.
|
||||
@@ -98,6 +98,7 @@ and make it merged. See example in this [commit](https://github.com/VictoriaMetr
|
||||
1. Bump version of the VictoriaMetrics cluster in the [sandbox environment](https://github.com/VictoriaMetrics/ops/blob/main/gcp-test/sandbox/manifests/benchmark-vm/vmcluster.yaml)
|
||||
by [opening and merging PR](https://github.com/VictoriaMetrics/ops/pull/58).
|
||||
1. Bump VictoriaMetrics version at `deployment/docker/docker-compose.yml` and at `deployment/docker/docker-compose-cluster.yml`.
|
||||
1. Follow the instructions in [release follow-up](https://github.com/VictoriaMetrics/VictoriaMetrics-enterprise/blob/master/Release-Guide.md).
|
||||
|
||||
## Building snap package
|
||||
|
||||
|
||||
@@ -128,6 +128,7 @@ Case studies:
|
||||
* [Groove X](https://docs.victoriametrics.com/CaseStudies.html#groove-x)
|
||||
* [Idealo.de](https://docs.victoriametrics.com/CaseStudies.html#idealode)
|
||||
* [MHI Vestas Offshore Wind](https://docs.victoriametrics.com/CaseStudies.html#mhi-vestas-offshore-wind)
|
||||
* [Naver](https://docs.victoriametrics.com/CaseStudies.html#naver)
|
||||
* [Razorpay](https://docs.victoriametrics.com/CaseStudies.html#razorpay)
|
||||
* [Percona](https://docs.victoriametrics.com/CaseStudies.html#percona)
|
||||
* [Roblox](https://docs.victoriametrics.com/CaseStudies.html#roblox)
|
||||
@@ -1371,7 +1372,7 @@ VictoriaMetrics also may scrape Prometheus targets - see [these docs](#how-to-sc
|
||||
|
||||
## Sending data via OpenTelemetry
|
||||
|
||||
VictoriaMetrics supports data ingestion via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md) at `/opentemetry/api/v1/push` path.
|
||||
VictoriaMetrics supports data ingestion via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md) at `/opentelemetry/api/v1/push` path.
|
||||
|
||||
VictoriaMetrics expects `protobuf`-encoded requests at `/opentelemetry/api/v1/push`.
|
||||
Set HTTP request header `Content-Encoding: gzip` when sending gzip-compressed data to `/opentelemetry/api/v1/push`.
|
||||
@@ -1792,7 +1793,7 @@ created by community.
|
||||
|
||||
Graphs on the dashboards contain useful hints - hover the `i` icon in the top left corner of each graph to read it.
|
||||
|
||||
We recommend setting up [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml)
|
||||
We recommend setting up [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts)
|
||||
via [vmalert](https://docs.victoriametrics.com/vmalert.html) or via Prometheus.
|
||||
|
||||
VictoriaMetrics exposes currently running queries and their execution times at `/api/v1/status/active_queries` page.
|
||||
|
||||
@@ -414,9 +414,8 @@ would help identify and prevent most of the issues listed above.
|
||||
[Grafana dashboards](https://grafana.com/orgs/victoriametrics/dashboards) contain panels reflecting the
|
||||
health state, resource usage and other specific metrics for VictoriaMetrics components.
|
||||
|
||||
Alerting rules for [single-node](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml)
|
||||
and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/deployment/docker/alerts.yml) versions
|
||||
of VictoriaMetrics will notify about issues with Victoriametrics components and provide recommendations for how to solve them.
|
||||
The list of [recommended alerting rules](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts)
|
||||
for VictoriaMetrics components will notify about issues and provide recommendations for how to solve them.
|
||||
|
||||
Internally, we heavily rely both on dashboards and alerts, and constantly improve them.
|
||||
It is important to stay up to date with such changes.
|
||||
|
||||
@@ -5,6 +5,10 @@ according to [these docs](https://docs.victoriametrics.com/VictoriaLogs/QuickSta
|
||||
|
||||
## tip
|
||||
|
||||
* FEATURE: expose the following metrics at [/metrics](monitoring) page:
|
||||
* `vl_data_size_bytes{type="storage"}` - on-disk size for data excluding [log stream](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) indexes.
|
||||
* `vl_data_size_bytes{type="indexdb"}` - on-disk size for [log stream](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) indexes.
|
||||
|
||||
## [v0.3.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.3.0-victorialogs)
|
||||
|
||||
Released at 2023-07-20
|
||||
|
||||
@@ -143,6 +143,7 @@ Here are a Docker-compose demos, which start VictoriaLogs and push logs to it vi
|
||||
- [Fluentbit demo](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/victorialogs/fluentbit-docker)
|
||||
- [Logstash demo](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/victorialogs/logstash)
|
||||
- [Vector demo](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/victorialogs/vector-docker)
|
||||
- [Promtail demo](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/victorialogs/promtail)
|
||||
|
||||
You can use [this Helm chart](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-logs-single/README.md)
|
||||
as a demo for running Fluentbit in Kubernetes with VictoriaLogs.
|
||||
|
||||
@@ -29,9 +29,9 @@ See [these docs](https://docs.victoriametrics.com/VictoriaLogs/) for details.
|
||||
The following functionality is planned in the future versions of VictoriaLogs:
|
||||
|
||||
- Support for [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/) from popular log collectors and formats:
|
||||
- Promtail (aka Grafana Loki)
|
||||
- Fluentd
|
||||
- Syslog
|
||||
- Journald (systemd)
|
||||
- Add missing functionality to [LogsQL](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html):
|
||||
- [Stream context](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-context).
|
||||
- [Transformation functions](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#transformations).
|
||||
|
||||
@@ -8,12 +8,18 @@ for sending the collected logs to [VictoriaLogs](https://docs.victoriametrics.co
|
||||
|
||||
```yaml
|
||||
clients:
|
||||
- url: http://localhost:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app,pid
|
||||
- url: http://localhost:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app
|
||||
```
|
||||
|
||||
Substitute `localhost:9428` address inside `clients` with the real TCP address of VictoriaLogs.
|
||||
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/#http-parameters) for details on the used URL query parameter section.
|
||||
By default VictoriaLogs stores all the ingested logs into a single [log stream](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields).
|
||||
Storing all the logs in a single log stream may be inefficient, so it is recommended to specify the `_stream_fields` query arg
|
||||
with the list of labels that uniquely identify log streams. There is no need to specify all the labels Promtail generates there -
|
||||
it is usually enough to specify the `instance` and `job` labels. See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields)
|
||||
for details.
|
||||
|
||||
See also [these docs](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/#http-parameters) for details on other supported query args.
|
||||
There is no need to specify `_msg_field` and `_time_field` query args, since VictoriaLogs automatically extracts the log message and timestamp from the ingested Loki data.
|
||||
|
||||
It is recommended to verify whether the initial setup generates the needed [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model)
|
||||
@@ -23,7 +29,7 @@ and inspecting VictoriaLogs logs then:
|
||||
|
||||
```yaml
|
||||
clients:
|
||||
- url: http://localhost:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app,pid&debug=1
|
||||
- url: http://localhost:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app&debug=1
|
||||
```
|
||||
|
||||
If some [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) must be skipped
|
||||
@@ -32,7 +38,7 @@ For example, the following config instructs VictoriaLogs to ignore `filename` an
|
||||
|
||||
```yaml
|
||||
clients:
|
||||
- url: http://localhost:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app,pid&ignore_fields=filename,stream
|
||||
- url: http://localhost:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app&ignore_fields=filename,stream
|
||||
```
|
||||
|
||||
By default the ingested logs are stored in the `(AccountID=0, ProjectID=0)` [tenant](https://docs.victoriametrics.com/VictoriaLogs/#multitenancy).
|
||||
@@ -43,7 +49,7 @@ For example, the following config instructs VictoriaLogs to store logs in the `(
|
||||
|
||||
```yaml
|
||||
clients:
|
||||
- url: http://localhost:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app,pid&debug=1
|
||||
- url: http://localhost:9428/insert/loki/api/v1/push?_stream_fields=instance,job,host,app&debug=1
|
||||
tenant_id: "12:34"
|
||||
```
|
||||
|
||||
|
||||
@@ -255,9 +255,10 @@ VictoriaLogs exposes various [metrics](https://docs.victoriametrics.com/Victoria
|
||||
|
||||
Here is the list of log collectors and their ingestion formats supported by VictoriaLogs:
|
||||
|
||||
| How to setup the collector | Format: Elasticsearch | Format: JSON Stream |
|
||||
|------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|---------------------------------------------------------------|
|
||||
| [Filebeat](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Filebeat.html) | [Yes](https://www.elastic.co/guide/en/beats/filebeat/current/elasticsearch-output.html) | No |
|
||||
| [Fluentbit](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Fluentbit.html) | No | [Yes](https://docs.fluentbit.io/manual/pipeline/outputs/http) |
|
||||
| [Logstash](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Logstash.html) | [Yes](https://www.elastic.co/guide/en/logstash/current/plugins-outputs-elasticsearch.html) | No |
|
||||
| [Vector](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Vector.html) | [Yes](https://vector.dev/docs/reference/configuration/sinks/elasticsearch/) | No |
|
||||
| How to set up the collector | Format: Elasticsearch | Format: JSON Stream | Format: Loki |
|
||||
|------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|---------------------------------------------------------------|-------------------------------------------------------------------------------------|
|
||||
| [Filebeat](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Filebeat.html) | [Yes](https://www.elastic.co/guide/en/beats/filebeat/current/elasticsearch-output.html) | No | No |
|
||||
| [Fluentbit](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Fluentbit.html) | No | [Yes](https://docs.fluentbit.io/manual/pipeline/outputs/http) | [Yes](https://docs.fluentbit.io/manual/pipeline/outputs/loki) |
|
||||
| [Logstash](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Logstash.html) | [Yes](https://www.elastic.co/guide/en/logstash/current/plugins-outputs-elasticsearch.html) | No | No |
|
||||
| [Vector](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Vector.html) | [Yes](https://vector.dev/docs/reference/configuration/sinks/elasticsearch/) | No | [Yes](https://vector.dev/docs/reference/configuration/sinks/loki/) |
|
||||
| [Promtail](https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/Promtail.html) | No | No | [Yes](https://grafana.com/docs/loki/latest/clients/promtail/configuration/#clients) |
|
||||
|
||||
@@ -75,10 +75,8 @@ You can set up vmalert in each Ground control region that evaluates recording an
|
||||
|
||||
For alert deduplication, please use [cluster mode in Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/#high-availability).
|
||||
|
||||
We also recommend adopting these alerts:
|
||||
|
||||
* VictoriaMetrics Single - [https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml)
|
||||
* VictoriaMetrics Cluster - [https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/deployment/docker/alerts.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/deployment/docker/alerts.yml)
|
||||
We also recommend adopting the list of [alerting rules](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts)
|
||||
for VictoriaMetrics components.
|
||||
|
||||
### Monitoring
|
||||
|
||||
|
||||
1444
docs/operator/CHANGELOG.md
Normal file
1444
docs/operator/CHANGELOG.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -63,6 +63,7 @@ This Document documents the types introduced by the VictoriaMetrics to be consum
|
||||
* [EmbeddedPodDisruptionBudgetSpec](#embeddedpoddisruptionbudgetspec)
|
||||
* [EmbeddedProbes](#embeddedprobes)
|
||||
* [HTTPAuth](#httpauth)
|
||||
* [KeyValue](#keyvalue)
|
||||
* [ServiceSpec](#servicespec)
|
||||
* [StorageSpec](#storagespec)
|
||||
* [StreamAggrConfig](#streamaggrconfig)
|
||||
@@ -122,12 +123,14 @@ This Document documents the types introduced by the VictoriaMetrics to be consum
|
||||
* [StaticRef](#staticref)
|
||||
* [TargetRef](#targetref)
|
||||
* [VMUser](#vmuser)
|
||||
* [VMUserIPFilters](#vmuseripfilters)
|
||||
* [VMUserList](#vmuserlist)
|
||||
* [VMUserSpec](#vmuserspec)
|
||||
* [EmbeddedIngress](#embeddedingress)
|
||||
* [VMAuth](#vmauth)
|
||||
* [VMAuthList](#vmauthlist)
|
||||
* [VMAuthSpec](#vmauthspec)
|
||||
* [VMAuthUnauthorizedPath](#vmauthunauthorizedpath)
|
||||
* [TargetEndpoint](#targetendpoint)
|
||||
* [VMStaticScrape](#vmstaticscrape)
|
||||
* [VMStaticScrapeList](#vmstaticscrapelist)
|
||||
@@ -256,7 +259,7 @@ EmailConfig configures notifications via Email.
|
||||
| auth_password | AuthPassword defines secret name and key at CRD namespace. | *[v1.SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#secretkeyselector-v1-core) | false |
|
||||
| auth_secret | AuthSecret defines secret name and key at CRD namespace. It must contain the CRAM-MD5 secret. | *[v1.SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#secretkeyselector-v1-core) | false |
|
||||
| auth_identity | The identity to use for authentication. | string | false |
|
||||
| headers | Further headers email header key/value pairs. Overrides any headers previously set by the notification implementation. | map[string]string | false |
|
||||
| headers | Further headers email header key/value pairs. Overrides any headers previously set by the notification implementation. | EmailConfigHeaders | false |
|
||||
| html | The HTML body of the email notification. | string | false |
|
||||
| text | The text body of the email notification. | string | false |
|
||||
| require_tls | The SMTP TLS requirement. Note that Go does not support unencrypted connections to remote SMTP endpoints. | *bool | false |
|
||||
@@ -504,7 +507,7 @@ SlackConfirmationField protect users from destructive actions or particularly di
|
||||
|
||||
## SlackField
|
||||
|
||||
See https://api.slack.com/docs/message-attachments#fields for more information.
|
||||
SlackField configures a single Slack field that is sent with each notification. See https://api.slack.com/docs/message-attachments#fields for more information.
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
@@ -807,7 +810,7 @@ VMAgentSpec defines the desired state of VMAgent
|
||||
|
||||
## VMAgentStatus
|
||||
|
||||
VmAgentStatus defines the observed state of VmAgent
|
||||
VMAgentStatus defines the observed state of VmAgent
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
@@ -838,7 +841,7 @@ BearerAuth defines auth with bearer token
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| bearerTokenFilePath | | string | false |
|
||||
| bearerTokenFile | Path to bearer token file | string | false |
|
||||
| bearerTokenSecret | Optional bearer auth token to use for -remoteWrite.url | *[v1.SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#secretkeyselector-v1-core) | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
@@ -932,14 +935,25 @@ HTTPAuth generic auth used with http protocols
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| basicAuth | | *[BasicAuth](#basicauth) | false |
|
||||
| OAuth2 | | *[OAuth2](#oauth2) | false |
|
||||
| oauth2 | | *[OAuth2](#oauth2) | false |
|
||||
| tlsConfig | | *[TLSConfig](#tlsconfig) | false |
|
||||
| bearerTokenFilePath | | string | false |
|
||||
| bearerTokenFile | Path to bearer token file | string | false |
|
||||
| bearerTokenSecret | Optional bearer auth token to use for -remoteWrite.url | *[v1.SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#secretkeyselector-v1-core) | false |
|
||||
| headers | Headers allow configuring custom http headers. Must be in the form of a colon-separated header name and value, e.g. headerName:headerValue. vmalert supports it since version 1.79.0 | []string | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
## KeyValue
|
||||
|
||||
KeyValue defines a (key, value) tuple.
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| key | Key of the tuple. | string | true |
|
||||
| value | Value of the tuple. | string | true |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
## ServiceSpec
|
||||
|
||||
ServiceSpec defines additional service for CRD with user-defined params. by default, some of fields can be inherited from default service definition for the CRD: labels,selector, ports. if metadata.name is not defined, service will have format {{CRD_TYPE}}-{{CRD_NAME}}-additional-service.
|
||||
@@ -1005,15 +1019,15 @@ VMAlert executes a list of given alerting or recording rules against configured
|
||||
|
||||
## VMAlertDatasourceSpec
|
||||
|
||||
VMAgentRemoteReadSpec defines the remote storage configuration for VmAlert to read alerts from
|
||||
VMAlertDatasourceSpec defines the remote storage configuration for VmAlert to read alerts from
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| url | Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428 | string | true |
|
||||
| basicAuth | | *[BasicAuth](#basicauth) | false |
|
||||
| OAuth2 | | *[OAuth2](#oauth2) | false |
|
||||
| oauth2 | | *[OAuth2](#oauth2) | false |
|
||||
| tlsConfig | | *[TLSConfig](#tlsconfig) | false |
|
||||
| bearerTokenFilePath | | string | false |
|
||||
| bearerTokenFile | Path to bearer token file | string | false |
|
||||
| bearerTokenSecret | Optional bearer auth token to use for -remoteWrite.url | *[v1.SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#secretkeyselector-v1-core) | false |
|
||||
| headers | Headers allow configuring custom http headers. Must be in the form of a colon-separated header name and value, e.g. headerName:headerValue. vmalert supports it since version 1.79.0 | []string | false |
|
||||
|
||||
@@ -1039,9 +1053,9 @@ VMAlertNotifierSpec defines the notifier url for sending information about alert
|
||||
| url | AlertManager url. E.g. http://127.0.0.1:9093 | string | false |
|
||||
| selector | Selector allows service discovery for alertmanager in this case all matched vmalertmanager replicas will be added into vmalert notifier.url as statefulset pod.fqdn | *[DiscoverySelector](#discoveryselector) | false |
|
||||
| basicAuth | | *[BasicAuth](#basicauth) | false |
|
||||
| OAuth2 | | *[OAuth2](#oauth2) | false |
|
||||
| oauth2 | | *[OAuth2](#oauth2) | false |
|
||||
| tlsConfig | | *[TLSConfig](#tlsconfig) | false |
|
||||
| bearerTokenFilePath | | string | false |
|
||||
| bearerTokenFile | Path to bearer token file | string | false |
|
||||
| bearerTokenSecret | Optional bearer auth token to use for -remoteWrite.url | *[v1.SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#secretkeyselector-v1-core) | false |
|
||||
| headers | Headers allow configuring custom http headers. Must be in the form of a colon-separated header name and value, e.g. headerName:headerValue. vmalert supports it since version 1.79.0 | []string | false |
|
||||
|
||||
@@ -1049,16 +1063,16 @@ VMAlertNotifierSpec defines the notifier url for sending information about alert
|
||||
|
||||
## VMAlertRemoteReadSpec
|
||||
|
||||
VMAgentRemoteReadSpec defines the remote storage configuration for VmAlert to read alerts from
|
||||
VMAlertRemoteReadSpec defines the remote storage configuration for VmAlert to read alerts from
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| url | URL of the endpoint to send samples to. | string | true |
|
||||
| lookback | Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s) Applied only to RemoteReadSpec | *string | false |
|
||||
| basicAuth | | *[BasicAuth](#basicauth) | false |
|
||||
| OAuth2 | | *[OAuth2](#oauth2) | false |
|
||||
| oauth2 | | *[OAuth2](#oauth2) | false |
|
||||
| tlsConfig | | *[TLSConfig](#tlsconfig) | false |
|
||||
| bearerTokenFilePath | | string | false |
|
||||
| bearerTokenFile | Path to bearer token file | string | false |
|
||||
| bearerTokenSecret | Optional bearer auth token to use for -remoteWrite.url | *[v1.SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#secretkeyselector-v1-core) | false |
|
||||
| headers | Headers allow configuring custom http headers. Must be in the form of a colon-separated header name and value, e.g. headerName:headerValue. vmalert supports it since version 1.79.0 | []string | false |
|
||||
|
||||
@@ -1066,7 +1080,7 @@ VMAgentRemoteReadSpec defines the remote storage configuration for VmAlert to re
|
||||
|
||||
## VMAlertRemoteWriteSpec
|
||||
|
||||
VMAgentRemoteWriteSpec defines the remote storage configuration for VmAlert
|
||||
VMAlertRemoteWriteSpec defines the remote storage configuration for VmAlert
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
@@ -1076,9 +1090,9 @@ VMAgentRemoteWriteSpec defines the remote storage configuration for VmAlert
|
||||
| maxBatchSize | Defines the max number of timeseries to be flushed at once (default 1000) | *int32 | false |
|
||||
| maxQueueSize | Defines the max number of pending datapoints to remote write endpoint (default 100000) | *int32 | false |
|
||||
| basicAuth | | *[BasicAuth](#basicauth) | false |
|
||||
| OAuth2 | | *[OAuth2](#oauth2) | false |
|
||||
| oauth2 | | *[OAuth2](#oauth2) | false |
|
||||
| tlsConfig | | *[TLSConfig](#tlsconfig) | false |
|
||||
| bearerTokenFilePath | | string | false |
|
||||
| bearerTokenFile | Path to bearer token file | string | false |
|
||||
| bearerTokenSecret | Optional bearer auth token to use for -remoteWrite.url | *[v1.SecretKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#secretkeyselector-v1-core) | false |
|
||||
| headers | Headers allow configuring custom http headers. Must be in the form of a colon-separated header name and value, e.g. headerName:headerValue. vmalert supports it since version 1.79.0 | []string | false |
|
||||
|
||||
@@ -1114,7 +1128,7 @@ VMAlertSpec defines the desired state of VMAlert
|
||||
| hostNetwork | HostNetwork controls whether the pod may use the node network namespace | bool | false |
|
||||
| dnsPolicy | DNSPolicy sets DNS policy for the pod | [v1.DNSPolicy](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#pod-v1-core) | false |
|
||||
| topologySpreadConstraints | TopologySpreadConstraints embedded kubernetes pod configuration option, controls how pods are spread across your cluster among failure-domains such as regions, zones, nodes, and other user-defined topology domains https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/ | [][v1.TopologySpreadConstraint](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/) | false |
|
||||
| evaluationInterval | EvaluationInterval how often evalute rules by default | string | false |
|
||||
| evaluationInterval | EvaluationInterval defines how often to evaluate rules by default | string | false |
|
||||
| enforcedNamespaceLabel | EnforcedNamespaceLabel enforces adding a namespace label of origin for each alert and metric that is user created. The label value will always be the namespace of the object that is being created. | string | false |
|
||||
| selectAllByDefault | SelectAllByDefault changes default behavior for empty CRD selectors, such as RuleSelector. With selectAllByDefault: true and empty serviceScrapeSelector and RuleNamespaceSelector, Operator selects all existing serviceScrapes. With selectAllByDefault: false, it selects nothing | bool | false |
|
||||
| ruleSelector | RuleSelector selector to select which VMRules to mount for loading alerting rules from. Works in combination with NamespaceSelector. If both nil - behaviour controlled by selectAllByDefault NamespaceSelector nil - only objects at VMAlert namespace. | *[metav1.LabelSelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#labelselector-v1-meta) | false |
|
||||
@@ -1147,7 +1161,7 @@ VMAlertSpec defines the desired state of VMAlert
|
||||
|
||||
## VMAlertStatus
|
||||
|
||||
VmAlertStatus defines the observed state of VmAlert
|
||||
VMAlertStatus defines the observed state of VmAlert
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
@@ -1241,10 +1255,12 @@ VMSingleStatus defines the observed state of VMSingle
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| replicas | ReplicaCount Total number of non-terminated pods targeted by this VMAlert cluster (their labels match the selector). | int32 | true |
|
||||
| updatedReplicas | UpdatedReplicas Total number of non-terminated pods targeted by this VMAlert cluster that have the desired version spec. | int32 | true |
|
||||
| availableReplicas | AvailableReplicas Total number of available pods (ready for at least minReadySeconds) targeted by this VMAlert cluster. | int32 | true |
|
||||
| unavailableReplicas | UnavailableReplicas Total number of unavailable pods targeted by this VMAlert cluster. | int32 | true |
|
||||
| replicas | ReplicaCount Total number of non-terminated pods targeted by this VMSingle. | int32 | true |
|
||||
| updatedReplicas | UpdatedReplicas Total number of non-terminated pods targeted by this VMSingle. | int32 | true |
|
||||
| availableReplicas | AvailableReplicas Total number of available pods (ready for at least minReadySeconds) targeted by this VMSingle. | int32 | true |
|
||||
| unavailableReplicas | UnavailableReplicas Total number of unavailable pods targeted by this VMSingle. | int32 | true |
|
||||
| singleStatus | | SingleStatus | true |
|
||||
| reason | | string | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
@@ -1259,8 +1275,10 @@ Rule describes an alerting or recording rule.
|
||||
| expr | Expr is query, that will be evaluated at dataSource | string | true |
|
||||
| debug | Debug enables logging for the rule; it is useful for tracking | *bool | false |
|
||||
| for | For evaluation interval in time.Duration format 30s, 1m, 1h or nanoseconds | string | false |
|
||||
| keep_firing_for | KeepFiringFor will make alert continue firing for this long even when the alerting expression no longer has results. Use time.Duration format, 30s, 1m, 1h or nanoseconds | string | false |
|
||||
| labels | Labels will be added to rule configuration | map[string]string | false |
|
||||
| annotations | Annotations will be added to rule configuration | map[string]string | false |
|
||||
| update_entries_limit | UpdateEntriesLimit defines max number of rule's state updates stored in memory. Overrides `-rule.updateEntriesLimit` in vmalert. | *int | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
@@ -1281,6 +1299,7 @@ RuleGroup is a list of sequentially evaluated recording and alerting rules.
|
||||
| params | Params optional HTTP URL parameters added to each rule request | url.Values | false |
|
||||
| type | Type defines datasource type for enterprise version of vmalert possible values - prometheus,graphite | string | false |
|
||||
| headers | Headers contains optional HTTP headers added to each rule request Must be in form `header-name: value` For example:\n headers:\n - \"CustomHeader: foo\"\n - \"CustomHeader2: bar\" | []string | false |
|
||||
| notifier_headers | NotifierHeaders contains optional HTTP headers added to each alert request which will be sent to the notifier. Must be in form `header-name: value` For example:\n headers:\n - \"CustomHeader: foo\"\n - \"CustomHeader2: bar\" | []string | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
@@ -1572,6 +1591,7 @@ PodMetricsEndpoint defines a scrapeable endpoint of a Kubernetes Pod serving Pro
|
||||
| authorization | Authorization with http header Authorization | *[Authorization](#authorization) | false |
|
||||
| vm_scrape_params | VMScrapeParams defines VictoriaMetrics specific scrape parameters | *[VMScrapeParams](#vmscrapeparams) | false |
|
||||
| attach_metadata | AttachMetadata configures metadata attaching from service discovery | [AttachMetadata](#attachmetadata) | false |
|
||||
| filterRunning | FilterRunning applies a filter with pod status == running. It prevents scraping metrics from pods in the failed or succeeded state. Enabled by default | *bool | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
@@ -1969,7 +1989,8 @@ StaticRef - user-defined routing host address.
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| url | URL http url for given staticRef. | string | true |
|
||||
| url | URL http url for given staticRef. | string | false |
|
||||
| urls | URLs allows setting multiple urls for load-balancing at vmauth-side. | []string | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
@@ -1984,6 +2005,7 @@ TargetRef describes target for user traffic forwarding. one of target types can
|
||||
| paths | Paths - matched path to route. | []string | false |
|
||||
| target_path_suffix | QueryParams []string `json:\"queryParams,omitempty\"` TargetPathSuffix allows to add some suffix to the target path It allows to hide tenant configuration from user with crd as ref. it also may contain any url encoded params. | string | false |
|
||||
| headers | Headers represent additional http headers, that vmauth uses in form of [\"header_key: header_value\"] multiple values for header key: [\"header_key: value1,value2\"] it's available since 1.68.0 version of vmauth | []string | false |
|
||||
| ip_filters | IPFilters defines per target src ip filters supported only with enterprise version of vmauth https://docs.victoriametrics.com/vmauth.html#ip-filters | [VMUserIPFilters](#vmuseripfilters) | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
@@ -1999,6 +2021,17 @@ VMUser is the Schema for the vmusers API
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
## VMUserIPFilters
|
||||
|
||||
VMUserIPFilters defines filters for IP addresses supported only with enterprise version of vmauth https://docs.victoriametrics.com/vmauth.html#ip-filters
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| deny_list | | []string | false |
|
||||
| allow_list | | []string | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
## VMUserList
|
||||
|
||||
VMUserList contains a list of VMUser
|
||||
@@ -2024,6 +2057,7 @@ VMUserSpec defines the desired state of VMUser
|
||||
| generatePassword | GeneratePassword instructs operator to generate password for user if spec.password is empty. | bool | false |
|
||||
| bearerToken | BearerToken Authorization header value for accessing protected endpoint. | *string | false |
|
||||
| targetRefs | TargetRefs - reference to endpoints, which user may access. | [][TargetRef](#targetref) | true |
|
||||
| default_url | DefaultURLs backend url for non-matching paths filter usually used for default backend with error message | []string | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
@@ -2106,7 +2140,7 @@ VMAuthSpec defines the desired state of VMAuth
|
||||
| userNamespaceSelector | UserNamespaceSelector Namespaces to be selected for VMAuth discovery. Works in combination with Selector. NamespaceSelector nil - only objects at VMAuth namespace. Selector nil - only objects at NamespaceSelector namespaces. If both nil - behaviour controlled by selectAllByDefault | *[metav1.LabelSelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#labelselector-v1-meta) | false |
|
||||
| extraArgs | ExtraArgs that will be passed to VMAuth pod for example remoteWrite.tmpDataPath: /tmp | map[string]string | false |
|
||||
| extraEnvs | ExtraEnvs that will be added to VMAuth pod | [][v1.EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#envvar-v1-core) | false |
|
||||
| serviceSpec | ServiceSpec that will be added to vmauth service spec | *[ServiceSpec](#servicespec) | false |
|
||||
| serviceSpec | ServiceSpec that will be added to vmauth service spec | *[ServiceSpec](#servicespec) | false |
|
||||
| serviceScrapeSpec | ServiceScrapeSpec that will be added to vmauth VMServiceScrape spec | *[VMServiceScrapeSpec](#vmservicescrapespec) | false |
|
||||
| podDisruptionBudget | PodDisruptionBudget created by operator | *[EmbeddedPodDisruptionBudgetSpec](#embeddedpoddisruptionbudgetspec) | false |
|
||||
| ingress | Ingress enables ingress configuration for VMAuth. | *[EmbeddedIngress](#embeddedingress) | false |
|
||||
@@ -2116,6 +2150,19 @@ VMAuthSpec defines the desired state of VMAuth
|
||||
| nodeSelector | NodeSelector Define which Nodes the Pods are scheduled on. | map[string]string | false |
|
||||
| terminationGracePeriodSeconds | TerminationGracePeriodSeconds period for container graceful termination | *int64 | false |
|
||||
| readinessGates | ReadinessGates defines pod readiness gates | []v1.PodReadinessGate | false |
|
||||
| unauthorizedAccessConfig | UnauthorizedAccessConfig configures access for unauthorized users | [][VMAuthUnauthorizedPath](#vmauthunauthorizedpath) | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
## VMAuthUnauthorizedPath
|
||||
|
||||
VMAuthUnauthorizedPath defines url_map for unauthorized access
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
| src_paths | Paths src request paths | []string | false |
|
||||
| url_prefix | URLs defines url_prefix for dst routing | []string | false |
|
||||
| ip_filters | IPFilters defines filter for src ip address enterprise only | [VMUserIPFilters](#vmuseripfilters) | false |
|
||||
|
||||
[Back to TOC](#table-of-contents)
|
||||
|
||||
@@ -2200,7 +2247,7 @@ ProbeTargetIngress defines the set of Ingress objects considered for probing.
|
||||
|
||||
## VMProbe
|
||||
|
||||
\n VMProbe defines a probe for targets, that will be executed with prober,\n like blackbox exporter.\nIt helps to monitor reachability of target with various checks.
|
||||
VMProbe defines a probe for targets, that will be executed with prober, like blackbox exporter. It helps to monitor reachability of target with various checks.
|
||||
|
||||
| Field | Description | Scheme | Required |
|
||||
| ----- | ----------- | ------ | -------- |
|
||||
|
||||
@@ -9,8 +9,8 @@ menu:
|
||||
aliases:
|
||||
- /operator/vars.html
|
||||
---
|
||||
# Auto Generated vars for package config
|
||||
updated at Mon May 8 06:43:29 UTC 2023
|
||||
# Auto Generated vars for package config
|
||||
updated at Thu Aug 3 16:52:44 UTC 2023
|
||||
|
||||
|
||||
| variable name | variable default value | variable required | variable description |
|
||||
@@ -20,7 +20,7 @@ updated at Mon May 8 06:43:29 UTC 2023
|
||||
| VM_CUSTOMCONFIGRELOADERIMAGE | victoriametrics/operator:config-reloader-v0.32.0 | false | - |
|
||||
| VM_PSPAUTOCREATEENABLED | true | false | - |
|
||||
| VM_VMALERTDEFAULT_IMAGE | victoriametrics/vmalert | false | - |
|
||||
| VM_VMALERTDEFAULT_VERSION | v1.89.1 | false | - |
|
||||
| VM_VMALERTDEFAULT_VERSION | v1.91.3 | false | - |
|
||||
| VM_VMALERTDEFAULT_PORT | 8080 | false | - |
|
||||
| VM_VMALERTDEFAULT_USEDEFAULTRESOURCES | true | false | - |
|
||||
| VM_VMALERTDEFAULT_RESOURCE_LIMIT_MEM | 500Mi | false | - |
|
||||
@@ -31,7 +31,7 @@ updated at Mon May 8 06:43:29 UTC 2023
|
||||
| VM_VMALERTDEFAULT_CONFIGRELOADERMEMORY | 25Mi | false | - |
|
||||
| VM_VMALERTDEFAULT_CONFIGRELOADIMAGE | jimmidyson/configmap-reload:v0.3.0 | false | - |
|
||||
| VM_VMAGENTDEFAULT_IMAGE | victoriametrics/vmagent | false | - |
|
||||
| VM_VMAGENTDEFAULT_VERSION | v1.89.1 | false | - |
|
||||
| VM_VMAGENTDEFAULT_VERSION | v1.91.3 | false | - |
|
||||
| VM_VMAGENTDEFAULT_CONFIGRELOADIMAGE | quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0 | false | - |
|
||||
| VM_VMAGENTDEFAULT_PORT | 8429 | false | - |
|
||||
| VM_VMAGENTDEFAULT_USEDEFAULTRESOURCES | true | false | - |
|
||||
@@ -42,7 +42,7 @@ updated at Mon May 8 06:43:29 UTC 2023
|
||||
| VM_VMAGENTDEFAULT_CONFIGRELOADERCPU | 100m | false | - |
|
||||
| VM_VMAGENTDEFAULT_CONFIGRELOADERMEMORY | 25Mi | false | - |
|
||||
| VM_VMSINGLEDEFAULT_IMAGE | victoriametrics/victoria-metrics | false | - |
|
||||
| VM_VMSINGLEDEFAULT_VERSION | v1.89.1 | false | - |
|
||||
| VM_VMSINGLEDEFAULT_VERSION | v1.91.3 | false | - |
|
||||
| VM_VMSINGLEDEFAULT_PORT | 8429 | false | - |
|
||||
| VM_VMSINGLEDEFAULT_USEDEFAULTRESOURCES | true | false | - |
|
||||
| VM_VMSINGLEDEFAULT_RESOURCE_LIMIT_MEM | 1500Mi | false | - |
|
||||
@@ -53,14 +53,14 @@ updated at Mon May 8 06:43:29 UTC 2023
|
||||
| VM_VMSINGLEDEFAULT_CONFIGRELOADERMEMORY | 25Mi | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_USEDEFAULTRESOURCES | true | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_IMAGE | victoriametrics/vmselect | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_VERSION | v1.89.1-cluster | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_VERSION | v1.91.3-cluster | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_PORT | 8481 | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_RESOURCE_LIMIT_MEM | 1000Mi | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_RESOURCE_LIMIT_CPU | 500m | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_RESOURCE_REQUEST_MEM | 500Mi | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_RESOURCE_REQUEST_CPU | 100m | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_IMAGE | victoriametrics/vmstorage | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_VERSION | v1.89.1-cluster | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_VERSION | v1.91.3-cluster | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_VMINSERTPORT | 8400 | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_VMSELECTPORT | 8401 | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_PORT | 8482 | false | - |
|
||||
@@ -69,7 +69,7 @@ updated at Mon May 8 06:43:29 UTC 2023
|
||||
| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_RESOURCE_REQUEST_MEM | 500Mi | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_RESOURCE_REQUEST_CPU | 250m | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_IMAGE | victoriametrics/vminsert | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_VERSION | v1.89.1-cluster | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_VERSION | v1.91.3-cluster | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_PORT | 8480 | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_RESOURCE_LIMIT_MEM | 500Mi | false | - |
|
||||
| VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_RESOURCE_LIMIT_CPU | 500m | false | - |
|
||||
@@ -88,7 +88,7 @@ updated at Mon May 8 06:43:29 UTC 2023
|
||||
| VM_VMALERTMANAGER_RESOURCE_REQUEST_CPU | 30m | false | - |
|
||||
| VM_DISABLESELFSERVICESCRAPECREATION | false | false | - |
|
||||
| VM_VMBACKUP_IMAGE | victoriametrics/vmbackupmanager | false | - |
|
||||
| VM_VMBACKUP_VERSION | v1.89.1-enterprise | false | - |
|
||||
| VM_VMBACKUP_VERSION | v1.91.3-enterprise | false | - |
|
||||
| VM_VMBACKUP_PORT | 8300 | false | - |
|
||||
| VM_VMBACKUP_USEDEFAULTRESOURCES | true | false | - |
|
||||
| VM_VMBACKUP_RESOURCE_LIMIT_MEM | 500Mi | false | - |
|
||||
@@ -97,7 +97,7 @@ updated at Mon May 8 06:43:29 UTC 2023
|
||||
| VM_VMBACKUP_RESOURCE_REQUEST_CPU | 150m | false | - |
|
||||
| VM_VMBACKUP_LOGLEVEL | INFO | false | - |
|
||||
| VM_VMAUTHDEFAULT_IMAGE | victoriametrics/vmauth | false | - |
|
||||
| VM_VMAUTHDEFAULT_VERSION | v1.89.1 | false | - |
|
||||
| VM_VMAUTHDEFAULT_VERSION | v1.91.3 | false | - |
|
||||
| VM_VMAUTHDEFAULT_CONFIGRELOADIMAGE | quay.io/prometheus-operator/prometheus-config-reloader:v0.48.1 | false | - |
|
||||
| VM_VMAUTHDEFAULT_PORT | 8427 | false | - |
|
||||
| VM_VMAUTHDEFAULT_USEDEFAULTRESOURCES | true | false | - |
|
||||
@@ -126,4 +126,5 @@ updated at Mon May 8 06:43:29 UTC 2023
|
||||
| VM_PODWAITREADYTIMEOUT | 80s | false | - |
|
||||
| VM_PODWAITREADYINTERVALCHECK | 5s | false | - |
|
||||
| VM_PODWAITREADYINITDELAY | 10s | false | - |
|
||||
| VM_FORCERESYNCINTERVAL | 60s | false | configures force resync interval for VMAgent, VMAlert and VMAlertmanager |
|
||||
| VM_FORCERESYNCINTERVAL | 60s | false | configures force resync interval for VMAgent, VMAlert, VMAlertmanager and VMAuth. |
|
||||
| VM_ENABLESTRICTSECURITY | true | false | EnableStrictSecurity will add default `securityContext` to pods and containers created by operator. Default PodSecurityContext includes: 1. RunAsNonRoot: true; 2. RunAsUser/RunAsGroup/FSGroup: 65534. '65534' refers to 'nobody' in all the used default images like alpine, busybox. If you're using a customized image, please make sure '65534' is a valid uid in there or specify SecurityContext. Default container SecurityContext includes: 1. AllowPrivilegeEscalation: false; 2. ReadOnlyRootFilesystem: true |
|
||||
|
||||
@@ -1445,7 +1445,8 @@ scrape_configs:
|
||||
# If honor_timestamps is set to "false", the timestamps of the metrics exposed
|
||||
# by the target will be ignored.
|
||||
#
|
||||
# By default, honor_timestamps is set to true.
|
||||
# By default, honor_timestamps is set to false.
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1656540535 for details.
|
||||
# honor_timestamps: <boolean>
|
||||
|
||||
# scheme configures the protocol scheme used for requests.
|
||||
|
||||
@@ -83,7 +83,7 @@ Sometimes [alerting queries](https://docs.victoriametrics.com/vmalert.html#alert
|
||||
disk IO and network bandwidth at metrics storage side. For example, if `http_request_duration_seconds` histogram is generated by thousands
|
||||
of application instances, then the alerting query `histogram_quantile(0.99, sum(increase(http_request_duration_seconds_bucket[5m])) without (instance)) > 0.5`
|
||||
can become slow, since it needs to scan too big number of unique [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series)
|
||||
with `http_request_duration_seconds_bucket` name. This alerting query can be sped up by pre-calculating
|
||||
with `http_request_duration_seconds_bucket` name. This alerting query can be sped up by pre-calculating
|
||||
the `sum(increase(http_request_duration_seconds_bucket[5m])) without (instance)` via [recording rule](https://docs.victoriametrics.com/vmalert.html#recording-rules).
|
||||
But this recording rule may take too much time to execute too. In this case the slow recording rule can be substituted
|
||||
with the following [stream aggregation config](#stream-aggregation-config):
|
||||
|
||||
@@ -1582,7 +1582,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.sendTimeout array
|
||||
Timeout for sending a single block of data to the corresponding -remoteWrite.url
|
||||
Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.shardByURL
|
||||
Whether to shard outgoing series across all the remote storage systems enumerated via -remoteWrite.url . By default the data is replicated across all the -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages
|
||||
|
||||
262
docs/vmalert.md
262
docs/vmalert.md
@@ -372,9 +372,9 @@ For recording rules to work `-remoteWrite.url` must be specified.
|
||||
|
||||
### Alerts state on restarts
|
||||
|
||||
`vmalert` is stateless, it holds alerts state in the process memory. Restarting of `vmalert` process
|
||||
will reset alerts state in memory. To prevent `vmalert` from losing alerts state it should be configured
|
||||
to persist the state to the remote destination via the following flags:
|
||||
`vmalert` holds alerts state in the memory. Restart of the `vmalert` process will reset the state of all active alerts
|
||||
in the memory. To prevent `vmalert` from losing the state on restarts configure it to persist the state
|
||||
to the remote database via the following flags:
|
||||
|
||||
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or vminsert (Cluster). `vmalert` will persist alerts state
|
||||
to the configured address in the form of [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series)
|
||||
@@ -389,7 +389,7 @@ to persist the state to the remote destination via the following flags:
|
||||
Both flags are required for proper state restoration. Restore process may fail if time series are missing
|
||||
in configured `-remoteRead.url`, weren't updated in the last `1h` (controlled by `-remoteRead.lookback`)
|
||||
or received state doesn't match current `vmalert` rules configuration. `vmalert` marks successfully restored rules
|
||||
with `restored` label in [web UI](#WEB).
|
||||
with `restored` label in [web UI](#web).
|
||||
|
||||
### Multitenancy
|
||||
|
||||
@@ -530,7 +530,7 @@ Alertmanagers.
|
||||
|
||||
To avoid recording rules results and alerts state duplication in VictoriaMetrics server
|
||||
don't forget to configure [deduplication](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#deduplication).
|
||||
The recommended value for `-dedup.minScrapeInterval` must be greater or equal to vmalert `evaluation_interval`.
|
||||
The recommended value for `-dedup.minScrapeInterval` must be a multiple of vmalert's `evaluation_interval`.
|
||||
If you observe inconsistent or "jumping" values in series produced by vmalert, try disabling `-datasource.queryTimeAlignment`
|
||||
command line flag. Because of alignment, two or more vmalert HA pairs will produce results with the same timestamps.
|
||||
But due to backfilling (data delivered to the datasource with some delay) values of such results may differ,
|
||||
@@ -753,249 +753,6 @@ See full description for these flags in `./vmalert -help`.
|
||||
* `limit` group's param has no effect during replay (might be changed in future);
|
||||
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
|
||||
|
||||
## Unit Testing for Rules
|
||||
|
||||
> Unit testing is available from v1.92.0.
|
||||
> Unit tests do not respect `-clusterMode` for now.
|
||||
|
||||
You can use `vmalert` to run unit tests for alerting and recording rules.
|
||||
In unit test mode vmalert performs the following actions:
|
||||
* sets up an isolated VictoriaMetrics instance;
|
||||
* simulates the periodic ingestion of time series;
|
||||
* queries the ingested data for recording and alerting rules evaluation;
|
||||
* tests whether the firing alerts or resulting recording rules match the expected results.
|
||||
|
||||
See how to run vmalert in unit test mode below:
|
||||
```
|
||||
# Run vmalert with one or multiple test files via -unittestFile cmd-line flag
|
||||
./vmalert -unittestFile=test1.yaml -unittestFile=test2.yaml
|
||||
```
|
||||
|
||||
vmalert is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
|
||||
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert
|
||||
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
|
||||
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
|
||||
|
||||
### Test file format
|
||||
|
||||
The configuration format for files specified in `-unittestFile` cmd-line flag is the following:
|
||||
```
|
||||
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
|
||||
# Enterprise version of vmalert supports S3 and GCS paths to rules.
|
||||
rule_files:
|
||||
[ - <string> ]
|
||||
|
||||
# The evaluation interval for rules specified in `rule_files`
|
||||
[ evaluation_interval: <duration> | default = 1m ]
|
||||
|
||||
# Groups listed below will be evaluated by order.
|
||||
# Not all the groups need to be mentioned; groups that are not listed will be evaluated in the order they are defined in rule_files.
|
||||
group_eval_order:
|
||||
[ - <string> ]
|
||||
|
||||
# The list of unit test files to be checked during evaluation.
|
||||
tests:
|
||||
[ - <test_group> ]
|
||||
```
|
||||
|
||||
#### `<test_group>`
|
||||
|
||||
```
|
||||
# Interval between samples for input series
|
||||
interval: <duration>
|
||||
# Time series to persist into the database according to configured <interval> before running tests.
|
||||
input_series:
|
||||
[ - <series> ]
|
||||
|
||||
# Name of the test group, optional
|
||||
[ name: <string> ]
|
||||
|
||||
# Unit tests for alerting rules
|
||||
alert_rule_test:
|
||||
[ - <alert_test_case> ]
|
||||
|
||||
# Unit tests for Metricsql expressions.
|
||||
metricsql_expr_test:
|
||||
[ - <metricsql_expr_test> ]
|
||||
|
||||
# External labels accessible for templating.
|
||||
external_labels:
|
||||
[ <labelname>: <string> ... ]
|
||||
|
||||
```
|
||||
|
||||
#### `<series>`
|
||||
|
||||
```
|
||||
# series in the following format '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
series: <string>
|
||||
|
||||
# values support several special equations:
|
||||
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
|
||||
# Read this as series starts at a, then c further samples incrementing by b.
|
||||
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
|
||||
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
|
||||
# '_' represents a missing sample from scrape
|
||||
# 'stale' indicates a stale sample
|
||||
# Examples:
|
||||
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
|
||||
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
|
||||
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
|
||||
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
|
||||
values: <string>
|
||||
```
|
||||
|
||||
#### `<alert_test_case>`
|
||||
|
||||
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
|
||||
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
|
||||
but no need to add them under `exp_alerts`.
|
||||
You can also pass `--disableAlertgroupLabel` to prevent vmalert from adding `alertgroup` label.
|
||||
|
||||
```
|
||||
# The time elapsed from time=0s when this alerting rule should be checked.
|
||||
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
|
||||
eval_time: <duration>
|
||||
|
||||
# Name of the group to be tested.
|
||||
groupname: <string>
|
||||
|
||||
# Name of the alert to be tested.
|
||||
alertname: <string>
|
||||
|
||||
# List of the expected alerts that are firing under the given alertname at
|
||||
# the given evaluation time. If you want to test if an alerting rule should
|
||||
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
|
||||
exp_alerts:
|
||||
[ - <alert> ]
|
||||
```
|
||||
|
||||
#### `<alert>`
|
||||
|
||||
```
|
||||
# These are the expanded labels and annotations of the expected alert.
|
||||
# Note: labels also include the labels of the sample associated with the alert
|
||||
exp_labels:
|
||||
[ <labelname>: <string> ]
|
||||
exp_annotations:
|
||||
[ <labelname>: <string> ]
|
||||
```
|
||||
|
||||
#### `<metricsql_expr_test>`
|
||||
|
||||
```
|
||||
# Expression to evaluate
|
||||
expr: <string>
|
||||
|
||||
# The time elapsed from time=0s when this expression should be evaluated.
|
||||
eval_time: <duration>
|
||||
|
||||
# Expected samples at the given evaluation time.
|
||||
exp_samples:
|
||||
[ - <sample> ]
|
||||
```
|
||||
|
||||
#### `<sample>`
|
||||
|
||||
```
|
||||
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
labels: <string>
|
||||
|
||||
# The expected value of the Metricsql expression.
|
||||
value: <number>
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
This is an example input file for unit testing which will pass.
|
||||
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
|
||||
|
||||
With `rules.yaml` in the same directory, run `./vmalert -unittestFile=./unittest/testdata/test.yaml`.
|
||||
|
||||
#### `test.yaml`
|
||||
|
||||
```
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="prometheus", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: prometheus
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
```
|
||||
|
||||
#### `alerts.yaml`
|
||||
|
||||
```
|
||||
# This is the rules file.
|
||||
|
||||
groups:
|
||||
- name: group1
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- alert: AlwaysFiring
|
||||
expr: 1
|
||||
|
||||
- name: group2
|
||||
rules:
|
||||
- record: job:test:count_over_time1m
|
||||
expr: sum without(instance) (count_over_time(test[1m]))
|
||||
- record: suquery_interval_test
|
||||
expr: count_over_time(up[5m:])
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
|
||||
@@ -1497,8 +1254,6 @@ The shortlist of configuration flags is the following:
|
||||
See https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage
|
||||
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-rule.configCheckInterval duration
|
||||
Interval for checking for changes in '-rule' files. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead
|
||||
-rule.maxResolveDuration duration
|
||||
Limits the maximum duration for automatic alert expiration, which by default is 4 times evaluationInterval of the parent group.
|
||||
-rule.resendDelay duration
|
||||
@@ -1546,11 +1301,6 @@ The shortlist of configuration flags is the following:
|
||||
Path to file with TLS key if -tls is set. The provided key file is automatically re-read every second, so it can be dynamically updated
|
||||
-tlsMinVersion string
|
||||
Optional minimum TLS version to use for incoming requests over HTTPS if -tls is set. Supported values: TLS10, TLS11, TLS12, TLS13
|
||||
-unittestFile array
|
||||
Path to the unit test files. When set, vmalert starts in unit test mode and performs only tests on configured files.
|
||||
Examples:
|
||||
-unittestFile="./unittest/testdata/test1.yaml,./unittest/testdata/test2.yaml".
|
||||
See more information here https://docs.victoriametrics.com/vmalert.html#unit-testing-for-rules.
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
@@ -1618,7 +1368,7 @@ dns_sd_configs:
|
||||
port: 9093
|
||||
```
|
||||
|
||||
The list of configured or discovered Notifiers can be explored via [UI](#Web).
|
||||
The list of configured or discovered Notifiers can be explored via [UI](#web).
|
||||
If Alertmanager runs in cluster mode then all its URLs need to be available during discovery
|
||||
to ensure [high availability](https://github.com/prometheus/alertmanager#high-availability).
|
||||
|
||||
|
||||
@@ -135,3 +135,59 @@ It is also possible to split up config into multiple files, just list them all i
|
||||
```sh
|
||||
python3 -m vmanomaly model_prophet.yaml io_csv.yaml scheduler_oneoff.yaml
|
||||
```
|
||||
|
||||
### Licensing
|
||||
|
||||
Starting from v1.5.0 vmanomaly requires a license key to run. You can obtain a trial license
|
||||
key [here](https://victoriametrics.com/products/enterprise/trial/).
|
||||
|
||||
The license key can be passed via the following command-line flags:
|
||||
```
|
||||
--license LICENSE See https://victoriametrics.com/products/enterprise/
|
||||
for trial license
|
||||
--license-file LICENSE_FILE
|
||||
See https://victoriametrics.com/products/enterprise/
|
||||
for trial license
|
||||
--license-verify-offline LICENSE_VERIFY_OFFLINE
|
||||
Force offline verification of license code
|
||||
```
|
||||
|
||||
Usage example:
|
||||
```
|
||||
python3 -m vmanomaly --license-file /path/to/license_file.yaml config.yaml
|
||||
```
|
||||
|
||||
In order to make it easier to monitor the license expiration date, the following metrics are exposed:
|
||||
```
|
||||
# HELP vm_license_expires_at When the license expires as a Unix timestamp in seconds
|
||||
# TYPE vm_license_expires_at gauge
|
||||
vm_license_expires_at 1.6963776e+09
|
||||
# HELP vm_license_expires_in_seconds Amount of seconds until the license expires
|
||||
# TYPE vm_license_expires_in_seconds gauge
|
||||
vm_license_expires_in_seconds 4.886608e+06
|
||||
```
|
||||
|
||||
You can find example alerts for [vmalert](https://docs.victoriametrics.com/vmalert.html):
|
||||
```yaml
|
||||
groups:
|
||||
- name: vm-license
|
||||
# note the `job` filter and update accordingly to your setup
|
||||
rules:
|
||||
- alert: LicenseExpiresInLessThan30Days
|
||||
expr: vm_license_expires_in_seconds < 30 * 24 * 3600
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.job }} instance {{ $labels.instance }} license expires in less than 30 days"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} license expires in {{ $value | humanizeDuration }}.
|
||||
Please make sure to update the license before it expires."
|
||||
|
||||
- alert: LicenseExpiresInLessThan7Days
|
||||
expr: vm_license_expires_in_seconds < 7 * 24 * 3600
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "{{ $labels.job }} instance {{ $labels.instance }} license expires in less than 7 days"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} license expires in {{ $value | humanizeDuration }}.
|
||||
Please make sure to update the license before it expires."
|
||||
```
|
||||
|
||||
@@ -362,6 +362,8 @@ See the docs at https://docs.victoriametrics.com/vmauth.html .
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
-failTimeout duration
|
||||
Sets a delay period for load balancing to skip a malfunctioning backend. (defaults 3s)
|
||||
-flagsAuthKey string
|
||||
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-fs.disableMmap
|
||||
|
||||
2
go.mod
2
go.mod
@@ -12,7 +12,7 @@ require (
|
||||
// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b
|
||||
github.com/VictoriaMetrics/fasthttp v1.2.0
|
||||
github.com/VictoriaMetrics/metrics v1.24.0
|
||||
github.com/VictoriaMetrics/metricsql v0.61.1
|
||||
github.com/VictoriaMetrics/metricsql v0.62.0
|
||||
github.com/aws/aws-sdk-go-v2 v1.19.0
|
||||
github.com/aws/aws-sdk-go-v2/config v1.18.29
|
||||
github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.73
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user