mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-08 11:23:53 +03:00
Compare commits
420 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8da3f773ae | ||
|
|
9d5f5b6878 | ||
|
|
9a2ba5b6d1 | ||
|
|
b277ba8121 | ||
|
|
84a37098ed | ||
|
|
56ccfa5218 | ||
|
|
7c2c8b2981 | ||
|
|
d5dddb0953 | ||
|
|
586c5be404 | ||
|
|
1cd01b5359 | ||
|
|
88538df267 | ||
|
|
63e5ee0d29 | ||
|
|
eba4e92994 | ||
|
|
82ecfa3b32 | ||
|
|
dc4e3f0e0b | ||
|
|
8f2e88234f | ||
|
|
423825695f | ||
|
|
5dc0bf6d3d | ||
|
|
7eb171182b | ||
|
|
05d754d7bb | ||
|
|
8dec17470d | ||
|
|
5e35b87c3d | ||
|
|
c85d926569 | ||
|
|
f0cef4761b | ||
|
|
774f7ca1c1 | ||
|
|
a560b4788e | ||
|
|
8141541e61 | ||
|
|
e65b4cb6b1 | ||
|
|
7209d58fbd | ||
|
|
72c90bfd8b | ||
|
|
2a39ba639d | ||
|
|
8f0bcec6cc | ||
|
|
a13cd60c6f | ||
|
|
c970cb912c | ||
|
|
b5206ce33f | ||
|
|
4c7f216dfe | ||
|
|
530f7a21e8 | ||
|
|
7532dbcdf5 | ||
|
|
7ec6711f06 | ||
|
|
e149019c00 | ||
|
|
7bf2cbad32 | ||
|
|
6ff821c70d | ||
|
|
a43be95e83 | ||
|
|
f689164711 | ||
|
|
7976ec8bb1 | ||
|
|
9f3e3a4d7a | ||
|
|
57acbf5491 | ||
|
|
5820c0ffb7 | ||
|
|
ac3700ed1e | ||
|
|
b542e50680 | ||
|
|
818abca8f1 | ||
|
|
50c0d8c17d | ||
|
|
88e1b7d144 | ||
|
|
08495360b0 | ||
|
|
a12364ad37 | ||
|
|
e91d758831 | ||
|
|
3d63a79b91 | ||
|
|
e53ee763f9 | ||
|
|
ae1cc0fc4b | ||
|
|
e426434770 | ||
|
|
3e277020a5 | ||
|
|
ffa75c423d | ||
|
|
0bba630f55 | ||
|
|
2382053d32 | ||
|
|
69a647b0d2 | ||
|
|
f5dd2a71a6 | ||
|
|
4b98e436ef | ||
|
|
4e8d6b80e0 | ||
|
|
d120197676 | ||
|
|
4cb3af1a36 | ||
|
|
0d92abfbf6 | ||
|
|
ff1a725a56 | ||
|
|
05ae1472e3 | ||
|
|
4fd3f6f991 | ||
|
|
6281549f31 | ||
|
|
9f4e86ac2f | ||
|
|
af49c5bdf6 | ||
|
|
a47a05dfd2 | ||
|
|
3d4008263f | ||
|
|
72ff05255f | ||
|
|
a99d606220 | ||
|
|
f8692a1d43 | ||
|
|
78b28a03b6 | ||
|
|
854f40acf2 | ||
|
|
6d059b28bf | ||
|
|
f26b94cfb6 | ||
|
|
937338abdf | ||
|
|
f2b04f2efe | ||
|
|
ff624c9125 | ||
|
|
13b1358c07 | ||
|
|
f7ff809c1e | ||
|
|
ab6e994bab | ||
|
|
d2f30e8d79 | ||
|
|
270552fde4 | ||
|
|
32652485e3 | ||
|
|
d988f89415 | ||
|
|
3d5f48ec74 | ||
|
|
0ec43cb8b0 | ||
|
|
213c3903c9 | ||
|
|
a7797dae09 | ||
|
|
d186472081 | ||
|
|
7e2669f733 | ||
|
|
ff6d093e1b | ||
|
|
8311193293 | ||
|
|
80609fdf35 | ||
|
|
d7291487be | ||
|
|
f5a4731412 | ||
|
|
947009f459 | ||
|
|
4cf7238b73 | ||
|
|
c602284a99 | ||
|
|
2f35cf13c6 | ||
|
|
b4103e055a | ||
|
|
dde29c3c18 | ||
|
|
b3fcd726e3 | ||
|
|
5b6a9675d8 | ||
|
|
aa647637bf | ||
|
|
84860167d0 | ||
|
|
0101a2d7ca | ||
|
|
d2cab369b2 | ||
|
|
8905bc2a40 | ||
|
|
f9847352b4 | ||
|
|
d1a9d8aa1c | ||
|
|
b93b01bc6d | ||
|
|
dbd8beccfa | ||
|
|
6b23df2bec | ||
|
|
619d4959c7 | ||
|
|
70ea4e28a7 | ||
|
|
74a2943030 | ||
|
|
b3ec0fb5e2 | ||
|
|
9a3afea123 | ||
|
|
7705c19720 | ||
|
|
80c18b7275 | ||
|
|
cf87b810b7 | ||
|
|
538fdfe133 | ||
|
|
f52769f6ee | ||
|
|
e6a782498a | ||
|
|
a441cdd1d9 | ||
|
|
d0f08b4a58 | ||
|
|
aae8064285 | ||
|
|
7e173655ba | ||
|
|
92212f04da | ||
|
|
de60ad0cd6 | ||
|
|
7a8ef517ae | ||
|
|
d61bac9fd9 | ||
|
|
eac3da478e | ||
|
|
0890c780c2 | ||
|
|
fd1a6ce9ae | ||
|
|
9b90c841c6 | ||
|
|
93c87d28f6 | ||
|
|
23c55181ef | ||
|
|
b19ca3eb5f | ||
|
|
4e850cd6a7 | ||
|
|
8cb35974af | ||
|
|
a0380a0a91 | ||
|
|
3a68c47de0 | ||
|
|
697b6af10f | ||
|
|
93267f143f | ||
|
|
3412d5d138 | ||
|
|
27f7cca7ff | ||
|
|
cf38c3c62f | ||
|
|
c56b66210f | ||
|
|
82ffbcb9a6 | ||
|
|
82ccdfaa91 | ||
|
|
ab8f5545bc | ||
|
|
0eacea1de1 | ||
|
|
737d641920 | ||
|
|
4fc33163c4 | ||
|
|
f9f3afb6af | ||
|
|
8b32e7c3a0 | ||
|
|
1573ececb2 | ||
|
|
a249cd9d22 | ||
|
|
ef0e37cb9e | ||
|
|
0afd48d2ee | ||
|
|
42866fa754 | ||
|
|
827a3a7866 | ||
|
|
606585f7be | ||
|
|
21598ac417 | ||
|
|
894e5d2b9b | ||
|
|
415b1ddfb5 | ||
|
|
db7dd96346 | ||
|
|
ba48438b06 | ||
|
|
7882a0dbbf | ||
|
|
4fe67504f9 | ||
|
|
96e001d254 | ||
|
|
faf92a0965 | ||
|
|
a6f16dcc11 | ||
|
|
cc311e20fe | ||
|
|
574289c3fb | ||
|
|
0a134ace63 | ||
|
|
8300cc17af | ||
|
|
6273385618 | ||
|
|
3232605524 | ||
|
|
18834a9191 | ||
|
|
d90dc1fbf9 | ||
|
|
dbd0c552d5 | ||
|
|
cc00a2c453 | ||
|
|
ce2107bc52 | ||
|
|
12a1a71cc1 | ||
|
|
de113806bb | ||
|
|
8c8ff5d0cb | ||
|
|
9e8733ff65 | ||
|
|
5bc5d6a1f2 | ||
|
|
baedb25936 | ||
|
|
6909d845b0 | ||
|
|
efc8e3c523 | ||
|
|
51291015a5 | ||
|
|
099e44005b | ||
|
|
787fcfba0c | ||
|
|
6afb25fd08 | ||
|
|
653d51694a | ||
|
|
91a49eecea | ||
|
|
c4c447507d | ||
|
|
8a00807f60 | ||
|
|
b69eb7bf38 | ||
|
|
68928bf3df | ||
|
|
e8936c9cb3 | ||
|
|
3f52a97f9b | ||
|
|
364789c24c | ||
|
|
08320cfcf4 | ||
|
|
f65930b34d | ||
|
|
266327642b | ||
|
|
0c7cddfca6 | ||
|
|
e767aedd17 | ||
|
|
b5a780930d | ||
|
|
7b5ef63384 | ||
|
|
1aea001532 | ||
|
|
4fa817be10 | ||
|
|
8c77faec96 | ||
|
|
0ba1b5c71b | ||
|
|
40c3ffb359 | ||
|
|
83f0e35b7b | ||
|
|
218e566647 | ||
|
|
6310b20e72 | ||
|
|
d17381037e | ||
|
|
6c68b8aa81 | ||
|
|
23010e6321 | ||
|
|
66b0ae79a5 | ||
|
|
69004a5f67 | ||
|
|
658a8742ac | ||
|
|
e4aac6ea40 | ||
|
|
d9f1b4d6a3 | ||
|
|
3b234d82e5 | ||
|
|
d2af2c8c3e | ||
|
|
ee810e5f3a | ||
|
|
34743974d5 | ||
|
|
09f5d0056f | ||
|
|
432187ac3b | ||
|
|
825a2dd554 | ||
|
|
67511d4165 | ||
|
|
01c17092e1 | ||
|
|
7d36616b93 | ||
|
|
d0ebbb166e | ||
|
|
8b2f54d7cd | ||
|
|
5ec036439d | ||
|
|
43c39dc36c | ||
|
|
cc1878607a | ||
|
|
d8cd69895c | ||
|
|
4487b454a8 | ||
|
|
e3cc329d85 | ||
|
|
57407cca83 | ||
|
|
4470308d5b | ||
|
|
4e4f57b121 | ||
|
|
17d96e4503 | ||
|
|
83aca79137 | ||
|
|
1397612117 | ||
|
|
20b71acf19 | ||
|
|
521df0e2fc | ||
|
|
2b16c188e8 | ||
|
|
3bfa41a95c | ||
|
|
90670cb55e | ||
|
|
303905cd84 | ||
|
|
36fa3078c2 | ||
|
|
95942f1ac6 | ||
|
|
b768bc9a6a | ||
|
|
de59703a16 | ||
|
|
b4afe562c1 | ||
|
|
0224071ebe | ||
|
|
fcf57f9883 | ||
|
|
6954d0edb7 | ||
|
|
fb967ae6c8 | ||
|
|
2c18548e08 | ||
|
|
5f61d43db9 | ||
|
|
eeadfccdc5 | ||
|
|
d7c1ff8b0c | ||
|
|
1f3fd93b58 | ||
|
|
66af7e40f3 | ||
|
|
491b31b369 | ||
|
|
4b84c592e9 | ||
|
|
a596aec82c | ||
|
|
7b8008e0bd | ||
|
|
6d3567d65c | ||
|
|
9ef5935552 | ||
|
|
b80e6b4d56 | ||
|
|
5f9c23226a | ||
|
|
ac43075cc9 | ||
|
|
3157fb0186 | ||
|
|
e48822942d | ||
|
|
77bea69fab | ||
|
|
24461153bf | ||
|
|
00e897119f | ||
|
|
a9a7a7175e | ||
|
|
a9b83bf512 | ||
|
|
a87ca3bdf0 | ||
|
|
1c5d14a2eb | ||
|
|
a714568374 | ||
|
|
364db13c9c | ||
|
|
01e33be34a | ||
|
|
78ff5f2aa5 | ||
|
|
2dc5593b75 | ||
|
|
9ebc937685 | ||
|
|
fe57d46687 | ||
|
|
6cc6ec6d2e | ||
|
|
5454b518a6 | ||
|
|
5ecb50d7c2 | ||
|
|
851946af1e | ||
|
|
2de76bca96 | ||
|
|
94ad531bfe | ||
|
|
936fb0eac3 | ||
|
|
43375df923 | ||
|
|
43bbffebb3 | ||
|
|
79fb595732 | ||
|
|
546d26523c | ||
|
|
f41e6a7bd9 | ||
|
|
830538e290 | ||
|
|
5d1537a395 | ||
|
|
600490131f | ||
|
|
bd4c6d21dd | ||
|
|
95da8d410c | ||
|
|
bcec5c5429 | ||
|
|
467279acd2 | ||
|
|
e0d213f82b | ||
|
|
2fd2dec5eb | ||
|
|
071fdf5518 | ||
|
|
30b401ebbf | ||
|
|
a59a7bcc5e | ||
|
|
ccb887c0f6 | ||
|
|
6f7f64f757 | ||
|
|
426a0567c4 | ||
|
|
6e2f6574b8 | ||
|
|
c1de3f67b4 | ||
|
|
8a25c1ed71 | ||
|
|
067c7afebc | ||
|
|
ac35635b71 | ||
|
|
78863d7066 | ||
|
|
c64f003cfb | ||
|
|
4718a5d951 | ||
|
|
257521a634 | ||
|
|
6a75c95194 | ||
|
|
01d7d799dc | ||
|
|
0b76c27fa1 | ||
|
|
2e4e202c2b | ||
|
|
2814b1490f | ||
|
|
90b4a6dd12 | ||
|
|
2eed6c393f | ||
|
|
948f8b6b5f | ||
|
|
8fca5f2819 | ||
|
|
7c9405f53d | ||
|
|
9f8cc8ae1b | ||
|
|
90de3086b3 | ||
|
|
830d5fb1e0 | ||
|
|
66d8086a5e | ||
|
|
a30c98c0bc | ||
|
|
4de6c6bbf0 | ||
|
|
ded0c0d3c7 | ||
|
|
7d73623c69 | ||
|
|
e62afc7366 | ||
|
|
0681b4c27a | ||
|
|
f86947d55c | ||
|
|
f94a090020 | ||
|
|
8064775c02 | ||
|
|
520a704606 | ||
|
|
105f0c78d9 | ||
|
|
b099d84271 | ||
|
|
407bdbf2b9 | ||
|
|
69962a7001 | ||
|
|
9f03548e55 | ||
|
|
022310f35b | ||
|
|
895cadfae7 | ||
|
|
57704aa584 | ||
|
|
f9b24d4899 | ||
|
|
fa0554b771 | ||
|
|
35b133bff4 | ||
|
|
a884803377 | ||
|
|
b38d048dd9 | ||
|
|
de2cd4231b | ||
|
|
298eb0a0f8 | ||
|
|
12fe915b48 | ||
|
|
cdf0a4cf8f | ||
|
|
1c9c57db1c | ||
|
|
8edc72201d | ||
|
|
b024ecd10c | ||
|
|
e0d0348f36 | ||
|
|
3e55c7e069 | ||
|
|
c4acd20d2a | ||
|
|
8661dc4624 | ||
|
|
16572c8722 | ||
|
|
b699c46046 | ||
|
|
e71519b8b2 | ||
|
|
972713bd79 | ||
|
|
5d99ca6cfc | ||
|
|
318326c309 | ||
|
|
a1e4c6a2be | ||
|
|
ac3ee44fa7 | ||
|
|
b98ca56d94 | ||
|
|
b41ee5f27d | ||
|
|
8d35af6fdb | ||
|
|
0f2dd77a76 | ||
|
|
0c485f14d1 | ||
|
|
2ebf7d86ff | ||
|
|
bf6c24d0f4 | ||
|
|
1f7292675a | ||
|
|
bd156cd088 | ||
|
|
b695087119 | ||
|
|
80f53e5396 | ||
|
|
7acb797595 | ||
|
|
3a8bbfd6b9 | ||
|
|
27373807c1 | ||
|
|
8d7f0aa632 | ||
|
|
149f365f74 | ||
|
|
b22da547a2 |
8
.github/workflows/github-pages.yml
vendored
8
.github/workflows/github-pages.yml
vendored
@@ -2,7 +2,7 @@ name: github-pages
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'docs/*.md'
|
||||
- 'docs/*'
|
||||
- 'README.md'
|
||||
branches:
|
||||
- master
|
||||
@@ -17,14 +17,14 @@ jobs:
|
||||
TOKEN: ${{secrets.CI_TOKEN}}
|
||||
run: |
|
||||
git clone https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.github.io.git gpages
|
||||
cp docs/*.md gpages
|
||||
cp docs/* gpages
|
||||
cp README.md gpages
|
||||
cd gpages
|
||||
git config --local user.email "info@victoriametrics.com"
|
||||
git config --local user.name "Vika"
|
||||
git add "*.md"
|
||||
git add .
|
||||
git commit -m "update github pages"
|
||||
remote_repo="https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.github.io.git"
|
||||
git push "${remote_repo}"
|
||||
cd ..
|
||||
rm -rf gpages
|
||||
rm -rf gpages
|
||||
|
||||
13
.github/workflows/main.yml
vendored
13
.github/workflows/main.yml
vendored
@@ -14,18 +14,19 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v1
|
||||
uses: actions/setup-go@master
|
||||
with:
|
||||
go-version: 1.13
|
||||
go-version: 1.14
|
||||
id: go
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@v1
|
||||
- name: Dependencies
|
||||
env:
|
||||
GO111MODULE: off
|
||||
GO111MODULE: on
|
||||
run: |
|
||||
go get -v golang.org/x/lint/golint
|
||||
go get -u golang.org/x/lint/golint
|
||||
go get -u github.com/kisielk/errcheck
|
||||
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.27.0
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@master
|
||||
- name: Build
|
||||
env:
|
||||
GO111MODULE: on
|
||||
|
||||
9
.github/workflows/wiki.yml
vendored
9
.github/workflows/wiki.yml
vendored
@@ -2,7 +2,7 @@ name: wiki
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'docs/*.md'
|
||||
- 'docs/*'
|
||||
branches:
|
||||
- master
|
||||
jobs:
|
||||
@@ -15,15 +15,14 @@ jobs:
|
||||
env:
|
||||
TOKEN: ${{secrets.CI_TOKEN}}
|
||||
run: |
|
||||
cd docs
|
||||
git clone https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.wiki.git wiki
|
||||
find ./ -name '*.md' -exec cp -prv '{}' 'wiki' ';'
|
||||
cp docs/* wiki
|
||||
cd wiki
|
||||
git config --local user.email "info@victoriametrics.com"
|
||||
git config --local user.name "Vika"
|
||||
git add "*.md"
|
||||
git add .
|
||||
git commit -m "update wiki pages"
|
||||
remote_repo="https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.wiki.git"
|
||||
git push "${remote_repo}"
|
||||
cd ..
|
||||
rm -rf wiki
|
||||
rm -rf wiki
|
||||
|
||||
25
Makefile
25
Makefile
@@ -13,6 +13,8 @@ GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(shell date
|
||||
all: \
|
||||
victoria-metrics-prod \
|
||||
vmagent-prod \
|
||||
vmalert-prod \
|
||||
vmauth-prod \
|
||||
vmbackup-prod \
|
||||
vmrestore-prod
|
||||
|
||||
@@ -25,17 +27,23 @@ clean:
|
||||
publish: \
|
||||
publish-victoria-metrics \
|
||||
publish-vmagent \
|
||||
publish-vmalert \
|
||||
publish-vmauth \
|
||||
publish-vmbackup \
|
||||
publish-vmrestore
|
||||
|
||||
package: \
|
||||
package-victoria-metrics \
|
||||
package-vmagent \
|
||||
package-vmalert \
|
||||
package-vmauth \
|
||||
package-vmbackup \
|
||||
package-vmrestore
|
||||
|
||||
vmutils: \
|
||||
vmagent \
|
||||
vmalert \
|
||||
vmauth \
|
||||
vmbackup \
|
||||
vmrestore
|
||||
|
||||
@@ -49,9 +57,11 @@ release-victoria-metrics: victoria-metrics-prod
|
||||
|
||||
release-vmutils: \
|
||||
vmagent-prod \
|
||||
vmalert-prod \
|
||||
vmauth-prod \
|
||||
vmbackup-prod \
|
||||
vmrestore-prod
|
||||
cd bin && tar czf vmutils-$(PKG_TAG).tar.gz vmagent-prod vmbackup-prod vmrestore-prod && \
|
||||
cd bin && tar czf vmutils-$(PKG_TAG).tar.gz vmagent-prod vmalert-prod vmauth-prod vmbackup-prod vmrestore-prod && \
|
||||
sha256sum vmutils-$(PKG_TAG).tar.gz > vmutils-$(PKG_TAG)_checksums.txt
|
||||
|
||||
pprof-cpu:
|
||||
@@ -78,9 +88,10 @@ errcheck: install-errcheck
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmselect/...
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmstorage/...
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmagent/...
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmalert/...
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmauth/...
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmbackup/...
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmrestore/...
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmalert/...
|
||||
|
||||
install-errcheck:
|
||||
which errcheck || GO111MODULE=off go get -u github.com/kisielk/errcheck
|
||||
@@ -130,7 +141,15 @@ install-qtc:
|
||||
|
||||
|
||||
golangci-lint: install-golangci-lint
|
||||
golangci-lint run --exclude '(SA4003|SA1019):' -D errcheck -D structcheck
|
||||
golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m
|
||||
|
||||
install-golangci-lint:
|
||||
which golangci-lint || GO111MODULE=off go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
|
||||
|
||||
docs-sync:
|
||||
cp app/vmagent/README.md docs/vmagent.md
|
||||
cp app/vmalert/README.md docs/vmalert.md
|
||||
cp app/vmauth/README.md docs/vmauth.md
|
||||
cp app/vmbackup/README.md docs/vmbackup.md
|
||||
cp app/vmrestore/README.md docs/vmrestore.md
|
||||
cp README.md docs/Single-server-VictoriaMetrics.md
|
||||
|
||||
231
README.md
231
README.md
@@ -10,26 +10,40 @@
|
||||
|
||||
## VictoriaMetrics
|
||||
|
||||
VictoriaMetrics is fast, cost-effective and scalable time-series database. It can be used as long-term remote storage for Prometheus.
|
||||
VictoriaMetrics is fast, cost-effective and scalable time-series database.
|
||||
|
||||
It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and
|
||||
in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics). Just download VictoriaMetrics and see [how to start it](#how-to-start-victoriametrics).
|
||||
|
||||
Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
See our [Wiki](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki) for additional documentation.
|
||||
|
||||
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
|
||||
See [features available for enterprise customers](https://github.com/VictoriaMetrics/VictoriaMetrics/issues?q=is%3Aissue+label%3Aenterprise).
|
||||
|
||||
|
||||
## Case studies and talks
|
||||
|
||||
* [Adidas](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#adidas)
|
||||
* [CERN](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#cern)
|
||||
* [COLOPL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#colopl)
|
||||
* [Zerodha](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#zerodha)
|
||||
* [Wix.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wixcom)
|
||||
* [Wedos.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wedoscom)
|
||||
* [Synthesio](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#synthesio)
|
||||
* [MHI Vestas Offshore Wind](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#mhi-vestas-offshore-wind)
|
||||
* [Dreamteam](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#dreamteam)
|
||||
* [Brandwatch](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#brandwatch)
|
||||
* [Adsterra](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#adsterra)
|
||||
* [ARNES](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#arnes)
|
||||
|
||||
|
||||
## Prominent features
|
||||
|
||||
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md).
|
||||
See [these docs](#prometheus-setup) for details.
|
||||
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
|
||||
VictoriaMetrics implements [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL) query language, which is inspired by PromQL.
|
||||
* Supports global query view. Multiple Prometheus instances may write data into VictoriaMetrics. Later this data may be used in a single query.
|
||||
@@ -112,13 +126,15 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
|
||||
* [Monitoring](#monitoring)
|
||||
* [Troubleshooting](#troubleshooting)
|
||||
* [Backfilling](#backfilling)
|
||||
* [Replication](#replication)
|
||||
* [Backups](#backups)
|
||||
* [Profiling](#profiling)
|
||||
* [Integrations](#integrations)
|
||||
* [Roadmap](#roadmap)
|
||||
* [Third-party contributions](#third-party-contributions)
|
||||
* [Contacts](#contacts)
|
||||
* [Community and contributions](#community-and-contributions)
|
||||
* [Third-party contributions](#third-party-contributions)
|
||||
* [Reporting bugs](#reporting-bugs)
|
||||
* [Roadmap](#roadmap)
|
||||
* [Victoria Metrics Logo](#victoria-metrics-logo)
|
||||
* [Logo Usage Guidelines](#logo-usage-guidelines)
|
||||
* [Font used](#font-used)
|
||||
@@ -134,7 +150,9 @@ The following command-line flags are used the most:
|
||||
|
||||
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in current working directory.
|
||||
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. Default period is 1 month.
|
||||
* `-httpListenAddr` - TCP address to listen to for http requests. By default, it listens port `8428` on all the network interfaces.
|
||||
|
||||
Other flags have good enough default values, so set them only if you really need this.
|
||||
VictoriaMetrics accepts [Prometheus querying API requests](#prometheus-querying-api-usage) on port `8428` by default.
|
||||
|
||||
Pass `-help` to see all the available flags with description and default values.
|
||||
|
||||
@@ -147,6 +165,7 @@ Each flag values can be set thru environment variables by following these rules:
|
||||
* The `-envflag.enable` flag must be set
|
||||
* Each `.` in flag names must be substituted by `_` (for example `-insert.maxQueueDuration <duration>` will translate to `insert_maxQueueDuration=<duration>`)
|
||||
* For repeating flags, an alternative syntax can be used by joining the different values into one using `,` as separator (for example `-storageNode <nodeA> -storageNode <nodeB>` will translate to `storageNode=<nodeA>,<nodeB>`)
|
||||
* It is possible setting prefix for environment vars with `-envflag.prefix`. For instance, if `-envflag.prefix=VM_`, then env vars must be prepended with `VM_`
|
||||
|
||||
### Prometheus setup
|
||||
|
||||
@@ -204,6 +223,10 @@ Read more about tuning remote write for Prometheus [here](https://prometheus.io/
|
||||
It is recommended upgrading Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases) or newer,
|
||||
since the previous versions may have issues with `remote_write`.
|
||||
|
||||
Take a look also at [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md),
|
||||
which can be used as faster and less resource-hungry alternative to Prometheus in certain cases.
|
||||
|
||||
|
||||
### Grafana setup
|
||||
|
||||
Create [Prometheus datasource](http://docs.grafana.org/features/datasources/prometheus/) in Grafana with the following Url:
|
||||
@@ -251,6 +274,11 @@ Currently the following [scrape_config](https://prometheus.io/docs/prometheus/la
|
||||
|
||||
* [static_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config)
|
||||
* [file_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config)
|
||||
* [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config)
|
||||
* [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config)
|
||||
* [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config)
|
||||
* [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config)
|
||||
* [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config)
|
||||
|
||||
In the future other `*_sd_config` types will be supported.
|
||||
|
||||
@@ -268,7 +296,8 @@ For instance, put the following lines into `Telegraf` config, so it sends data t
|
||||
|
||||
Do not forget substituting `<victoriametrics-addr>` with the real address where VictoriaMetrics runs.
|
||||
|
||||
Another option is to enable TCP and UDP receiver for Influx line protocol via `-influxListenAddr` command-line flag.
|
||||
Another option is to enable TCP and UDP receiver for Influx line protocol via `-influxListenAddr` command-line flag
|
||||
and stream plain Influx line protocol data to the configured TCP and/or UDP addresses.
|
||||
|
||||
VictoriaMetrics maps Influx data using the following rules:
|
||||
|
||||
@@ -300,7 +329,7 @@ to local VictoriaMetrics using `curl`:
|
||||
curl -d 'measurement,tag1=value1,tag2=value2 field1=123,field2=1.23' -X POST 'http://localhost:8428/write'
|
||||
```
|
||||
|
||||
An arbitrary number of lines delimited by '\n' may be sent in a single request.
|
||||
An arbitrary number of lines delimited by '\n' (aka newline char) may be sent in a single request.
|
||||
After that the data may be read via [/api/v1/export](#how-to-export-time-series) endpoint:
|
||||
|
||||
```bash
|
||||
@@ -336,7 +365,7 @@ echo "foo.bar.baz;tag1=value1;tag2=value2 123 `date +%s`" | nc -N localhost 2003
|
||||
```
|
||||
|
||||
VictoriaMetrics sets the current time if the timestamp is omitted.
|
||||
An arbitrary number of lines delimited by `\n` may be sent in one go.
|
||||
An arbitrary number of lines delimited by `\n` (aka newline char) may be sent in one go.
|
||||
After that the data may be read via [/api/v1/export](#how-to-export-time-series) endpoint:
|
||||
|
||||
```bash
|
||||
@@ -377,7 +406,7 @@ Example for writing data with OpenTSDB protocol to local VictoriaMetrics using `
|
||||
echo "put foo.bar.baz `date +%s` 123 tag1=value1 tag2=value2" | nc -N localhost 4242
|
||||
```
|
||||
|
||||
An arbitrary number of lines delimited by `\n` may be sent in one go.
|
||||
An arbitrary number of lines delimited by `\n` (aka newline char) may be sent in one go.
|
||||
After that the data may be read via [/api/v1/export](#how-to-export-time-series) endpoint:
|
||||
|
||||
```bash
|
||||
@@ -439,10 +468,10 @@ The `format` query arg must contain comma-separated list of parsing rules for CS
|
||||
|
||||
* `<column_pos>` is the position of the CSV column (field). Column numbering starts from 1. The order of parsing rules may be arbitrary.
|
||||
* `<type>` describes the column type. Supported types are:
|
||||
* `metric` - the corresponding CSV column at `<column_pos>` contains metric value. The metric name is read from the `<context>`.
|
||||
CSV line must have at least a single metric field.
|
||||
* `metric` - the corresponding CSV column at `<column_pos>` contains metric value, which must be integer or floating-point number.
|
||||
The metric name is read from the `<context>`. CSV line must have at least a single metric field. Multiple metric fields per CSV line is OK.
|
||||
* `label` - the corresponding CSV column at `<column_pos>` contains label value. The label name is read from the `<context>`.
|
||||
CSV line may have arbitrary number of label fields. All these fields are attached to all the configured metrics.
|
||||
CSV line may have arbitrary number of label fields. All these labels are attached to all the configured metrics.
|
||||
* `time` - the corresponding CSV column at `<column_pos>` contains metric time. CSV line may contain either one or zero columns with time.
|
||||
If CSV line has no time, then the current time is used. The time is applied to all the configured metrics.
|
||||
The format of the time is configured via `<context>`. Supported time formats are:
|
||||
@@ -452,7 +481,7 @@ The `format` query arg must contain comma-separated list of parsing rules for CS
|
||||
* `rfc3339` - timestamp in [RFC3339](https://tools.ietf.org/html/rfc3339) format, i.e. `2006-01-02T15:04:05Z`.
|
||||
* `custom:<layout>` - custom layout for the timestamp. The `<layout>` may contain arbitrary time layout according to [time.Parse rules in Go](https://golang.org/pkg/time/#Parse).
|
||||
|
||||
Each request to `/api/v1/import/csv` can contain arbitrary number of CSV lines.
|
||||
Each request to `/api/v1/import/csv` may contain arbitrary number of CSV lines.
|
||||
|
||||
Example for importing CSV data via `/api/v1/import/csv`:
|
||||
|
||||
@@ -475,6 +504,8 @@ The following response should be returned:
|
||||
{"metric":{"__name__":"ask","market":"NYSE","ticker":"GOOG"},"values":[1.23],"timestamps":[1583865146495]}
|
||||
```
|
||||
|
||||
Note that it could be required to flush response cache after importing historical data. See [these docs](#backfilling) for detail.
|
||||
|
||||
|
||||
### Prometheus querying API usage
|
||||
|
||||
@@ -485,6 +516,7 @@ VictoriaMetrics supports the following handlers from [Prometheus querying API](h
|
||||
* [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers)
|
||||
* [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names)
|
||||
* [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values)
|
||||
* [/api/v1/status/tsdb](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats)
|
||||
|
||||
These handlers can be queried from Prometheus-compatible clients such as Grafana or curl.
|
||||
|
||||
@@ -505,11 +537,11 @@ Additionally VictoriaMetrics provides the following handlers:
|
||||
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or
|
||||
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) instead of building VictoriaMetrics
|
||||
from sources. Building from sources is reasonable when developing additional features specific
|
||||
to your needs.
|
||||
to your needs or when testing bugfixes.
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make victoria-metrics` from the root folder of the repository.
|
||||
It builds `victoria-metrics` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -525,7 +557,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
#### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make victoria-metrics-arm` or `make victoria-metrics-arm64` from the root folder of the repository.
|
||||
It builds `victoria-metrics-arm` or `victoria-metrics-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
@@ -541,7 +573,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
This is an experimental mode, which may result in a lower compression ratio and slower decompression performance.
|
||||
Use it with caution!
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make victoria-metrics-pure` from the root folder of the repository.
|
||||
It builds `victoria-metrics-pure` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -551,10 +583,18 @@ Run `make package-victoria-metrics`. It builds `victoriametrics/victoria-metrics
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-victoria-metrics`.
|
||||
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image for improved debuggability.
|
||||
It is possible to build the package on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
|
||||
For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=scratch make package-victoria-metrics
|
||||
```
|
||||
|
||||
### Start with docker-compose
|
||||
|
||||
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
|
||||
helps to spin up VictoriaMetrics, Prometheus and Grafana with one command.
|
||||
helps to spin up VictoriaMetrics, [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) and Grafana with one command.
|
||||
More details may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#folder-contains-basic-images-and-tools-for-building-and-running-victoria-metrics-in-docker).
|
||||
|
||||
### Setting up service
|
||||
@@ -595,11 +635,13 @@ Steps for restoring from a snapshot:
|
||||
Send a request to `http://<victoriametrics-addr>:8428/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`,
|
||||
where `<timeseries_selector_for_delete>` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)
|
||||
for metrics to delete. After that all the time series matching the given selector are deleted. Storage space for
|
||||
the deleted time series isn't freed instantly - it is freed during subsequent merges of data files.
|
||||
the deleted time series isn't freed instantly - it is freed during subsequent [background merges of data files](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
|
||||
It is recommended verifying which metrics will be deleted with the call to `http://<victoria-metrics-addr>:8428/api/v1/series?match[]=<timeseries_selector_for_delete>`
|
||||
before actually deleting the metrics.
|
||||
|
||||
The `/api/v1/admin/tsdb/delete_series` handler may be protected with `authKey` if `-deleteAuthKey` command-line flag is set.
|
||||
|
||||
The delete API is intended mainly for the following cases:
|
||||
|
||||
* One-off deleting of accidentally written invalid (or undesired) time series.
|
||||
@@ -607,10 +649,11 @@ The delete API is intended mainly for the following cases:
|
||||
|
||||
It isn't recommended using delete API for the following cases, since it brings non-zero overhead:
|
||||
|
||||
* Regular cleanups for unneded data. Just prevent writing unneeded data into VictoriaMetrics.
|
||||
* Regular cleanups for unneeded data. Just prevent writing unneeded data into VictoriaMetrics.
|
||||
This can be done with relabeling in [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md).
|
||||
See [this article](https://www.robustperception.io/relabelling-can-discard-targets-timeseries-and-alerts) for details.
|
||||
* Reducing disk space usage by deleting unneded time series. This doesn't work as expected, since the deleted
|
||||
time series occupy disk space until the next merge operation, which can never occur.
|
||||
* Reducing disk space usage by deleting unneeded time series. This doesn't work as expected, since the deleted
|
||||
time series occupy disk space until the next merge operation, which can never occur when deleting too old data.
|
||||
|
||||
It is better using `-retentionPeriod` command-line flag for efficient pruning of old data.
|
||||
|
||||
@@ -676,6 +719,8 @@ curl -H 'Accept-Encoding: gzip' http://source-victoriametrics:8428/api/v1/export
|
||||
curl -X POST -H 'Content-Encoding: gzip' http://destination-victoriametrics:8428/api/v1/import -T exported_data.jsonl.gz
|
||||
```
|
||||
|
||||
Note that it could be required to flush response cache after importing historical data. See [these docs](#backfilling) for detail.
|
||||
|
||||
Each request to `/api/v1/import` can load up to a single vCPU core on VictoriaMetrics. Import speed can be improved by splitting the original file into smaller parts
|
||||
and importing them concurrently. Note that the original file must be split on newlines.
|
||||
|
||||
@@ -731,7 +776,13 @@ The required resources for query path:
|
||||
### High availability
|
||||
|
||||
1) Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
|
||||
2) Add addresses of these instances to `remote_write` section in Prometheus config:
|
||||
2) Pass addresses of these instances to [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) via `-remoteWrite.url` command-line flag:
|
||||
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
|
||||
```
|
||||
|
||||
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
|
||||
|
||||
```yml
|
||||
remote_write:
|
||||
@@ -750,6 +801,8 @@ remote_write:
|
||||
kill -HUP `pidof prometheus`
|
||||
```
|
||||
|
||||
It is recommended to use [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) instead of Prometheus for highly loaded setups.
|
||||
|
||||
4) Now Prometheus should write data into all the configured `remote_write` urls in parallel.
|
||||
5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
|
||||
6) Set up Prometheus datasource in Grafana that points to Promxy.
|
||||
@@ -760,6 +813,7 @@ to write data to `victoriametrics-addr-1`, while each `r2` should write data to
|
||||
Another option is to write data simultaneously from Prometheus HA pair to a pair of VictoriaMetrics instances
|
||||
with the enabled de-duplication. See [this section](#deduplication) for details.
|
||||
|
||||
|
||||
### Deduplication
|
||||
|
||||
VictoriaMetrics de-duplicates data points if `-dedup.minScrapeInterval` command-line flag
|
||||
@@ -777,6 +831,8 @@ Data is split in per-month subdirectories inside `<-storageDataPath>/data/small`
|
||||
Directories for months outside the configured retention are deleted on the first day of new month.
|
||||
In order to keep data according to `-retentionPeriod` max disk space usage is going to be `-retentionPeriod` + 1 month.
|
||||
For example if `-retentionPeriod` is set to 1, data for January is deleted on March 1st.
|
||||
It is safe to extend `-retentionPeriod` on existing data. If `-retentionPeriod` is set to lower
|
||||
value than before then data outside the configured period will be eventually deleted.
|
||||
|
||||
### Multiple retentions
|
||||
|
||||
@@ -786,6 +842,11 @@ Just start multiple VictoriaMetrics instances with distinct values for the follo
|
||||
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
|
||||
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
|
||||
|
||||
Then set up [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md) in front of VictoriaMetrics instances,
|
||||
so it could route requests from particular user to VictoriaMetrics with the desired retention.
|
||||
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
|
||||
|
||||
|
||||
### Downsampling
|
||||
|
||||
There is no downsampling support at the moment, but:
|
||||
@@ -798,6 +859,10 @@ There is no downsampling support at the moment, but:
|
||||
These properties reduce the need of downsampling. We plan to implement downsampling in the future.
|
||||
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/36) for details.
|
||||
|
||||
It is possible to (ab)use [-dedup.minScrapeInterval](#deduplication) for basic downsampling.
|
||||
For instance, if interval between the ingested data points is 15s, then `-dedup.minScrapeInterval=5m` will leave
|
||||
only a single data point out of 20 initial data points per each 5m interval.
|
||||
|
||||
### Multi-tenancy
|
||||
|
||||
Single-node VictoriaMetrics doesn't support multi-tenancy. Use [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster) instead.
|
||||
@@ -814,11 +879,14 @@ horizontally scalable long-term remote storage for really large Prometheus deplo
|
||||
|
||||
### Alerting
|
||||
|
||||
VictoriaMetrics doesn't support rule evaluation and alerting yet, so these actions can be performed at the following places:
|
||||
It is recommended using [vmalert](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md) for alerting.
|
||||
|
||||
Additionally, alerting can be set up with the following tools:
|
||||
|
||||
* With Prometheus - see [the corresponding docs](https://prometheus.io/docs/alerting/overview/).
|
||||
* With Promxy - see [the corresponding docs](https://github.com/jacksontj/promxy/blob/master/README.md#how-do-i-use-alertingrecording-rules-in-promxy).
|
||||
* With Grafana - see [the corresponding docs](https://grafana.com/docs/alerting/rules/).
|
||||
|
||||
* At Prometheus - see [the corresponding docs](https://prometheus.io/docs/alerting/overview/).
|
||||
* At Promxy - see [the corresponding docs](https://github.com/jacksontj/promxy/blob/master/README.md#how-do-i-use-alertingrecording-rules-in-promxy).
|
||||
* At Grafana - see [the corresponding docs](https://grafana.com/docs/alerting/rules/).
|
||||
|
||||
### Security
|
||||
|
||||
@@ -830,10 +898,15 @@ Consider setting the following command-line flags:
|
||||
with [HTTP Basic Authentication](https://en.wikipedia.org/wiki/Basic_access_authentication).
|
||||
* `-deleteAuthKey` for protecting `/api/v1/admin/tsdb/delete_series` endpoint. See [how to delete time series](#how-to-delete-time-series).
|
||||
* `-snapshotAuthKey` for protecting `/snapshot*` endpoints. See [how to work with snapshots](#how-to-work-with-snapshots).
|
||||
* `-search.resetCacheAuthKey` for protecting `/internal/resetRollupResultCache` endpoint. See [backfilling](#backfilling) for more details.
|
||||
|
||||
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
|
||||
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`.
|
||||
|
||||
Prefer authorizing all the incoming requests from untrusted networks with [vmauth](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md)
|
||||
or similar auth proxy.
|
||||
|
||||
|
||||
### Tuning
|
||||
|
||||
* There is no need for VictoriaMetrics tuning since it uses reasonable defaults for command-line flags,
|
||||
@@ -853,7 +926,8 @@ mkfs.ext4 ... -O 64bit,huge_file,extent -T huge
|
||||
### Monitoring
|
||||
|
||||
VictoriaMetrics exports internal metrics in Prometheus format at `/metrics` page.
|
||||
These metrics may be collected via Prometheus by adding the corresponding scrape config to it.
|
||||
These metrics may be collected by [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md)
|
||||
or Prometheus by adding the corresponding scrape config to it.
|
||||
Alternatively they can be self-scraped by setting `-selfScrapeInterval` command-line flag to duration greater than 0.
|
||||
For example, `-selfScrapeInterval=10s` would enable self-scraping of `/metrics` page with 10 seconds interval.
|
||||
|
||||
@@ -864,30 +938,40 @@ The most interesting metrics are:
|
||||
|
||||
* `vm_cache_entries{type="storage/hour_metric_ids"}` - the number of time series with new data points during the last hour
|
||||
aka active time series.
|
||||
* `rate(vm_new_timeseries_created_total[5m])` - time series churn rate.
|
||||
* `vm_rows{type="indexdb"}` - the number of rows in inverted index. High value for this number usually mean high churn rate for time series.
|
||||
* Sum of `vm_rows{type="storage/big"}` and `vm_rows{type="storage/small"}` - total number of `(timestamp, value)` data points
|
||||
in the database.
|
||||
* Sum of all the `vm_cache_size_bytes` metrics - the total size of all the caches in the database.
|
||||
* `vm_allowed_memory_bytes` - the maximum allowed size for caches in the database. It is calculated as `system_memory * <-memory.allowedPercent> / 100`,
|
||||
where `system_memory` is the amount of system memory and `-memory.allowedPercent` is the corresponding flag value.
|
||||
* `vm_rows_inserted_total` - the total number of inserted rows since VictoriaMetrics start.
|
||||
* `increase(vm_new_timeseries_created_total[1h])` - time series churn rate during the previous hour.
|
||||
* `sum(vm_rows{type=~"storage/.*"})` - total number of `(timestamp, value)` data points in the database.
|
||||
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
|
||||
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
|
||||
* `sum(vm_data_size_bytes)` - the total size of data on disk.
|
||||
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
|
||||
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
|
||||
of the current number of active time series.
|
||||
* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
|
||||
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
|
||||
of the current number of active time series.
|
||||
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
* It is recommended to use default command-line flag values (i.e. don't set them explicitly) until the need
|
||||
of tweaking these flag values arises.
|
||||
|
||||
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
since the issue could be already fixed there.
|
||||
|
||||
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
|
||||
then it is likely you have too many active time series for the current amount of RAM.
|
||||
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
|
||||
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
|
||||
ingestion performance.
|
||||
ingestion and query performance in this case.
|
||||
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
|
||||
option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
|
||||
|
||||
* VictoriaMetrics requires free disk space for [merging data files to bigger ones](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
It may slow down when there is no enough free space left. So make sure `-storageDataPath` directory
|
||||
has at least 20% of free space comparing to disk size.
|
||||
has at least 20% of free space comparing to disk size. The remaining amount of free space
|
||||
can be [monitored](#monitoring) via `vm_free_disk_space_bytes` metric. The total size of data
|
||||
stored on the disk can be monitored via sum of `vm_data_size_bytes` metrics.
|
||||
|
||||
* If VictoriaMetrics doesn't work because of certain parts are corrupted due to disk errors,
|
||||
then just remove directories with broken parts. This will recover VictoriaMetrics at the cost
|
||||
@@ -898,9 +982,26 @@ The most interesting metrics are:
|
||||
If this removes gaps on the graphs, then it is likely data with timestamps older than `-search.cacheTimestampOffset`
|
||||
is ingested into VictoriaMetrics. Make sure that data sources have synchronized time with VictoriaMetrics.
|
||||
|
||||
If the gaps are related to irregular intervals between samples, then try adjusting `-search.minStalenessInterval` command-line flag
|
||||
to value close to the maximum interval between samples.
|
||||
|
||||
* If you are switching from InfluxDB or TimescaleDB, then take a look at `-search.maxStalenessInterval` command-line flag.
|
||||
It may be needed in order to suppress default gap filling algorithm used by VictoriaMetrics - by default it assumes
|
||||
each time series is continuous instead of discrete, so it fills gaps between real samples with regular intervals.
|
||||
|
||||
* Metrics and labels leading to high cardinality or high churn rate can be determined at `/api/v1/status/tsdb` page.
|
||||
See [these docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats) for details.
|
||||
VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` args on this page. By default `date` equals to the current date,
|
||||
while `topN` equals to 10.
|
||||
|
||||
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
|
||||
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
|
||||
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
|
||||
|
||||
|
||||
### Backfilling
|
||||
|
||||
VictoriaMetrics accepts historical data in arbitrary order of time.
|
||||
VictoriaMetrics accepts historical data in arbitrary order of time via [any supported ingestion method](#how-to-import-time-series-data).
|
||||
Make sure that configured `-retentionPeriod` covers timestamps for the backfilled data.
|
||||
|
||||
It is recommended disabling query cache with `-search.disableCache` command-line flag when writing
|
||||
@@ -913,6 +1014,24 @@ the query cache, which could contain incomplete data cached during the backfilli
|
||||
Yet another solution is to increase `-search.cacheTimestampOffset` flag value in order to disable caching
|
||||
for data with timestamps close to the current time.
|
||||
|
||||
|
||||
### Replication
|
||||
|
||||
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
|
||||
See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#replication-and-data-safety) for details.
|
||||
|
||||
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
|
||||
|
||||
See also [high availability docs](#high-availability) and [backup docs](#backups).
|
||||
|
||||
|
||||
### Backups
|
||||
|
||||
VictoriaMetrics supports backups via [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md)
|
||||
and [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md) tools.
|
||||
We also provide provide `vmbackuper` tool for paid enterprise subscribers - see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for details.
|
||||
|
||||
|
||||
### Profiling
|
||||
|
||||
VictoriaMetrics provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
@@ -938,18 +1057,16 @@ The collected profiles may be analyzed with [go tool pprof](https://github.com/g
|
||||
* [netdata](https://github.com/netdata/netdata) can push data into VictoriaMetrics via `Prometheus remote_write API`.
|
||||
See [these docs](https://github.com/netdata/netdata#integrations).
|
||||
* [go-graphite/carbonapi](https://github.com/go-graphite/carbonapi) can use VictoriaMetrics as time series backend.
|
||||
See [this example](/blob/master/cmd/carbonapi/carbonapi.example.prometheus.yaml).
|
||||
* [Ansible role for installing VictoriaMetrics](https://github.com/dreamteam-gg/ansible-victoriametrics-role).
|
||||
See [this example](https://github.com/go-graphite/carbonapi/blob/master/cmd/carbonapi/carbonapi.example.prometheus.yaml).
|
||||
* [Ansible role for installing single-node VictoriaMetrics](https://github.com/dreamteam-gg/ansible-victoriametrics-role).
|
||||
* [Ansible role for installing cluster VictoriaMetrics](https://github.com/Slapper/ansible-victoriametrics-cluster-role).
|
||||
|
||||
## Roadmap
|
||||
## Third-party contributions
|
||||
|
||||
* [ ] Replication [#118](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/118)
|
||||
* [ ] Support of Object Storages (GCS, S3, Azure Storage) [#38](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/38)
|
||||
* [ ] Data downsampling [#36](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/36)
|
||||
* [ ] Alert Manager Integration [#119](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/119)
|
||||
* [ ] CLI tool for data migration, re-balancing and adding/removing nodes [#103](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/103)
|
||||
|
||||
The discussion happens [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/129). Feel free to comment on any item or add you own one.
|
||||
* [Unofficial yum repository](https://copr.fedorainfracloud.org/coprs/antonpatsev/VictoriaMetrics/) ([source code](https://github.com/patsevanton/victoriametrics-rpm))
|
||||
* [Prometheus -> VictoriaMetrics exporter #1](https://github.com/ryotarai/prometheus-tsdb-dump)
|
||||
* [Prometheus -> VictoriaMetrics exporter #2](https://github.com/AnchorFree/tsdb-remote-write)
|
||||
* [Prometheus Oauth proxy](https://gitlab.com/optima_public/prometheus_oauth_proxy) - see [this article](https://medium.com/@richard.holly/powerful-saas-solution-for-detection-metrics-c67b9208d362) for details.
|
||||
|
||||
## Contacts
|
||||
|
||||
@@ -982,17 +1099,21 @@ We are open to third-party pull requests provided they follow [KISS design princ
|
||||
|
||||
Adhering `KISS` principle simplifies the resulting code and architecture, so it can be reviewed, understood and verified by many people.
|
||||
|
||||
## Third-party contributions
|
||||
|
||||
* [Unofficial yum repository](https://copr.fedorainfracloud.org/coprs/antonpatsev/VictoriaMetrics/) ([source code](https://github.com/patsevanton/victoriametrics-rpm))
|
||||
* [Prometheus -> VictoriaMetrics exporter #1](https://github.com/ryotarai/prometheus-tsdb-dump)
|
||||
* [Prometheus -> VictoriaMetrics exporter #2](https://github.com/AnchorFree/tsdb-remote-write)
|
||||
* [Prometheus Oauth proxy](https://gitlab.com/optima_public/prometheus_oauth_proxy) - see [this article](https://medium.com/@richard.holly/powerful-saas-solution-for-detection-metrics-c67b9208d362) for details.
|
||||
|
||||
## Reporting bugs
|
||||
|
||||
Report bugs and propose new features [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues).
|
||||
|
||||
## Roadmap
|
||||
|
||||
* [ ] Replication [#118](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/118)
|
||||
* [ ] Support of Object Storages (GCS, S3, Azure Storage) [#38](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/38)
|
||||
* [ ] Data downsampling [#36](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/36)
|
||||
* [ ] Alert Manager Integration [#119](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/119)
|
||||
* [ ] CLI tool for data migration, re-balancing and adding/removing nodes [#103](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/103)
|
||||
|
||||
The discussion happens [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/129). Feel free to comment on any item or add you own one.
|
||||
|
||||
|
||||
## Victoria Metrics Logo
|
||||
|
||||
[Zip](VM_logo.zip) contains three folders with different image orientations (main color and inverted version).
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
ARG certs_image
|
||||
FROM $certs_image AS certs
|
||||
FROM scratch
|
||||
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||
ARG base_image
|
||||
FROM $base_image
|
||||
|
||||
EXPOSE 8428
|
||||
|
||||
ENTRYPOINT ["/victoria-metrics-prod"]
|
||||
ARG src_binary
|
||||
COPY $src_binary ./victoria-metrics-prod
|
||||
EXPOSE 8428
|
||||
ENTRYPOINT ["/victoria-metrics-prod"]
|
||||
|
||||
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
|
||||
@@ -25,6 +26,8 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
|
||||
@@ -14,7 +14,11 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
)
|
||||
|
||||
var selfScrapeInterval = flag.Duration("selfScrapeInterval", 0, "Interval for self-scraping own metrics at /metrics page")
|
||||
var (
|
||||
selfScrapeInterval = flag.Duration("selfScrapeInterval", 0, "Interval for self-scraping own metrics at /metrics page")
|
||||
selfScrapeInstance = flag.String("selfScrapeInstance", "self", "Value for 'instance' label, which is added to self-scraped metrics")
|
||||
selfScrapeJob = flag.String("selfScrapeJob", "victoria-metrics", "Value for 'job' label, which is added to self-scraped metrics")
|
||||
)
|
||||
|
||||
var selfScraperStopCh chan struct{}
|
||||
var selfScraperWG sync.WaitGroup
|
||||
@@ -65,8 +69,8 @@ func selfScraper(scrapeInterval time.Duration) {
|
||||
r := &rows.Rows[i]
|
||||
labels = labels[:0]
|
||||
labels = addLabel(labels, "", r.Metric)
|
||||
labels = addLabel(labels, "job", "victoria-metrics")
|
||||
labels = addLabel(labels, "instance", "self")
|
||||
labels = addLabel(labels, "job", *selfScrapeJob)
|
||||
labels = addLabel(labels, "instance", *selfScrapeInstance)
|
||||
for j := range r.Tags {
|
||||
t := &r.Tags[j]
|
||||
labels = addLabel(labels, t.Key, t.Value)
|
||||
|
||||
16
app/victoria-metrics/testdata/graphite/empty-label-match.json
vendored
Normal file
16
app/victoria-metrics/testdata/graphite/empty-label-match.json
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "empty-label-match",
|
||||
"issue": "https://github.com/VictoriaMetrics/VictoriaMetrics/issues/395",
|
||||
"data": [
|
||||
"empty_label_match 1 {TIME_S-1m}",
|
||||
"empty_label_match;foo=bar 2 {TIME_S-1m}",
|
||||
"empty_label_match;foo=baz 3 {TIME_S-1m}"],
|
||||
"query": ["/api/v1/query_range?query=empty_label_match{foo=~'bar|'}&start={TIME_S}&end={TIME_S}&step=60"],
|
||||
"result_query_range": {
|
||||
"status":"success",
|
||||
"data":{"resultType":"matrix",
|
||||
"result":[
|
||||
{"metric":{"__name__":"empty_label_match"},"values":[["{TIME_S}","1"]]},
|
||||
{"metric":{"__name__":"empty_label_match","foo":"bar"},"values":[["{TIME_S}","2"]]}
|
||||
]}}
|
||||
}
|
||||
@@ -52,8 +52,9 @@ publish-vmagent:
|
||||
APP_NAME=vmagent $(MAKE) publish-via-docker
|
||||
|
||||
run-vmagent:
|
||||
mkdir -p vmagent-data
|
||||
DOCKER_OPTS='-v $(shell pwd)/vmagent-data:/vmagent-data' \
|
||||
mkdir -p vmagent-remotewrite-data
|
||||
DOCKER_OPTS='-v $(shell pwd)/vmagent-remotewrite-data:/vmagent-remotewrite-data' \
|
||||
ARGS='-remoteWrite.url=http://localhost:8428/api/v1/write' \
|
||||
APP_NAME=vmagent \
|
||||
$(MAKE) run-via-docker
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
## vmagent
|
||||
|
||||
`vmagent` is a tiny but brave agent, which helps you collecting metrics from various sources
|
||||
and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
|
||||
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
|
||||
|
||||
<img alt="vmagent" src="vmagent.png">
|
||||
|
||||
@@ -10,7 +11,7 @@ and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/Victori
|
||||
|
||||
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
|
||||
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
|
||||
Also, we found that users’ infrastructure is like snowflakes - never alike, and we decided to add more flexibility
|
||||
Also, we found that users’ infrastructure are snowflakes - no two are alike, and we decided to add more flexibility
|
||||
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
|
||||
|
||||
|
||||
@@ -30,7 +31,7 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
|
||||
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
|
||||
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
|
||||
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth comparing to Prometheus.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
|
||||
|
||||
|
||||
### Quick Start
|
||||
@@ -39,8 +40,7 @@ Just download `vmutils-*` archive from [releases page](https://github.com/Victor
|
||||
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
|
||||
|
||||
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. Multiple `-remoteWrite.url` args can be set in parallel
|
||||
in order to replicate data concurrently to multiple remote storage systems.
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary amount of remote storage systems.
|
||||
|
||||
Example command line:
|
||||
|
||||
@@ -48,7 +48,7 @@ Example command line:
|
||||
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
```
|
||||
|
||||
If you need collecting only Influx data, then the following command line would be enough:
|
||||
If you only need to collect Influx data, then the following is sufficient:
|
||||
|
||||
```
|
||||
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
@@ -56,7 +56,7 @@ If you need collecting only Influx data, then the following command line would b
|
||||
|
||||
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
|
||||
|
||||
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/).
|
||||
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags).
|
||||
|
||||
Pass `-help` to `vmagent` in order to see the full list of supported command-line flags with their descriptions.
|
||||
|
||||
@@ -78,14 +78,14 @@ See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/Victor
|
||||
#### Drop-in replacement for Prometheus
|
||||
|
||||
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such setup.
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
|
||||
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
|
||||
|
||||
|
||||
#### Replication and high availability
|
||||
|
||||
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
|
||||
If a single remote storage instance temporarily goes out of service, then the collected data remains available in another remote storage instances.
|
||||
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instances.
|
||||
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
|
||||
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
|
||||
|
||||
@@ -93,18 +93,26 @@ Then it sends the buffered data to the remote storage in order to prevent data g
|
||||
#### Relabeling and filtering
|
||||
|
||||
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
|
||||
it can remove unneeded samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
See [these docs](#relabeling) for details.
|
||||
|
||||
|
||||
#### Splitting data streams among multiple systems
|
||||
|
||||
`vmagent` supports splitting of the collected data among muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
`vmagent` supports splitting the collected data between muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
|
||||
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
|
||||
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
|
||||
|
||||
|
||||
#### Prometheus remote_write proxy
|
||||
|
||||
`vmagent` may be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via `remote_write` API
|
||||
at `/api/v1/write` endpoint, apply relabeling and filtering and then proxy it to another `remote_write` systems.
|
||||
The `vmagent` can be configured to encrypt the incoming `remote_write` requests with `-tls*` command-line flags.
|
||||
Additionally, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
|
||||
|
||||
|
||||
|
||||
### How to collect metrics in Prometheus format
|
||||
|
||||
@@ -122,14 +130,26 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
|
||||
* `static_configs` - for scraping statically defined targets. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config) for details.
|
||||
* `file_sd_configs` - for scraping targets defined in external files aka file-based service discover.
|
||||
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details.
|
||||
* `kubernetes_sd_configs` - for scraping targets in Kubernetes (k8s).
|
||||
See [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config) for details.
|
||||
* `ec2_sd_configs` - for scraping targets in Amazon EC2.
|
||||
See [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) for details.
|
||||
`vmagent` doesn't support `role_arn` config param yet.
|
||||
* `gce_sd_configs` - for scraping targets in Google Compute Engine (GCE).
|
||||
See [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) for details.
|
||||
`vmagent` provides the following additional functionality for `gce_sd_config`:
|
||||
* if `project` arg is missing, then `vmagent` uses the project for the instance where it runs;
|
||||
* if `zone` arg is missing, then `vmagent` uses the zone for the instance where it runs;
|
||||
* if `zone` arg equals to `"*"`, then `vmagent` discovers all the zones for the given project;
|
||||
* `zone` may contain arbitrary number of zones, i.e. `zone: [us-east1-a, us-east1-b]`.
|
||||
* `consul_sd_configs` - for scraping targets registered in Consul.
|
||||
See [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config) for details.
|
||||
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
|
||||
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
|
||||
|
||||
The following service discovery mechanisms will be added to `vmagent` soon:
|
||||
|
||||
* [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config)
|
||||
* [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config)
|
||||
* [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config)
|
||||
* [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config)
|
||||
* [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config)
|
||||
Note that `vmagent` doesn't support `refresh_interval` option these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
|
||||
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
|
||||
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
|
||||
|
||||
|
||||
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
|
||||
@@ -150,12 +170,13 @@ Additionally it provides the following extra actions:
|
||||
|
||||
* `replace_all`: replaces all the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
|
||||
* `labelmap_all`: replaces all the occurences of `regex` in all the label names with the `replacement`.
|
||||
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
|
||||
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
|
||||
|
||||
The relabeling can be defined in the following places:
|
||||
|
||||
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to targets when parsing the file during `vmagent` startup
|
||||
or during config reload after sending `SIGHUP` signal to `vmagent` via `kill -HUP`.
|
||||
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to metrics after each scrape for the configured targets.
|
||||
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels.
|
||||
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to all the scraped metrics in the given `scrape_config`.
|
||||
* At `-remoteWrite.relabelConfig` file. This relabeling is aplied to all the collected metrics before sending them to remote storage.
|
||||
* At `-remoteWrite.urlRelabelConfig` files. This relabeling is applied to metrics before sending them to the corresponding `-remoteWrite.url`.
|
||||
|
||||
@@ -181,12 +202,23 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
|
||||
* It is recommended increasing the maximum number of open files in the system (`ulimit -n`) when scraping big number of targets,
|
||||
since `vmagent` establishes at least a single TCP connection per each target.
|
||||
|
||||
* It is recommended increasing `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
|
||||
and `vmagent_remotewrite_pending_data_bytes` metric exported by `vmagent` at `/metrics` page constantly grows.
|
||||
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
|
||||
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`.
|
||||
|
||||
* It is recommended to increase `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
|
||||
and `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
|
||||
|
||||
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
|
||||
The directory can grow big when remote storage is unvailable during extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
|
||||
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
|
||||
|
||||
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports.
|
||||
Just add the following relabeling rule to `relabel_configs` section in order to filter out targets with unneeded ports:
|
||||
|
||||
```yml
|
||||
- action: keep_if_equal
|
||||
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
|
||||
```
|
||||
|
||||
|
||||
### How to build from sources
|
||||
@@ -196,7 +228,7 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmagent` from the root folder of the repository.
|
||||
It builds `vmagent` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -211,3 +243,31 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker image locally.
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
|
||||
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=scratch make package-vmagent
|
||||
```
|
||||
|
||||
|
||||
### Profiling
|
||||
|
||||
`vmagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
* Memory profile. It can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmagent-host>:8429/debug/pprof/heap > mem.pprof
|
||||
```
|
||||
|
||||
* CPU profile. It can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmagent-host>:8429/debug/pprof/profile > cpu.pprof
|
||||
```
|
||||
|
||||
The command for collecting CPU profile waits for 30 seconds before returning.
|
||||
|
||||
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
ARG certs_image
|
||||
FROM $certs_image AS certs
|
||||
FROM scratch
|
||||
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||
ARG base_image
|
||||
FROM $base_image
|
||||
|
||||
EXPOSE 8429
|
||||
|
||||
ENTRYPOINT ["/vmagent-prod"]
|
||||
ARG src_binary
|
||||
COPY $src_binary ./vmagent-prod
|
||||
EXPOSE 8429
|
||||
ENTRYPOINT ["/vmagent-prod"]
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -39,6 +40,8 @@ var (
|
||||
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
|
||||
"Usually :4242 must be set. Doesn't work if empty")
|
||||
opentsdbHTTPListenAddr = flag.String("opentsdbHTTPListenAddr", "", "TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty")
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmagent. The following files are checked: "+
|
||||
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . See also -promscrape.config.dryRun")
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -49,9 +52,27 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
|
||||
if *dryRun {
|
||||
if err := flag.Set("promscrape.config.strictParse", "true"); err != nil {
|
||||
logger.Panicf("BUG: cannot set promscrape.config.strictParse=true: %s", err)
|
||||
}
|
||||
if err := remotewrite.CheckRelabelConfigs(); err != nil {
|
||||
logger.Fatalf("error when checking relabel configs: %s", err)
|
||||
}
|
||||
if err := promscrape.CheckConfig(); err != nil {
|
||||
logger.Fatalf("error when checking Prometheus config: %s", err)
|
||||
}
|
||||
logger.Infof("all the configs are ok; exitting with 0 status code")
|
||||
return
|
||||
}
|
||||
|
||||
logger.Infof("starting vmagent at %q...", *httpListenAddr)
|
||||
startTime := time.Now()
|
||||
remotewrite.Init()
|
||||
@@ -157,6 +178,11 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
promscrape.WriteHumanReadableTargetsStatus(w)
|
||||
return true
|
||||
case "/-/reload":
|
||||
promscrapeConfigReloadRequests.Inc()
|
||||
procutil.SelfSIGHUP()
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -177,4 +203,18 @@ var (
|
||||
influxQueryRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/query", protocol="influx"}`)
|
||||
|
||||
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
|
||||
)
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
@@ -2,34 +2,40 @@ package remotewrite
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"encoding/base64"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/fasthttp"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
"github.com/valyala/fasthttp"
|
||||
)
|
||||
|
||||
var (
|
||||
sendTimeout = flag.Duration("remoteWrite.sendTimeout", time.Minute, "Timeout for sending a single block of data to -remoteWrite.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
|
||||
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
|
||||
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
|
||||
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
|
||||
"By default system CA is used")
|
||||
tlsCertFile = flagutil.NewArray("remoteWrite.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
tlsKeyFile = flagutil.NewArray("remoteWrite.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
tlsCAFile = flagutil.NewArray("remoteWrite.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
|
||||
"By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
tlsServerName = flagutil.NewArray("remoteWrite.tlsServerName", "Optional TLS server name to use for connections to -remoteWrite.url. "+
|
||||
"By default the server name from -remoteWrite.url is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
|
||||
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username to use for -remoteWrite.url")
|
||||
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password to use for -remoteWrite.url")
|
||||
bearerToken = flag.String("remoteWrite.bearerToken", "", "Optional bearer auth token to use for -remoteWrite.url")
|
||||
basicAuthUsername = flagutil.NewArray("remoteWrite.basicAuth.username", "Optional basic auth username to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
basicAuthPassword = flagutil.NewArray("remoteWrite.basicAuth.password", "Optional basic auth password to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
bearerToken = flagutil.NewArray("remoteWrite.bearerToken", "Optional bearer auth token to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
)
|
||||
|
||||
type client struct {
|
||||
@@ -50,19 +56,22 @@ type client struct {
|
||||
stopCh chan struct{}
|
||||
}
|
||||
|
||||
func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQueue, concurrency int) *client {
|
||||
func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQueue, concurrency int) *client {
|
||||
authHeader := ""
|
||||
if len(*basicAuthUsername) > 0 || len(*basicAuthPassword) > 0 {
|
||||
username := basicAuthUsername.GetOptionalArg(argIdx)
|
||||
password := basicAuthPassword.GetOptionalArg(argIdx)
|
||||
if len(username) > 0 || len(password) > 0 {
|
||||
// See https://en.wikipedia.org/wiki/Basic_access_authentication
|
||||
token := *basicAuthUsername + ":" + *basicAuthPassword
|
||||
token := username + ":" + password
|
||||
token64 := base64.StdEncoding.EncodeToString([]byte(token))
|
||||
authHeader = "Basic " + token64
|
||||
}
|
||||
if len(*bearerToken) > 0 {
|
||||
token := bearerToken.GetOptionalArg(argIdx)
|
||||
if len(token) > 0 {
|
||||
if authHeader != "" {
|
||||
logger.Panicf("FATAL: `-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", *bearerToken)
|
||||
logger.Fatalf("`-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", token)
|
||||
}
|
||||
authHeader = "Bearer " + *bearerToken
|
||||
authHeader = "Bearer " + token
|
||||
}
|
||||
|
||||
readTimeout := *sendTimeout
|
||||
@@ -76,18 +85,18 @@ func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQue
|
||||
switch scheme {
|
||||
case "http", "https":
|
||||
default:
|
||||
logger.Panicf("FATAL: unsupported scheme in -remoteWrite.url=%q: %q. It must be http or https", remoteWriteURL, scheme)
|
||||
logger.Fatalf("unsupported scheme in -remoteWrite.url=%q: %q. It must be http or https", remoteWriteURL, scheme)
|
||||
}
|
||||
host := string(u.Host())
|
||||
if len(host) == 0 {
|
||||
logger.Panicf("FATAL: invalid -remoteWrite.url=%q: host cannot be empty. Make sure the url looks like `http://host:port/path`", remoteWriteURL)
|
||||
logger.Fatalf("invalid -remoteWrite.url=%q: host cannot be empty. Make sure the url looks like `http://host:port/path`", remoteWriteURL)
|
||||
}
|
||||
requestURI := string(u.RequestURI())
|
||||
isTLS := scheme == "https"
|
||||
var tlsCfg *tls.Config
|
||||
if isTLS {
|
||||
var err error
|
||||
tlsCfg, err = getTLSConfig()
|
||||
tlsCfg, err = getTLSConfig(argIdx)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
|
||||
}
|
||||
@@ -104,7 +113,6 @@ func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQue
|
||||
Addr: host,
|
||||
Name: "vmagent",
|
||||
Dial: statDial,
|
||||
DialDualStack: netutil.TCP6Enabled(),
|
||||
IsTLS: isTLS,
|
||||
TLSConfig: tlsCfg,
|
||||
MaxConns: maxConns,
|
||||
@@ -144,34 +152,19 @@ func (c *client) MustStop() {
|
||||
logger.Infof("stopped client for -remoteWrite.url=%q", c.remoteWriteURL)
|
||||
}
|
||||
|
||||
func getTLSConfig() (*tls.Config, error) {
|
||||
var tlsRootCA *x509.CertPool
|
||||
var tlsCertificate *tls.Certificate
|
||||
if *tlsCertFile != "" || *tlsKeyFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(*tlsCertFile, *tlsKeyFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load TLS certificate for -remoteWrite.tlsCertFile=%q and -remoteWrite.tlsKeyFile=%q: %s", *tlsCertFile, *tlsKeyFile, err)
|
||||
}
|
||||
tlsCertificate = &cert
|
||||
func getTLSConfig(argIdx int) (*tls.Config, error) {
|
||||
tlsConfig := &promauth.TLSConfig{
|
||||
CAFile: tlsCAFile.GetOptionalArg(argIdx),
|
||||
CertFile: tlsCertFile.GetOptionalArg(argIdx),
|
||||
KeyFile: tlsKeyFile.GetOptionalArg(argIdx),
|
||||
ServerName: tlsServerName.GetOptionalArg(argIdx),
|
||||
InsecureSkipVerify: *tlsInsecureSkipVerify,
|
||||
}
|
||||
if *tlsCAFile != "" {
|
||||
data, err := ioutil.ReadFile(*tlsCAFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read -remoteWrite.tlsCAFile=%q: %s", *tlsCAFile, err)
|
||||
}
|
||||
tlsRootCA = x509.NewCertPool()
|
||||
if !tlsRootCA.AppendCertsFromPEM(data) {
|
||||
return nil, fmt.Errorf("cannot parse data -remoteWrite.tlsCAFile=%q", *tlsCAFile)
|
||||
}
|
||||
cfg, err := promauth.NewConfig(".", nil, "", "", tlsConfig)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot populate TLS config: %w", err)
|
||||
}
|
||||
tlsCfg := &tls.Config{
|
||||
RootCAs: tlsRootCA,
|
||||
ClientSessionCache: tls.NewLRUClientSessionCache(0),
|
||||
}
|
||||
if tlsCertificate != nil {
|
||||
tlsCfg.Certificates = []tls.Certificate{*tlsCertificate}
|
||||
}
|
||||
tlsCfg.InsecureSkipVerify = *tlsInsecureSkipVerify
|
||||
tlsCfg := cfg.NewTLSConfig()
|
||||
return tlsCfg, nil
|
||||
}
|
||||
|
||||
@@ -214,6 +207,7 @@ func (c *client) sendBlock(block []byte) {
|
||||
req.Header.SetMethod("POST")
|
||||
req.Header.Add("Content-Type", "application/x-protobuf")
|
||||
req.Header.Add("Content-Encoding", "snappy")
|
||||
req.Header.Add("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
if c.authHeader != "" {
|
||||
req.Header.Set("Authorization", c.authHeader)
|
||||
}
|
||||
@@ -232,8 +226,7 @@ again:
|
||||
}
|
||||
|
||||
startTime := time.Now()
|
||||
// There is no need in calling DoTimeout, since the timeout is set in c.hc.ReadTimeout.
|
||||
err := c.hc.Do(req, resp)
|
||||
err := doRequestWithPossibleRetry(c.hc, req, resp)
|
||||
c.requestDuration.UpdateDuration(startTime)
|
||||
if err != nil {
|
||||
c.errorsCount.Inc()
|
||||
@@ -266,3 +259,16 @@ again:
|
||||
fasthttp.ReleaseResponse(resp)
|
||||
fasthttp.ReleaseRequest(req)
|
||||
}
|
||||
|
||||
func doRequestWithPossibleRetry(hc *fasthttp.HostClient, req *fasthttp.Request, resp *fasthttp.Response) error {
|
||||
// There is no need in calling DoTimeout, since the timeout must be already set in hc.ReadTimeout.
|
||||
err := hc.Do(req, resp)
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
if err != fasthttp.ErrConnectionClosed {
|
||||
return err
|
||||
}
|
||||
// Retry request if the server closed the keep-alive connection during the first attempt.
|
||||
return hc.Do(req, resp)
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -15,7 +15,8 @@ import (
|
||||
|
||||
var (
|
||||
flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
|
||||
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage")
|
||||
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage. "+
|
||||
"Minimum supported interval is 1 second")
|
||||
maxUnpackedBlockSize = flag.Int("remoteWrite.maxBlockSize", 32*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
|
||||
"It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics")
|
||||
)
|
||||
@@ -55,6 +56,10 @@ func (ps *pendingSeries) Push(tss []prompbmarshal.TimeSeries) {
|
||||
}
|
||||
|
||||
func (ps *pendingSeries) periodicFlusher() {
|
||||
flushSeconds := int64(flushInterval.Seconds())
|
||||
if flushSeconds <= 0 {
|
||||
flushSeconds = 1
|
||||
}
|
||||
ticker := time.NewTicker(*flushInterval)
|
||||
defer ticker.Stop()
|
||||
mustStop := false
|
||||
@@ -63,7 +68,7 @@ func (ps *pendingSeries) periodicFlusher() {
|
||||
case <-ps.stopCh:
|
||||
mustStop = true
|
||||
case <-ticker.C:
|
||||
if time.Since(ps.wr.lastFlushTime) < *flushInterval/2 {
|
||||
if fasttime.UnixTimestamp()-ps.wr.lastFlushTime < uint64(flushSeconds) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
@@ -76,7 +81,7 @@ func (ps *pendingSeries) periodicFlusher() {
|
||||
type writeRequest struct {
|
||||
wr prompbmarshal.WriteRequest
|
||||
pushBlock func(block []byte)
|
||||
lastFlushTime time.Time
|
||||
lastFlushTime uint64
|
||||
|
||||
tss []prompbmarshal.TimeSeries
|
||||
|
||||
@@ -108,7 +113,7 @@ func (wr *writeRequest) reset() {
|
||||
|
||||
func (wr *writeRequest) flush() {
|
||||
wr.wr.Timeseries = wr.tss
|
||||
wr.lastFlushTime = time.Now()
|
||||
wr.lastFlushTime = fasttime.UnixTimestamp()
|
||||
pushWriteRequest(&wr.wr, wr.pushBlock)
|
||||
wr.reset()
|
||||
}
|
||||
@@ -144,13 +149,8 @@ func (wr *writeRequest) copyTimeSeries(dst, src *prompbmarshal.TimeSeries) {
|
||||
}
|
||||
dst.Labels = labelsDst[labelsLen:]
|
||||
|
||||
samplesDst = append(samplesDst, prompbmarshal.Sample{})
|
||||
dstSample := &samplesDst[len(samplesDst)-1]
|
||||
if len(src.Samples) != 1 {
|
||||
logger.Panicf("BUG: unexpected number of samples in time series; got %d; want 1", len(src.Samples))
|
||||
}
|
||||
*dstSample = src.Samples[0]
|
||||
dst.Samples = samplesDst[len(samplesDst)-1:]
|
||||
samplesDst = append(samplesDst, src.Samples...)
|
||||
dst.Samples = samplesDst[len(samplesDst)-len(src.Samples):]
|
||||
|
||||
wr.samples = samplesDst
|
||||
wr.labels = labelsDst
|
||||
|
||||
@@ -2,6 +2,7 @@ package remotewrite
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
@@ -16,35 +17,60 @@ var (
|
||||
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
|
||||
relabelConfigPathGlobal = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
|
||||
"before sending them to -remoteWrite.url. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config for details")
|
||||
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
|
||||
)
|
||||
|
||||
var labelsGlobal []prompbmarshal.Label
|
||||
var prcsGlobal []promrelabel.ParsedRelabelConfig
|
||||
|
||||
// initRelabelGlobal must be called after parsing command-line flags.
|
||||
func initRelabelGlobal() {
|
||||
// CheckRelabelConfigs checks -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig.
|
||||
func CheckRelabelConfigs() error {
|
||||
_, err := loadRelabelConfigs()
|
||||
return err
|
||||
}
|
||||
|
||||
func loadRelabelConfigs() (*relabelConfigs, error) {
|
||||
var rcs relabelConfigs
|
||||
if *relabelConfigPathGlobal != "" {
|
||||
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
rcs.global = global
|
||||
}
|
||||
if len(*relabelConfigPaths) > len(*remoteWriteURLs) {
|
||||
return nil, fmt.Errorf("too many -remoteWrite.urlRelabelConfig args: %d; it mustn't exceed the number of -remoteWrite.url args: %d",
|
||||
len(*relabelConfigPaths), len(*remoteWriteURLs))
|
||||
}
|
||||
rcs.perURL = make([][]promrelabel.ParsedRelabelConfig, len(*remoteWriteURLs))
|
||||
for i, path := range *relabelConfigPaths {
|
||||
prc, err := promrelabel.LoadRelabelConfigs(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
|
||||
}
|
||||
rcs.perURL[i] = prc
|
||||
}
|
||||
return &rcs, nil
|
||||
}
|
||||
|
||||
type relabelConfigs struct {
|
||||
global []promrelabel.ParsedRelabelConfig
|
||||
perURL [][]promrelabel.ParsedRelabelConfig
|
||||
}
|
||||
|
||||
// initLabelsGlobal must be called after parsing command-line flags.
|
||||
func initLabelsGlobal() {
|
||||
// Init labelsGlobal
|
||||
labelsGlobal = nil
|
||||
for _, s := range *unparsedLabelsGlobal {
|
||||
n := strings.IndexByte(s, '=')
|
||||
if n < 0 {
|
||||
logger.Panicf("FATAL: missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
|
||||
logger.Fatalf("missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
|
||||
}
|
||||
labelsGlobal = append(labelsGlobal, prompbmarshal.Label{
|
||||
Name: s[:n],
|
||||
Value: s[n+1:],
|
||||
})
|
||||
}
|
||||
|
||||
// Init prcsGlobal
|
||||
prcsGlobal = nil
|
||||
if len(*relabelConfigPathGlobal) > 0 {
|
||||
var err error
|
||||
prcsGlobal, err = promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label, prcs []promrelabel.ParsedRelabelConfig) []prompbmarshal.TimeSeries {
|
||||
|
||||
@@ -3,6 +3,7 @@ package remotewrite
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
@@ -10,8 +11,8 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
xxhash "github.com/cespare/xxhash/v2"
|
||||
)
|
||||
@@ -20,9 +21,8 @@ var (
|
||||
remoteWriteURLs = flagutil.NewArray("remoteWrite.url", "Remote storage URL to write data to. It must support Prometheus remote_write API. "+
|
||||
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
|
||||
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
|
||||
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
|
||||
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
|
||||
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
|
||||
"isn't enough for sending high volume of collected data to remote storage")
|
||||
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
|
||||
"It is hidden by default, since it can contain sensistive auth info")
|
||||
@@ -34,6 +34,9 @@ var (
|
||||
|
||||
var rwctxs []*remoteWriteCtx
|
||||
|
||||
// Contains the current relabelConfigs.
|
||||
var allRelabelConfigs atomic.Value
|
||||
|
||||
// Init initializes remotewrite.
|
||||
//
|
||||
// It must be called after flag.Parse().
|
||||
@@ -41,14 +44,19 @@ var rwctxs []*remoteWriteCtx
|
||||
// Stop must be called for graceful shutdown.
|
||||
func Init() {
|
||||
if len(*remoteWriteURLs) == 0 {
|
||||
logger.Panicf("FATAL: at least one `-remoteWrite.url` must be set")
|
||||
logger.Fatalf("at least one `-remoteWrite.url` must be set")
|
||||
}
|
||||
|
||||
if !*showRemoteWriteURL {
|
||||
// remoteWrite.url can contain authentication codes, so hide it at `/metrics` output.
|
||||
httpserver.RegisterSecretFlag("remoteWrite.url")
|
||||
}
|
||||
initRelabelGlobal()
|
||||
initLabelsGlobal()
|
||||
rcs, err := loadRelabelConfigs()
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot load relabel configs: %s", err)
|
||||
}
|
||||
allRelabelConfigs.Store(rcs)
|
||||
|
||||
maxInmemoryBlocks := memory.Allowed() / len(*remoteWriteURLs) / maxRowsPerBlock / 100
|
||||
if maxInmemoryBlocks > 200 {
|
||||
@@ -61,23 +69,47 @@ func Init() {
|
||||
maxInmemoryBlocks = 2
|
||||
}
|
||||
for i, remoteWriteURL := range *remoteWriteURLs {
|
||||
relabelConfigPath := ""
|
||||
if i < len(*relabelConfigPaths) {
|
||||
relabelConfigPath = (*relabelConfigPaths)[i]
|
||||
}
|
||||
urlLabelValue := fmt.Sprintf("secret-url-%d", i+1)
|
||||
if *showRemoteWriteURL {
|
||||
urlLabelValue = remoteWriteURL
|
||||
}
|
||||
rwctx := newRemoteWriteCtx(remoteWriteURL, relabelConfigPath, maxInmemoryBlocks, urlLabelValue)
|
||||
rwctx := newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, urlLabelValue)
|
||||
rwctxs = append(rwctxs, rwctx)
|
||||
}
|
||||
|
||||
// Start config reloader.
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
configReloaderWG.Add(1)
|
||||
go func() {
|
||||
defer configReloaderWG.Done()
|
||||
for {
|
||||
select {
|
||||
case <-sighupCh:
|
||||
case <-stopCh:
|
||||
return
|
||||
}
|
||||
logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
|
||||
rcs, err := loadRelabelConfigs()
|
||||
if err != nil {
|
||||
logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err)
|
||||
continue
|
||||
}
|
||||
allRelabelConfigs.Store(rcs)
|
||||
logger.Infof("Successfully reloaded relabel configs")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var stopCh = make(chan struct{})
|
||||
var configReloaderWG sync.WaitGroup
|
||||
|
||||
// Stop stops remotewrite.
|
||||
//
|
||||
// It is expected that nobody calls Push during and after the call to this func.
|
||||
func Stop() {
|
||||
close(stopCh)
|
||||
configReloaderWG.Wait()
|
||||
|
||||
for _, rwctx := range rwctxs {
|
||||
rwctx.MustStop()
|
||||
}
|
||||
@@ -86,9 +118,11 @@ func Stop() {
|
||||
|
||||
// Push sends wr to remote storage systems set via `-remoteWrite.url`.
|
||||
//
|
||||
// Each timeseries in wr.Timeseries must contain one sample.
|
||||
// Note that wr may be modified by Push due to relabeling.
|
||||
func Push(wr *prompbmarshal.WriteRequest) {
|
||||
var rctx *relabelCtx
|
||||
rcs := allRelabelConfigs.Load().(*relabelConfigs)
|
||||
prcsGlobal := rcs.global
|
||||
if len(prcsGlobal) > 0 || len(labelsGlobal) > 0 {
|
||||
rctx = getRelabelCtx()
|
||||
}
|
||||
@@ -122,16 +156,18 @@ func Push(wr *prompbmarshal.WriteRequest) {
|
||||
var globalRelabelMetricsDropped = metrics.NewCounter("vmagent_remotewrite_global_relabel_metrics_dropped_total")
|
||||
|
||||
type remoteWriteCtx struct {
|
||||
idx int
|
||||
fq *persistentqueue.FastQueue
|
||||
c *client
|
||||
prcs []promrelabel.ParsedRelabelConfig
|
||||
pss []*pendingSeries
|
||||
pssNextIdx uint64
|
||||
|
||||
tss []prompbmarshal.TimeSeries
|
||||
|
||||
relabelMetricsDropped *metrics.Counter
|
||||
}
|
||||
|
||||
func newRemoteWriteCtx(remoteWriteURL, relabelConfigPath string, maxInmemoryBlocks int, urlLabelValue string) *remoteWriteCtx {
|
||||
func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int, urlLabelValue string) *remoteWriteCtx {
|
||||
h := xxhash.Sum64([]byte(remoteWriteURL))
|
||||
path := fmt.Sprintf("%s/persistent-queue/%016X", *tmpDataPath, h)
|
||||
fq := persistentqueue.MustOpenFastQueue(path, remoteWriteURL, maxInmemoryBlocks, *maxPendingBytesPerURL)
|
||||
@@ -141,24 +177,16 @@ func newRemoteWriteCtx(remoteWriteURL, relabelConfigPath string, maxInmemoryBloc
|
||||
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_inmemory_blocks{path=%q, url=%q}`, path, urlLabelValue), func() float64 {
|
||||
return float64(fq.GetInmemoryQueueLen())
|
||||
})
|
||||
c := newClient(remoteWriteURL, urlLabelValue, fq, *queues)
|
||||
var prcs []promrelabel.ParsedRelabelConfig
|
||||
if len(relabelConfigPath) > 0 {
|
||||
var err error
|
||||
prcs, err = promrelabel.LoadRelabelConfigs(relabelConfigPath)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", relabelConfigPath, err)
|
||||
}
|
||||
}
|
||||
c := newClient(argIdx, remoteWriteURL, urlLabelValue, fq, *queues)
|
||||
pss := make([]*pendingSeries, *queues)
|
||||
for i := range pss {
|
||||
pss[i] = newPendingSeries(fq.MustWriteBlock)
|
||||
}
|
||||
return &remoteWriteCtx{
|
||||
fq: fq,
|
||||
c: c,
|
||||
prcs: prcs,
|
||||
pss: pss,
|
||||
idx: argIdx,
|
||||
fq: fq,
|
||||
c: c,
|
||||
pss: pss,
|
||||
|
||||
relabelMetricsDropped: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q, url=%q}`, path, urlLabelValue)),
|
||||
}
|
||||
@@ -168,10 +196,10 @@ func (rwctx *remoteWriteCtx) MustStop() {
|
||||
for _, ps := range rwctx.pss {
|
||||
ps.MustStop()
|
||||
}
|
||||
rwctx.idx = 0
|
||||
rwctx.pss = nil
|
||||
rwctx.fq.MustClose()
|
||||
rwctx.fq = nil
|
||||
rwctx.prcs = nil
|
||||
rwctx.c.MustStop()
|
||||
rwctx.c = nil
|
||||
|
||||
@@ -180,10 +208,17 @@ func (rwctx *remoteWriteCtx) MustStop() {
|
||||
|
||||
func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
|
||||
var rctx *relabelCtx
|
||||
if len(rwctx.prcs) > 0 {
|
||||
rcs := allRelabelConfigs.Load().(*relabelConfigs)
|
||||
prcs := rcs.perURL[rwctx.idx]
|
||||
if len(prcs) > 0 {
|
||||
// Make a copy of tss before applying relabeling in order to prevent
|
||||
// from affecting time series for other remoteWrite.url configs.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467 for details.
|
||||
rwctx.tss = append(rwctx.tss[:0], tss...)
|
||||
tss = rwctx.tss
|
||||
rctx = getRelabelCtx()
|
||||
tssLen := len(tss)
|
||||
tss = rctx.applyRelabeling(tss, nil, rwctx.prcs)
|
||||
tss = rctx.applyRelabeling(tss, nil, prcs)
|
||||
rwctx.relabelMetricsDropped.Add(tssLen - len(tss))
|
||||
}
|
||||
pss := rwctx.pss
|
||||
@@ -191,5 +226,7 @@ func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
|
||||
pss[idx].Push(tss)
|
||||
if rctx != nil {
|
||||
putRelabelCtx(rctx)
|
||||
// Zero rwctx.tss in order to free up GC references.
|
||||
rwctx.tss = prompbmarshal.ResetTimeSeries(rwctx.tss)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,12 +4,17 @@ import (
|
||||
"net"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
|
||||
"github.com/VictoriaMetrics/fasthttp"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
"github.com/valyala/fasthttp"
|
||||
)
|
||||
|
||||
func statDial(addr string) (net.Conn, error) {
|
||||
conn, err := fasthttp.Dial(addr)
|
||||
func statDial(addr string) (conn net.Conn, err error) {
|
||||
if netutil.TCP6Enabled() {
|
||||
conn, err = fasthttp.DialDualStack(addr)
|
||||
} else {
|
||||
conn, err = fasthttp.Dial(addr)
|
||||
}
|
||||
dialsTotal.Inc()
|
||||
if err != nil {
|
||||
dialErrors.Inc()
|
||||
|
||||
85
app/vmalert/Makefile
Normal file
85
app/vmalert/Makefile
Normal file
@@ -0,0 +1,85 @@
|
||||
# All these commands must run from repository root.
|
||||
|
||||
vmalert:
|
||||
APP_NAME=vmalert $(MAKE) app-local
|
||||
|
||||
vmalert-race:
|
||||
APP_NAME=vmalert RACE=-race $(MAKE) app-local
|
||||
|
||||
vmalert-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker
|
||||
|
||||
vmalert-pure-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-pure
|
||||
|
||||
vmalert-amd64-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-amd64
|
||||
|
||||
vmalert-arm-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-arm
|
||||
|
||||
vmalert-arm64-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-arm64
|
||||
|
||||
vmalert-ppc64le-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-ppc64le
|
||||
|
||||
vmalert-386-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-386
|
||||
|
||||
package-vmalert:
|
||||
APP_NAME=vmalert $(MAKE) package-via-docker
|
||||
|
||||
package-vmalert-pure:
|
||||
APP_NAME=vmalert $(MAKE) package-via-docker-pure
|
||||
|
||||
package-vmalert-amd64:
|
||||
APP_NAME=vmalert $(MAKE) package-via-docker-amd64
|
||||
|
||||
package-vmalert-arm:
|
||||
APP_NAME=vmalert $(MAKE) package-via-docker-arm
|
||||
|
||||
package-vmalert-arm64:
|
||||
APP_NAME=vmalert $(MAKE) package-via-docker-arm64
|
||||
|
||||
package-vmalert-ppc64le:
|
||||
APP_NAME=vmalert $(MAKE) package-via-docker-ppc64le
|
||||
|
||||
package-vmalert-386:
|
||||
APP_NAME=vmalert $(MAKE) package-via-docker-386
|
||||
|
||||
publish-vmalert:
|
||||
APP_NAME=vmalert $(MAKE) publish-via-docker
|
||||
|
||||
test-vmalert:
|
||||
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
|
||||
go test -v -race -cover ./app/vmalert/datasource
|
||||
go test -v -race -cover ./app/vmalert/notifier
|
||||
go test -v -race -cover ./app/vmalert/config
|
||||
|
||||
run-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-notifier.url=http://localhost:9093 \
|
||||
-notifier.url=http://127.0.0.1:9093 \
|
||||
-remoteWrite.url=http://localhost:8428 \
|
||||
-remoteRead.url=http://localhost:8428 \
|
||||
-evaluationInterval=3s
|
||||
|
||||
vmalert-amd64:
|
||||
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-amd64 ./app/vmalert
|
||||
|
||||
vmalert-arm:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-arm ./app/vmalert
|
||||
|
||||
vmalert-arm64:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-arm64 ./app/vmalert
|
||||
|
||||
vmalert-ppc64le:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-ppc64le ./app/vmalert
|
||||
|
||||
vmalert-386:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-386 ./app/vmalert
|
||||
|
||||
vmalert-pure:
|
||||
APP_NAME=vmalert $(MAKE) app-local-pure
|
||||
@@ -1,41 +1,277 @@
|
||||
## VM Alert
|
||||
## vmalert
|
||||
|
||||
#### Abstract
|
||||
The application which accepts the alert rules, executes them on given source, sends(fires) an alert to(in) alert management system
|
||||
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
rules against configured address.
|
||||
|
||||
### Components
|
||||
### Features:
|
||||
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
|
||||
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
support and expressions validation;
|
||||
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
|
||||
support;
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
|
||||
* Lightweight without extra dependencies.
|
||||
|
||||
#### Alert Config Reader
|
||||
It accepts yaml config as input parameter in Prometheus format, parses it into Go struct.
|
||||
### Limitations:
|
||||
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
|
||||
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
|
||||
may fail;
|
||||
* by default, rules execution is sequential within one group, but persisting of execution results to remote
|
||||
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
|
||||
recording rule is reused in next one;
|
||||
* there is no `query` function support in templates yet;
|
||||
* `vmalert` has no UI, just an API for getting groups and rules statuses.
|
||||
|
||||
#### Source Caller
|
||||
Create own watchdog for every alert group (goroutines), which executes alert query on given source and issues an alert if source returns non-empty result.
|
||||
Source can be any service which supports PromQL (MetricsQL).
|
||||
### QuickStart
|
||||
|
||||
#### Alert Management System Provider
|
||||
Send positive alert to alert management system, provides interface for every concrete implementation.
|
||||
Should be ingratiated with Prometheus alertmanager.
|
||||
To build `vmalert` from sources:
|
||||
```
|
||||
git clone https://github.com/VictoriaMetrics/VictoriaMetrics
|
||||
cd VictoriaMetrics
|
||||
make vmalert
|
||||
```
|
||||
The build binary will be placed to `VictoriaMetrics/bin` folder.
|
||||
|
||||
open questions:
|
||||
- do we really need alert group or can just run every alert in own goroutine?
|
||||
To start using `vmalert` you will need the following things:
|
||||
* list of rules - PromQL/MetricsQL expressions to execute;
|
||||
* datasource address - reachable VictoriaMetrics instance for rules execution;
|
||||
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
aggregating alerts and sending notifications.
|
||||
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
|
||||
compatible storage address for storing recording rules results and alerts state in for of timeseries. This is optional.
|
||||
|
||||
#### Web Server
|
||||
Expose metrics
|
||||
Then configure `vmalert` accordingly:
|
||||
```
|
||||
./bin/vmalert -rule=alert.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-notifier.url=http://localhost:9093
|
||||
```
|
||||
|
||||
open questions:
|
||||
- should the tool provide API or UI for managing alerting rules? Where to store config updated via the API or UI?
|
||||
- should the tool provide “alerting rules validation mode” for validating and debugging alerting rules? This mode is useful when creating and debugging alerting rules.
|
||||
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
|
||||
similar to Prometheus rules and configured using YAML. Configuration examples may be found
|
||||
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
|
||||
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
|
||||
```yaml
|
||||
groups:
|
||||
[ - <rule_group> ]
|
||||
```
|
||||
|
||||
#### Requirements:
|
||||
- Stateless
|
||||
- Avoid external dependencies if possible
|
||||
- Reuse existing code from VictoriaMetrics repo
|
||||
- Makefile rules for common tasks – see Makefiles for other apps in the app/ dir
|
||||
- Every package should be covered by tests
|
||||
- Dockerfile
|
||||
- Graceful shutdown
|
||||
- Helm template
|
||||
- Application uses command line flags for configuration
|
||||
#### Groups
|
||||
|
||||
Each group has following attributes:
|
||||
```yaml
|
||||
# The name of the group. Must be unique within a file.
|
||||
name: <string>
|
||||
|
||||
# How often rules in the group are evaluated.
|
||||
[ interval: <duration> | default = global.evaluation_interval ]
|
||||
|
||||
# How many rules execute at once. Increasing concurrency may speed
|
||||
# up round execution speed.
|
||||
[ concurrency: <integer> | default = 1 ]
|
||||
|
||||
rules:
|
||||
[ - <rule> ... ]
|
||||
```
|
||||
|
||||
#### Rules
|
||||
|
||||
There are two types of Rules:
|
||||
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
|
||||
Alerting rules allows to define alert conditions via [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
|
||||
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
|
||||
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
|
||||
Recording rules allow you to precompute frequently needed or computationally expensive expressions
|
||||
and save their result as a new set of time series.
|
||||
|
||||
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
|
||||
within one group.
|
||||
|
||||
##### Alerting rules
|
||||
|
||||
The syntax for alerting rule is following:
|
||||
```yaml
|
||||
# The name of the alert. Must be a valid metric name.
|
||||
alert: <string>
|
||||
|
||||
# The MetricsQL expression to evaluate.
|
||||
expr: <string>
|
||||
|
||||
# Alerts are considered firing once they have been returned for this long.
|
||||
# Alerts which have not yet fired for long enough are considered pending.
|
||||
[ for: <duration> | default = 0s ]
|
||||
|
||||
# Labels to add or overwrite for each alert.
|
||||
labels:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
|
||||
# Annotations to add to each alert.
|
||||
annotations:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
```
|
||||
|
||||
`vmalert` has no local storage and alerts state is stored in process memory. Hence, after reloading of `vmalert` process
|
||||
alerts state will be lost. To avoid this situation, `vmalert` may be configured via following flags:
|
||||
* `-remoteWrite.url` - URL to Victoria Metrics or VMInsert. `vmalert` will persist alerts state into the configured
|
||||
address in form of timeseries with name `ALERTS` via remote-write protocol.
|
||||
* `-remoteRead.url` - URL to Victoria Metrics or VMSelect. `vmalert` will try to restore alerts state from configured
|
||||
address by querying `ALERTS` timeseries.
|
||||
|
||||
|
||||
<img alt="VM Alert" src="vmalert.png">
|
||||
##### Recording rules
|
||||
|
||||
The syntax for recording rules is following:
|
||||
```yaml
|
||||
# The name of the time series to output to. Must be a valid metric name.
|
||||
record: <string>
|
||||
|
||||
# The MetricsQL expression to evaluate.
|
||||
expr: <string>
|
||||
|
||||
# Labels to add or overwrite before storing the result.
|
||||
labels:
|
||||
[ <labelname>: <labelvalue> ]
|
||||
```
|
||||
|
||||
For recording rules to work `-remoteWrite.url` must specified.
|
||||
|
||||
|
||||
#### WEB
|
||||
|
||||
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
|
||||
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
|
||||
Used as alert source in AlertManager.
|
||||
* `http://<vmalert-addr>/metrics` - application metrics.
|
||||
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
|
||||
|
||||
|
||||
### Configuration
|
||||
|
||||
The shortlist of configuration flags is the following:
|
||||
```
|
||||
Usage of vmalert:
|
||||
-datasource.basicAuth.password string
|
||||
Optional basic auth password for -datasource.url
|
||||
-datasource.basicAuth.username string
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used.
|
||||
-datasource.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url.
|
||||
-datasource.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
-datasource.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url.
|
||||
-datasource.tlsServerName value
|
||||
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used.
|
||||
-datasource.url string
|
||||
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
-httpListenAddr string
|
||||
Address to listen for http connections (default ":8880")
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-notifier.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used.
|
||||
-notifier.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -notifier.url.
|
||||
-notifier.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -notifier.url
|
||||
-notifier.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -notifier.url.
|
||||
-notifier.tlsServerName value
|
||||
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used.
|
||||
-notifier.url string
|
||||
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
|
||||
-remoteRead.basicAuth.password string
|
||||
Optional basic auth password for -remoteRead.url
|
||||
-remoteRead.basicAuth.username string
|
||||
Optional basic auth username for -remoteRead.url
|
||||
-remoteRead.lookback duration
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
-remoteRead.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used.
|
||||
-remoteRead.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url.
|
||||
-remoteRead.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteRead.url
|
||||
-remoteRead.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url.
|
||||
-remoteRead.tlsServerName value
|
||||
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used.
|
||||
-remoteRead.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remoteWrite.basicAuth.password string
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.username string
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of readers that concurrently write into remote storage (default 1)
|
||||
-remoteWrite.flushInterval duration
|
||||
Defines interval of flushes to remote write endpoint (default 5s)
|
||||
-remoteWrite.maxBatchSize int
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
-remoteWrite.maxQueueSize int
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
-remoteWrite.tlsCAFile value
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used.
|
||||
-remoteWrite.tlsCertFile value
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url.
|
||||
-remoteWrite.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsKeyFile value
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url.
|
||||
-remoteWrite.tlsServerName value
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used.
|
||||
-remoteWrite.url string
|
||||
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
|
||||
-rule value
|
||||
Path to the file with alert rules.
|
||||
Supports patterns. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule /path/to/file. Path to a single file with alerting rules
|
||||
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
-rule.validateExpressions
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
-rule.validateTemplates
|
||||
Whether to validate annotation and label templates (default true)
|
||||
```
|
||||
|
||||
Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
command-line flags with their descriptions.
|
||||
|
||||
To reload configuration without `vmalert` restart send SIGHUP signal
|
||||
or send GET request to `/-/reload` endpoint.
|
||||
|
||||
### Contributing
|
||||
|
||||
`vmalert` is mostly designed and built by VictoriaMetrics community.
|
||||
Feel free to share your experience and ideas for improving this
|
||||
software. Please keep simplicity as the main priority.
|
||||
|
||||
### How to build from sources
|
||||
|
||||
It is recommended using
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
- `vmalert` is located in `vmutils-*` archives there.
|
||||
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmalert` from the root folder of the repository.
|
||||
It builds `vmalert` binary and puts it into the `bin` folder.
|
||||
|
||||
#### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmalert-prod` from the root folder of the repository.
|
||||
It builds `vmalert-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
374
app/vmalert/alerting.go
Normal file
374
app/vmalert/alerting.go
Normal file
@@ -0,0 +1,374 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// AlertingRule is basic alert entity
|
||||
type AlertingRule struct {
|
||||
RuleID uint64
|
||||
Name string
|
||||
Expr string
|
||||
For time.Duration
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
GroupID uint64
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
func newAlertingRule(gID uint64, cfg config.Rule) *AlertingRule {
|
||||
return &AlertingRule{
|
||||
RuleID: cfg.ID,
|
||||
Name: cfg.Alert,
|
||||
Expr: cfg.Expr,
|
||||
For: cfg.For,
|
||||
Labels: cfg.Labels,
|
||||
Annotations: cfg.Annotations,
|
||||
GroupID: gID,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
}
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
func (ar *AlertingRule) String() string {
|
||||
return ar.Name
|
||||
}
|
||||
|
||||
// ID returns unique Rule ID
|
||||
// within the parent Group.
|
||||
func (ar *AlertingRule) ID() uint64 {
|
||||
return ar.RuleID
|
||||
}
|
||||
|
||||
// Exec executes AlertingRule expression via the given Querier.
|
||||
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
||||
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := q.Query(ctx, ar.Expr)
|
||||
ar.mu.Lock()
|
||||
defer ar.mu.Unlock()
|
||||
|
||||
ar.lastExecError = err
|
||||
ar.lastExecTime = time.Now()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||
}
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
if a.State == notifier.StateInactive {
|
||||
delete(ar.alerts, h)
|
||||
}
|
||||
}
|
||||
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
h := hash(m)
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
if a.Value != m.Value {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Value
|
||||
// and re-exec template since Value can be used
|
||||
// in templates
|
||||
err = ar.template(a)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := ar.newAlert(m, ar.lastExecTime)
|
||||
if err != nil {
|
||||
ar.lastExecError = err
|
||||
return nil, fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
ar.alerts[h] = a
|
||||
}
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// if alert wasn't updated in this iteration
|
||||
// means it is resolved already
|
||||
if _, ok := updated[h]; !ok {
|
||||
if a.State == notifier.StatePending {
|
||||
// alert was in Pending state - it is not
|
||||
// active anymore
|
||||
delete(ar.alerts, h)
|
||||
continue
|
||||
}
|
||||
a.State = notifier.StateInactive
|
||||
continue
|
||||
}
|
||||
if a.State == notifier.StatePending && time.Since(a.Start) >= ar.For {
|
||||
a.State = notifier.StateFiring
|
||||
alertsFired.Inc()
|
||||
}
|
||||
}
|
||||
if series {
|
||||
return ar.toTimeSeries(ar.lastExecTime), nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
ts := ar.alertToTimeSeries(a, timestamp)
|
||||
tss = append(tss, ts...)
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (ar *AlertingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
||||
}
|
||||
ar.Expr = nr.Expr
|
||||
ar.For = nr.For
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(m datasource.Metric) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := m.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifier.Alert, error) {
|
||||
a := ¬ifier.Alert{
|
||||
GroupID: ar.GroupID,
|
||||
Name: ar.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Value,
|
||||
Start: start,
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
a.Labels[l.Name] = l.Value
|
||||
}
|
||||
return a, ar.template(a)
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) template(a *notifier.Alert) error {
|
||||
// 1. template rule labels with data labels
|
||||
rLabels, err := a.ExecTemplate(ar.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 2. merge data labels and rule labels
|
||||
// metric labels may be overridden by
|
||||
// rule labels
|
||||
for k, v := range rLabels {
|
||||
a.Labels[k] = v
|
||||
}
|
||||
|
||||
// 3. template merged labels
|
||||
a.Labels, err = a.ExecTemplate(a.Labels)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
a.Annotations, err = a.ExecTemplate(ar.Annotations)
|
||||
return err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIAlertingRule
|
||||
func (ar *AlertingRule) RuleAPI() APIAlertingRule {
|
||||
var lastErr string
|
||||
if ar.lastExecError != nil {
|
||||
lastErr = ar.lastExecError.Error()
|
||||
}
|
||||
return APIAlertingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
Name: ar.Name,
|
||||
Expression: ar.Expr,
|
||||
For: ar.For.String(),
|
||||
LastError: lastErr,
|
||||
LastExec: ar.lastExecTime,
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
}
|
||||
}
|
||||
|
||||
// AlertsAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.mu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.mu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
return &APIAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: ar.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.Start,
|
||||
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
// AlertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
// AlertForStateMetricName is the metric name for 'for' state of alert.
|
||||
alertForStateMetricName = "ALERTS_FOR_STATE"
|
||||
|
||||
// AlertNameLabel is the label name indicating the name of an alert.
|
||||
alertNameLabel = "alertname"
|
||||
// AlertStateLabel is the label name indicating the state of an alert.
|
||||
alertStateLabel = "alertstate"
|
||||
)
|
||||
|
||||
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(ar.Name, a, timestamp))
|
||||
if ar.For > 0 {
|
||||
tss = append(tss, alertForToTimeSeries(ar.Name, a, timestamp))
|
||||
}
|
||||
return tss
|
||||
}
|
||||
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertMetricName
|
||||
labels[alertNameLabel] = name
|
||||
labels[alertStateLabel] = a.State.String()
|
||||
return newTimeSeries(1, labels, timestamp)
|
||||
}
|
||||
|
||||
// alertForToTimeSeries returns a timeseries that represents
|
||||
// state of active alerts, where value is time when alert become active
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
labels[alertNameLabel] = name
|
||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||
// to be updated on next Exec, as well as Value field.
|
||||
// Only rules with For > 0 will be restored.
|
||||
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
if q == nil {
|
||||
return fmt.Errorf("querier is nil")
|
||||
}
|
||||
// Get the last datapoint in range via MetricsQL `last_over_time`.
|
||||
// We don't use plain PromQL since Prometheus doesn't support
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
|
||||
alertForStateMetricName, ar.Name, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
labels := m.Labels
|
||||
m.Labels = make([]datasource.Label, 0)
|
||||
// drop all extra labels, so hash key will
|
||||
// be identical to timeseries received in Exec
|
||||
for _, l := range labels {
|
||||
if l.Name == alertNameLabel {
|
||||
continue
|
||||
}
|
||||
// drop all overridden labels
|
||||
if _, ok := ar.Labels[l.Name]; ok {
|
||||
continue
|
||||
}
|
||||
m.Labels = append(m.Labels, l)
|
||||
}
|
||||
|
||||
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
a.ID = hash(m)
|
||||
a.State = notifier.StatePending
|
||||
ar.alerts[a.ID] = a
|
||||
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
455
app/vmalert/alerting_test.go
Normal file
455
app/vmalert/alerting_test.go
Normal file
@@ -0,0 +1,455 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *AlertingRule
|
||||
alert *notifier.Alert
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
newTestAlertingRule("instant", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StateFiring.String(),
|
||||
alertNameLabel: "instant",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("instant extra labels", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
||||
"job": "foo",
|
||||
"instance": "bar",
|
||||
}},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StateFiring.String(),
|
||||
alertNameLabel: "instant extra labels",
|
||||
"job": "foo",
|
||||
"instance": "bar",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("instant labels override", 0),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
||||
alertStateLabel: "foo",
|
||||
"__name__": "bar",
|
||||
}},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StateFiring.String(),
|
||||
alertNameLabel: "instant labels override",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for", time.Second),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StateFiring.String(),
|
||||
alertNameLabel: "for",
|
||||
}, timestamp),
|
||||
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
|
||||
"__name__": alertForStateMetricName,
|
||||
alertNameLabel: "for",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for pending", 10*time.Second),
|
||||
¬ifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StatePending.String(),
|
||||
alertNameLabel: "for pending",
|
||||
}, timestamp),
|
||||
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
|
||||
"__name__": alertForStateMetricName,
|
||||
alertNameLabel: "for pending",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
tc.rule.alerts[tc.alert.ID] = tc.alert
|
||||
tss := tc.rule.toTimeSeries(timestamp)
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlertingRule_Exec(t *testing.T) {
|
||||
const defaultStep = 5 * time.Millisecond
|
||||
testCases := []struct {
|
||||
rule *AlertingRule
|
||||
steps [][]datasource.Metric
|
||||
expAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
{
|
||||
newTestAlertingRule("empty", 0),
|
||||
[][]datasource.Metric{},
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("empty labels", 0),
|
||||
[][]datasource.Metric{
|
||||
{datasource.Metric{}},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(datasource.Metric{}): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing=>inactive=>firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
{},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("multiple-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{
|
||||
metricWithLabels(t, "name", "foo"),
|
||||
metricWithLabels(t, "name", "foo1"),
|
||||
metricWithLabels(t, "name", "foo2"),
|
||||
},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
|
||||
hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateFiring},
|
||||
hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("multiple-steps-firing", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo1")},
|
||||
{metricWithLabels(t, "name", "foo2")},
|
||||
},
|
||||
// 1: fire first alert
|
||||
// 2: fire second alert, set first inactive
|
||||
// 3: fire third alert, set second inactive, delete first one
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateInactive},
|
||||
hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("duplicate", 0),
|
||||
[][]datasource.Metric{
|
||||
{
|
||||
// metrics with the same labelset should result in one alert
|
||||
metricWithLabels(t, "name", "foo", "type", "bar"),
|
||||
metricWithLabels(t, "type", "bar", "name", "foo"),
|
||||
},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo", "type", "bar")): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending", time.Minute),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-fired", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending=>empty", time.Second),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset and delete pending alerts
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending=>firing=>inactive", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset pending alerts
|
||||
{},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending=>firing=>inactive=>pending", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset pending alerts
|
||||
{},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", defaultStep),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
// empty step to reset pending alerts
|
||||
{},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
}
|
||||
fakeGroup := Group{Name: "TestRule_Exec"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
for _, step := range tc.steps {
|
||||
fq.reset()
|
||||
fq.add(step...)
|
||||
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
// artificial delay between applying steps
|
||||
time.Sleep(defaultStep)
|
||||
}
|
||||
if len(tc.rule.alerts) != len(tc.expAlerts) {
|
||||
t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
|
||||
}
|
||||
for key, exp := range tc.expAlerts {
|
||||
got, ok := tc.rule.alerts[key]
|
||||
if !ok {
|
||||
t.Fatalf("expected to have key %d", key)
|
||||
}
|
||||
if got.State != exp.State {
|
||||
t.Fatalf("expected state %d; got %d", exp.State, got.State)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlertingRule_Restore(t *testing.T) {
|
||||
testCases := []struct {
|
||||
rule *AlertingRule
|
||||
metrics []datasource.Metric
|
||||
expAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
{
|
||||
newTestRuleWithLabels("no extra labels"),
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
|
||||
"__name__", alertForStateMetricName,
|
||||
alertNameLabel, "",
|
||||
),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(datasource.Metric{}): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(time.Hour)},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRuleWithLabels("metric labels"),
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
|
||||
"__name__", alertForStateMetricName,
|
||||
alertNameLabel, "",
|
||||
"foo", "bar",
|
||||
"namespace", "baz",
|
||||
),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t,
|
||||
"foo", "bar",
|
||||
"namespace", "baz",
|
||||
)): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(time.Hour)},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRuleWithLabels("rule labels", "source", "vm"),
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
|
||||
"__name__", alertForStateMetricName,
|
||||
alertNameLabel, "",
|
||||
"foo", "bar",
|
||||
"namespace", "baz",
|
||||
// following pair supposed to be dropped
|
||||
"source", "vm",
|
||||
),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t,
|
||||
"foo", "bar",
|
||||
"namespace", "baz",
|
||||
)): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(time.Hour)},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRuleWithLabels("multiple alerts"),
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
|
||||
"__name__", alertForStateMetricName,
|
||||
"host", "localhost-1",
|
||||
),
|
||||
metricWithValueAndLabels(t, float64(time.Now().Truncate(2*time.Hour).Unix()),
|
||||
"__name__", alertForStateMetricName,
|
||||
"host", "localhost-2",
|
||||
),
|
||||
metricWithValueAndLabels(t, float64(time.Now().Truncate(3*time.Hour).Unix()),
|
||||
"__name__", alertForStateMetricName,
|
||||
"host", "localhost-3",
|
||||
),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(time.Hour)},
|
||||
hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(2 * time.Hour)},
|
||||
hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(3 * time.Hour)},
|
||||
},
|
||||
},
|
||||
}
|
||||
fakeGroup := Group{Name: "TestRule_Exec"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
fq.add(tc.metrics...)
|
||||
if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
if len(tc.rule.alerts) != len(tc.expAlerts) {
|
||||
t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
|
||||
}
|
||||
for key, exp := range tc.expAlerts {
|
||||
got, ok := tc.rule.alerts[key]
|
||||
if !ok {
|
||||
t.Fatalf("expected to have key %d", key)
|
||||
}
|
||||
if got.State != exp.State {
|
||||
t.Fatalf("expected state %d; got %d", exp.State, got.State)
|
||||
}
|
||||
if got.Start != exp.Start {
|
||||
t.Fatalf("expected Start %v; got %v", exp.Start, got.Start)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
|
||||
r := newTestAlertingRule(name, 0)
|
||||
r.Labels = make(map[string]string)
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
r.Labels[labels[i]] = labels[i+1]
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
||||
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
|
||||
}
|
||||
195
app/vmalert/config/config.go
Normal file
195
app/vmalert/config/config.go
Normal file
@@ -0,0 +1,195 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
// Group contains list of Rules grouped into
|
||||
// entity with one name and evaluation interval
|
||||
type Group struct {
|
||||
File string
|
||||
Name string `yaml:"name"`
|
||||
Interval time.Duration `yaml:"interval,omitempty"`
|
||||
Rules []Rule `yaml:"rules"`
|
||||
Concurrency int `yaml:"concurrency"`
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// Validate check for internal Group or Rule configuration errors
|
||||
func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
|
||||
if g.Name == "" {
|
||||
return fmt.Errorf("group name must be set")
|
||||
}
|
||||
if len(g.Rules) == 0 {
|
||||
return fmt.Errorf("group %q can't contain no rules", g.Name)
|
||||
}
|
||||
uniqueRules := map[uint64]struct{}{}
|
||||
for _, r := range g.Rules {
|
||||
ruleName := r.Record
|
||||
if r.Alert != "" {
|
||||
ruleName = r.Alert
|
||||
}
|
||||
if _, ok := uniqueRules[r.ID]; ok {
|
||||
return fmt.Errorf("rule %q duplicate", ruleName)
|
||||
}
|
||||
uniqueRules[r.ID] = struct{}{}
|
||||
if err := r.Validate(); err != nil {
|
||||
return fmt.Errorf("invalid rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
if validateExpressions {
|
||||
if _, err := metricsql.Parse(r.Expr); err != nil {
|
||||
return fmt.Errorf("invalid expression for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
}
|
||||
if validateAnnotations {
|
||||
if err := notifier.ValidateTemplates(r.Annotations); err != nil {
|
||||
return fmt.Errorf("invalid annotations for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
if err := notifier.ValidateTemplates(r.Labels); err != nil {
|
||||
return fmt.Errorf("invalid labels for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return checkOverflow(g.XXX, fmt.Sprintf("group %q", g.Name))
|
||||
}
|
||||
|
||||
// Rule describes entity that represent either
|
||||
// recording rule or alerting rule.
|
||||
type Rule struct {
|
||||
ID uint64
|
||||
Record string `yaml:"record,omitempty"`
|
||||
Alert string `yaml:"alert,omitempty"`
|
||||
Expr string `yaml:"expr"`
|
||||
For time.Duration `yaml:"for,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
type rule Rule
|
||||
if err := unmarshal((*rule)(r)); err != nil {
|
||||
return err
|
||||
}
|
||||
r.ID = HashRule(*r)
|
||||
return nil
|
||||
}
|
||||
|
||||
// HashRule hashes significant Rule fields into
|
||||
// unique hash value
|
||||
func HashRule(r Rule) uint64 {
|
||||
h := fnv.New64a()
|
||||
h.Write([]byte(r.Expr))
|
||||
if r.Record != "" {
|
||||
h.Write([]byte("recording"))
|
||||
h.Write([]byte(r.Record))
|
||||
} else {
|
||||
h.Write([]byte("alerting"))
|
||||
h.Write([]byte(r.Alert))
|
||||
}
|
||||
type item struct {
|
||||
key, value string
|
||||
}
|
||||
var kv []item
|
||||
for k, v := range r.Labels {
|
||||
kv = append(kv, item{key: k, value: v})
|
||||
}
|
||||
sort.Slice(kv, func(i, j int) bool {
|
||||
return kv[i].key < kv[j].key
|
||||
})
|
||||
for _, i := range kv {
|
||||
h.Write([]byte(i.key))
|
||||
h.Write([]byte(i.value))
|
||||
h.Write([]byte("\xff"))
|
||||
}
|
||||
return h.Sum64()
|
||||
}
|
||||
|
||||
// Validate check for Rule configuration errors
|
||||
func (r *Rule) Validate() error {
|
||||
if (r.Record == "" && r.Alert == "") || (r.Record != "" && r.Alert != "") {
|
||||
return fmt.Errorf("either `record` or `alert` must be set")
|
||||
}
|
||||
if r.Expr == "" {
|
||||
return fmt.Errorf("expression can't be empty")
|
||||
}
|
||||
return checkOverflow(r.XXX, "rule")
|
||||
}
|
||||
|
||||
// Parse parses rule configs from given file patterns
|
||||
func Parse(pathPatterns []string, validateAnnotations, validateExpressions bool) ([]Group, error) {
|
||||
var fp []string
|
||||
for _, pattern := range pathPatterns {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading file pattern %s: %w", pattern, err)
|
||||
}
|
||||
fp = append(fp, matches...)
|
||||
}
|
||||
var groups []Group
|
||||
for _, file := range fp {
|
||||
uniqueGroups := map[string]struct{}{}
|
||||
gr, err := parseFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse file %q: %w", file, err)
|
||||
}
|
||||
for _, g := range gr {
|
||||
if err := g.Validate(validateAnnotations, validateExpressions); err != nil {
|
||||
return nil, fmt.Errorf("invalid group %q in file %q: %w", g.Name, file, err)
|
||||
}
|
||||
if _, ok := uniqueGroups[g.Name]; ok {
|
||||
return nil, fmt.Errorf("group name %q duplicate in file %q", g.Name, file)
|
||||
}
|
||||
uniqueGroups[g.Name] = struct{}{}
|
||||
g.File = file
|
||||
groups = append(groups, g)
|
||||
}
|
||||
}
|
||||
if len(groups) < 1 {
|
||||
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
|
||||
}
|
||||
return groups, nil
|
||||
}
|
||||
|
||||
func parseFile(path string) ([]Group, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading alert rule file: %w", err)
|
||||
}
|
||||
g := struct {
|
||||
Groups []Group `yaml:"groups"`
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}{}
|
||||
err = yaml.Unmarshal(data, &g)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return g.Groups, checkOverflow(g.XXX, "config")
|
||||
}
|
||||
|
||||
func checkOverflow(m map[string]interface{}, ctx string) error {
|
||||
if len(m) > 0 {
|
||||
var keys []string
|
||||
for k := range m {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
return fmt.Errorf("unknown fields in %s: %s", ctx, strings.Join(keys, ", "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
326
app/vmalert/config/config_test.go
Normal file
326
app/vmalert/config/config_test.go
Normal file
@@ -0,0 +1,326 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestParseGood(t *testing.T) {
|
||||
if _, err := Parse([]string{"testdata/*good.rules", "testdata/dir/*good.*"}, true, true); err != nil {
|
||||
t.Errorf("error parsing files %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBad(t *testing.T) {
|
||||
testCases := []struct {
|
||||
path []string
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
[]string{"testdata/rules0-bad.rules"},
|
||||
"unexpected token",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules0-bad.rules"},
|
||||
"error parsing annotation",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules1-bad.rules"},
|
||||
"duplicate in file",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules2-bad.rules"},
|
||||
"function \"value\" not defined",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules3-bad.rules"},
|
||||
"either `record` or `alert` must be set",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/dir/rules4-bad.rules"},
|
||||
"either `record` or `alert` must be set",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/*.yaml"},
|
||||
"no groups found",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
_, err := Parse(tc.path, true, true)
|
||||
if err == nil {
|
||||
t.Errorf("expected to get error")
|
||||
return
|
||||
}
|
||||
if !strings.Contains(err.Error(), tc.expErr) {
|
||||
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRule_Validate(t *testing.T) {
|
||||
if err := (&Rule{}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty name error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert"}).Validate(); err == nil {
|
||||
t.Errorf("exptected empty expr error")
|
||||
}
|
||||
if err := (&Rule{Alert: "alert", Expr: "test>0"}).Validate(); err != nil {
|
||||
t.Errorf("exptected valid rule; got %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroup_Validate(t *testing.T) {
|
||||
testCases := []struct {
|
||||
group *Group
|
||||
rules []Rule
|
||||
validateAnnotations bool
|
||||
validateExpressions bool
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
group: &Group{},
|
||||
expErr: "group name must be set",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test"},
|
||||
expErr: "contain no rules",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Record: "record",
|
||||
Expr: "up | 0",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Record: "record",
|
||||
Expr: "up | 0",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "invalid expression",
|
||||
validateExpressions: true,
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "error parsing annotation",
|
||||
validateAnnotations: true,
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
},
|
||||
{
|
||||
Alert: "alert",
|
||||
Expr: "up == 1",
|
||||
},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Record: "record", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Record: "record", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "duplicate",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"description": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
{
|
||||
group: &Group{Name: "test",
|
||||
Rules: []Rule{
|
||||
{Record: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"summary": "{{ value|query }}",
|
||||
}},
|
||||
},
|
||||
},
|
||||
expErr: "",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
err := tc.group.Validate(tc.validateAnnotations, tc.validateExpressions)
|
||||
if err == nil {
|
||||
if tc.expErr != "" {
|
||||
t.Errorf("expected to get err %q; got nil insted", tc.expErr)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if !strings.Contains(err.Error(), tc.expErr) {
|
||||
t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHashRule(t *testing.T) {
|
||||
testCases := []struct {
|
||||
a, b Rule
|
||||
equal bool
|
||||
}{
|
||||
{
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
"foo": "bar",
|
||||
}},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", For: time.Minute},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Record: "record", Expr: "up == 1"},
|
||||
Rule{Record: "record", Expr: "up == 2"},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
"foo": "baz",
|
||||
}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"baz": "foo",
|
||||
}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "foo",
|
||||
}},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
false,
|
||||
},
|
||||
}
|
||||
for i, tc := range testCases {
|
||||
aID, bID := HashRule(tc.a), HashRule(tc.b)
|
||||
if tc.equal != (aID == bID) {
|
||||
t.Fatalf("missmatch for rule %d", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,38 +0,0 @@
|
||||
package config
|
||||
|
||||
import "time"
|
||||
|
||||
// Rule is basic alert entity
|
||||
type Rule struct {
|
||||
Name string
|
||||
Expr string
|
||||
For time.Duration
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
}
|
||||
|
||||
// Group grouping array of alert
|
||||
type Group struct {
|
||||
Name string
|
||||
Rules []Rule
|
||||
}
|
||||
|
||||
// Parse parses config from given file
|
||||
func Parse(filepath string) ([]Group, error) {
|
||||
return []Group{{
|
||||
Name: "foobar",
|
||||
Rules: []Rule{{
|
||||
Name: "vmrowsalert",
|
||||
Expr: "vm_rows",
|
||||
For: 1 * time.Second,
|
||||
Labels: map[string]string{
|
||||
"alert_label": "value1",
|
||||
"alert_label2": "value2",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value }}",
|
||||
"description": "LABELS: {{ $labels }}",
|
||||
},
|
||||
}},
|
||||
}}, nil
|
||||
}
|
||||
19
app/vmalert/config/testdata/dir/rules0-bad.rules
vendored
Normal file
19
app/vmalert/config/testdata/dir/rules0-bad.rules
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: InvalidAnnotations
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
annotations:
|
||||
summary: "{{ $value }"
|
||||
description: "{{$labels}}"
|
||||
- alert: UnkownAnnotationsFunction
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
annotations:
|
||||
summary: "{{ value|query }}"
|
||||
description: "{{$labels}}"
|
||||
14
app/vmalert/config/testdata/dir/rules0-good.rules
vendored
Normal file
14
app/vmalert/config/testdata/dir/rules0-good.rules
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
groups:
|
||||
- name: duplicatedGroupDiffFiles
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
expr: "{{ $expr|queryEscape }}"
|
||||
annotations:
|
||||
summary: "{{ $value|humanize }}"
|
||||
description: "{{$labels}}"
|
||||
|
||||
|
||||
22
app/vmalert/config/testdata/dir/rules1-bad.rules
vendored
Normal file
22
app/vmalert/config/testdata/dir/rules1-bad.rules
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
groups:
|
||||
- name: sameGroup
|
||||
rules:
|
||||
- alert: alert
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
- name: sameGroup
|
||||
rules:
|
||||
- alert: alert
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
|
||||
11
app/vmalert/config/testdata/dir/rules1-good.rules
vendored
Normal file
11
app/vmalert/config/testdata/dir/rules1-good.rules
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
groups:
|
||||
- name: duplicatedGroupDiffFiles
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
11
app/vmalert/config/testdata/dir/rules2-bad.rules
vendored
Normal file
11
app/vmalert/config/testdata/dir/rules2-bad.rules
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: UnkownLabelFunction
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
summary: "{{ value|query }}"
|
||||
annotations:
|
||||
description: "{{$labels}}"
|
||||
5
app/vmalert/config/testdata/dir/rules3-bad.rules
vendored
Normal file
5
app/vmalert/config/testdata/dir/rules3-bad.rules
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- for: 5m
|
||||
expr: vm_rows > 0
|
||||
7
app/vmalert/config/testdata/dir/rules4-bad.rules
vendored
Normal file
7
app/vmalert/config/testdata/dir/rules4-bad.rules
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: rows
|
||||
record: record
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
7
app/vmalert/config/testdata/dir/rules5-bad.rules
vendored
Normal file
7
app/vmalert/config/testdata/dir/rules5-bad.rules
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: rows
|
||||
expr: vm_rows > 0
|
||||
- record: rows
|
||||
expr: sum(vm_rows)
|
||||
1727
app/vmalert/config/testdata/kube-good.rules
vendored
Normal file
1727
app/vmalert/config/testdata/kube-good.rules
vendored
Normal file
File diff suppressed because it is too large
Load Diff
28
app/vmalert/config/testdata/rules0-bad.rules
vendored
Normal file
28
app/vmalert/config/testdata/rules0-bad.rules
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
groups:
|
||||
- name: group
|
||||
rules:
|
||||
- alert: InvalidExpr
|
||||
for: 5m
|
||||
expr: vm_rows{ > 0
|
||||
labels:
|
||||
label: bar
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
- alert: EmptyExpr
|
||||
for: 5m
|
||||
expr: ""
|
||||
labels:
|
||||
label: bar
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
- alert: ""
|
||||
for: 5m
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: foo
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
description: "{{$labels}}"
|
||||
|
||||
23
app/vmalert/config/testdata/rules0-good.rules
vendored
Normal file
23
app/vmalert/config/testdata/rules0-good.rules
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
groups:
|
||||
- name: groupGorSingleAlert
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 10s
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value|humanize }}"
|
||||
description: "{{$labels}}"
|
||||
|
||||
- name: TestGroup
|
||||
rules:
|
||||
- alert: Conns
|
||||
expr: sum(vm_tcplistener_conns) by(instance) > 1
|
||||
annotations:
|
||||
summary: "Too high connection number for {{$labels.instance}}"
|
||||
description: "It is {{ $value }} connections for {{$labels.instance}}"
|
||||
- alert: ExampleAlertAlwaysFiring
|
||||
expr: sum by(job)
|
||||
(up == 1)
|
||||
11
app/vmalert/config/testdata/rules1-good.rules
vendored
Normal file
11
app/vmalert/config/testdata/rules1-good.rules
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
groups:
|
||||
- name: groupTest
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 1ms
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
35
app/vmalert/config/testdata/rules2-good.rules
vendored
Normal file
35
app/vmalert/config/testdata/rules2-good.rules
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
groups:
|
||||
- name: TestGroup
|
||||
interval: 2s
|
||||
concurrency: 2
|
||||
rules:
|
||||
- alert: Conns
|
||||
expr: sum(vm_tcplistener_conns) by(instance) > 1
|
||||
for: 3m
|
||||
annotations:
|
||||
summary: "Too high connection number for {{$labels.instance}}"
|
||||
description: "It is {{ $value }} connections for {{$labels.instance}}"
|
||||
- alert: ExampleAlertAlwaysFiring
|
||||
expr: sum by(job)
|
||||
(up == 1)
|
||||
- record: handler:requests:rate5m
|
||||
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
|
||||
labels:
|
||||
recording: true
|
||||
- record: code:requests:rate5m
|
||||
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
|
||||
labels:
|
||||
env: dev
|
||||
recording: true
|
||||
- record: code:requests:rate5m
|
||||
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
|
||||
labels:
|
||||
env: staging
|
||||
recording: true
|
||||
- record: successful_requests:ratio_rate5m
|
||||
labels:
|
||||
recording: true
|
||||
expr: |2
|
||||
sum(code:requests:rate5m{code="200"})
|
||||
/
|
||||
sum(code:requests:rate5m)
|
||||
@@ -1,5 +1,14 @@
|
||||
package datasource
|
||||
|
||||
import "context"
|
||||
|
||||
// Querier interface wraps Query method which
|
||||
// executes given query and returns list of Metrics
|
||||
// as result
|
||||
type Querier interface {
|
||||
Query(ctx context.Context, query string) ([]Metric, error)
|
||||
}
|
||||
|
||||
// Metric is the basic entity which should be return by datasource
|
||||
// It represents single data point with full list of labels
|
||||
type Metric struct {
|
||||
|
||||
38
app/vmalert/datasource/init.go
Normal file
38
app/vmalert/datasource/init.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package datasource
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
|
||||
tlsCertFile = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
|
||||
tlsKeyFile = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
|
||||
tlsCAFile = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
|
||||
"By default the server name from -datasource.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Querier from provided flag values.
|
||||
func Init() (Querier, error) {
|
||||
if *addr == "" {
|
||||
flag.PrintDefaults()
|
||||
return nil, fmt.Errorf("datasource.url is empty")
|
||||
}
|
||||
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &http.Client{Transport: tr}
|
||||
return NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, c), nil
|
||||
}
|
||||
@@ -32,7 +32,7 @@ func (r response) metrics() ([]Metric, error) {
|
||||
for i, res := range r.Data.Result {
|
||||
f, err = strconv.ParseFloat(res.TV[1].(string), 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %s", res, res.TV[1], err)
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
|
||||
}
|
||||
m.Labels = nil
|
||||
for k, v := range r.Data.Result[i].Labels {
|
||||
@@ -49,9 +49,10 @@ const queryPath = "/api/v1/query?query="
|
||||
|
||||
// VMStorage represents vmstorage entity with ability to read and write metrics
|
||||
type VMStorage struct {
|
||||
c *http.Client
|
||||
queryURL string
|
||||
basicAuthUser, basicAuthPass string
|
||||
c *http.Client
|
||||
queryURL string
|
||||
basicAuthUser string
|
||||
basicAuthPass string
|
||||
}
|
||||
|
||||
// NewVMStorage is a constructor for VMStorage
|
||||
@@ -79,25 +80,25 @@ func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
|
||||
}
|
||||
resp, err := s.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting response from %s:%s", req.URL, err)
|
||||
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := ioutil.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("datasource returns unxeprected response code %d for %s with err %s. Reponse body %s", resp.StatusCode, req.URL, err, body)
|
||||
return nil, fmt.Errorf("datasource returns unexpected response code %d for %s with err %w. Reponse body %s", resp.StatusCode, req.URL, err, body)
|
||||
}
|
||||
r := &response{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return nil, fmt.Errorf("error parsing metrics for %s:%s", req.URL, err)
|
||||
return nil, fmt.Errorf("error parsing metrics for %s: %w", req.URL, err)
|
||||
}
|
||||
if r.Status == statusError {
|
||||
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
|
||||
}
|
||||
if r.Status != statusSuccess {
|
||||
return nil, fmt.Errorf("unkown status:%s, Expected success or error ", r.Status)
|
||||
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
|
||||
}
|
||||
if r.Data.ResultType != rtVector {
|
||||
return nil, fmt.Errorf("unkown restul type:%s. Expected vector", r.Data.ResultType)
|
||||
return nil, fmt.Errorf("unknown restul type:%s. Expected vector", r.Data.ResultType)
|
||||
}
|
||||
return r.metrics()
|
||||
}
|
||||
|
||||
8
app/vmalert/deployment/Dockerfile
Normal file
8
app/vmalert/deployment/Dockerfile
Normal file
@@ -0,0 +1,8 @@
|
||||
ARG base_image
|
||||
FROM $base_image
|
||||
|
||||
EXPOSE 8880
|
||||
|
||||
ENTRYPOINT ["/vmalert-prod"]
|
||||
ARG src_binary
|
||||
COPY $src_binary ./vmalert-prod
|
||||
300
app/vmalert/group.go
Normal file
300
app/vmalert/group.go
Normal file
@@ -0,0 +1,300 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
type Group struct {
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Interval time.Duration
|
||||
Concurrency int
|
||||
|
||||
doneCh chan struct{}
|
||||
finishedCh chan struct{}
|
||||
// channel accepts new Group obj
|
||||
// which supposed to update current group
|
||||
updateCh chan *Group
|
||||
}
|
||||
|
||||
func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
|
||||
g := &Group{
|
||||
Name: cfg.Name,
|
||||
File: cfg.File,
|
||||
Interval: cfg.Interval,
|
||||
Concurrency: cfg.Concurrency,
|
||||
doneCh: make(chan struct{}),
|
||||
finishedCh: make(chan struct{}),
|
||||
updateCh: make(chan *Group),
|
||||
}
|
||||
if g.Interval == 0 {
|
||||
g.Interval = defaultInterval
|
||||
}
|
||||
if g.Concurrency < 1 {
|
||||
g.Concurrency = 1
|
||||
}
|
||||
rules := make([]Rule, len(cfg.Rules))
|
||||
for i, r := range cfg.Rules {
|
||||
rules[i] = g.newRule(r)
|
||||
}
|
||||
g.Rules = rules
|
||||
return g
|
||||
}
|
||||
|
||||
func (g *Group) newRule(rule config.Rule) Rule {
|
||||
if rule.Alert != "" {
|
||||
return newAlertingRule(g.ID(), rule)
|
||||
}
|
||||
return newRecordingRule(g.ID(), rule)
|
||||
}
|
||||
|
||||
// ID return unique group ID that consists of
|
||||
// rules file and group name
|
||||
func (g *Group) ID() uint64 {
|
||||
hash := fnv.New64a()
|
||||
hash.Write([]byte(g.File))
|
||||
hash.Write([]byte("\xff"))
|
||||
hash.Write([]byte(g.Name))
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
// Restore restores alerts state for group rules
|
||||
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
|
||||
for _, rule := range g.Rules {
|
||||
rr, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if rr.For < 1 {
|
||||
continue
|
||||
}
|
||||
if err := rr.Restore(ctx, q, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateWith updates existing group with
|
||||
// passed group object. This function ignores group
|
||||
// evaluation interval change. It supposed to be updated
|
||||
// in group.start function.
|
||||
// Not thread-safe.
|
||||
func (g *Group) updateWith(newGroup *Group) error {
|
||||
rulesRegistry := make(map[uint64]Rule)
|
||||
for _, nr := range newGroup.Rules {
|
||||
rulesRegistry[nr.ID()] = nr
|
||||
}
|
||||
|
||||
for i, or := range g.Rules {
|
||||
nr, ok := rulesRegistry[or.ID()]
|
||||
if !ok {
|
||||
// old rule is not present in the new list
|
||||
// so we mark it for removing
|
||||
g.Rules[i] = nil
|
||||
continue
|
||||
}
|
||||
if err := or.UpdateWith(nr); err != nil {
|
||||
return err
|
||||
}
|
||||
delete(rulesRegistry, nr.ID())
|
||||
}
|
||||
|
||||
var newRules []Rule
|
||||
for _, r := range g.Rules {
|
||||
if r == nil {
|
||||
// skip nil rules
|
||||
continue
|
||||
}
|
||||
newRules = append(newRules, r)
|
||||
}
|
||||
// add the rest of rules from registry
|
||||
for _, nr := range rulesRegistry {
|
||||
newRules = append(newRules, nr)
|
||||
}
|
||||
g.Concurrency = newGroup.Concurrency
|
||||
g.Rules = newRules
|
||||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
iterationTotal = metrics.NewCounter(`vmalert_iteration_total`)
|
||||
iterationDuration = metrics.NewSummary(`vmalert_iteration_duration_seconds`)
|
||||
|
||||
execTotal = metrics.NewCounter(`vmalert_execution_total`)
|
||||
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
||||
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
|
||||
|
||||
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
|
||||
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
|
||||
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
|
||||
|
||||
remoteWriteSent = metrics.NewCounter(`vmalert_remotewrite_sent_total`)
|
||||
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||
)
|
||||
|
||||
func (g *Group) close() {
|
||||
if g.doneCh == nil {
|
||||
return
|
||||
}
|
||||
close(g.doneCh)
|
||||
<-g.finishedCh
|
||||
}
|
||||
|
||||
func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []notifier.Notifier, rw *remotewrite.Client) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
e := &executor{querier, nts, rw}
|
||||
t := time.NewTicker(g.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Infof("group %q: context cancelled", g.Name)
|
||||
return
|
||||
case <-g.doneCh:
|
||||
logger.Infof("group %q: received stop signal", g.Name)
|
||||
return
|
||||
case ng := <-g.updateCh:
|
||||
g.mu.Lock()
|
||||
err := g.updateWith(ng)
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: failed to update: %s", g.Name, err)
|
||||
g.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
if g.Interval != ng.Interval {
|
||||
g.Interval = ng.Interval
|
||||
t.Stop()
|
||||
t = time.NewTicker(g.Interval)
|
||||
}
|
||||
g.mu.Unlock()
|
||||
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
case <-t.C:
|
||||
iterationTotal.Inc()
|
||||
iterationStart := time.Now()
|
||||
|
||||
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: %s", g.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
iterationDuration.UpdateDuration(iterationStart)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type executor struct {
|
||||
querier datasource.Querier
|
||||
notifiers []notifier.Notifier
|
||||
rw *remotewrite.Client
|
||||
}
|
||||
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
|
||||
res := make(chan error, len(rules))
|
||||
var returnSeries bool
|
||||
if e.rw != nil {
|
||||
returnSeries = true
|
||||
}
|
||||
|
||||
if concurrency == 1 {
|
||||
// fast path
|
||||
for _, rule := range rules {
|
||||
res <- e.exec(ctx, rule, returnSeries, interval)
|
||||
}
|
||||
close(res)
|
||||
return res
|
||||
}
|
||||
|
||||
sem := make(chan struct{}, concurrency)
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
for _, rule := range rules {
|
||||
sem <- struct{}{}
|
||||
wg.Add(1)
|
||||
go func(r Rule) {
|
||||
res <- e.exec(ctx, r, returnSeries, interval)
|
||||
<-sem
|
||||
wg.Done()
|
||||
}(rule)
|
||||
}
|
||||
wg.Wait()
|
||||
close(res)
|
||||
}()
|
||||
return res
|
||||
}
|
||||
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, interval time.Duration) error {
|
||||
execTotal.Inc()
|
||||
execStart := time.Now()
|
||||
defer func() {
|
||||
execDuration.UpdateDuration(execStart)
|
||||
}()
|
||||
|
||||
tss, err := rule.Exec(ctx, e.querier, returnSeries)
|
||||
if err != nil {
|
||||
execErrors.Inc()
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
|
||||
}
|
||||
|
||||
if len(tss) > 0 && e.rw != nil {
|
||||
remoteWriteSent.Add(len(tss))
|
||||
for _, ts := range tss {
|
||||
if err := e.rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
return fmt.Errorf("rule %q: remote write failure: %w", rule, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
var alerts []notifier.Alert
|
||||
for _, a := range ar.alerts {
|
||||
switch a.State {
|
||||
case notifier.StateFiring:
|
||||
// set End to execStart + 3 intervals
|
||||
// so notifier can resolve it automatically if `vmalert`
|
||||
// won't be able to send resolve for some reason
|
||||
a.End = time.Now().Add(3 * interval)
|
||||
alerts = append(alerts, *a)
|
||||
case notifier.StateInactive:
|
||||
// set End to execStart to notify
|
||||
// that it was just resolved
|
||||
a.End = time.Now()
|
||||
alerts = append(alerts, *a)
|
||||
}
|
||||
}
|
||||
if len(alerts) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
alertsSent.Add(len(alerts))
|
||||
errGr := new(utils.ErrGroup)
|
||||
for _, nt := range e.notifiers {
|
||||
if err := nt.Send(ctx, alerts); err != nil {
|
||||
alertsSendErrors.Inc()
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
|
||||
}
|
||||
}
|
||||
return errGr.Err()
|
||||
}
|
||||
207
app/vmalert/group_test.go
Normal file
207
app/vmalert/group_test.go
Normal file
@@ -0,0 +1,207 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestUpdateWith(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
currentRules []config.Rule
|
||||
newRules []config.Rule
|
||||
}{
|
||||
{
|
||||
"new rule",
|
||||
nil,
|
||||
[]config.Rule{{Alert: "bar"}},
|
||||
},
|
||||
{
|
||||
"update alerting rule",
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 0",
|
||||
For: time.Second,
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value|humanize }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}},
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 10",
|
||||
For: time.Second,
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "none",
|
||||
},
|
||||
}},
|
||||
},
|
||||
{
|
||||
"update recording rule",
|
||||
[]config.Rule{{
|
||||
Record: "foo",
|
||||
Expr: "max(up)",
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
}},
|
||||
[]config.Rule{{
|
||||
Record: "foo",
|
||||
Expr: "min(up)",
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
}},
|
||||
},
|
||||
{
|
||||
"empty rule",
|
||||
[]config.Rule{{Alert: "foo"}, {Record: "bar"}},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
"multiple rules",
|
||||
[]config.Rule{
|
||||
{Alert: "bar"},
|
||||
{Alert: "baz"},
|
||||
{Alert: "foo"},
|
||||
},
|
||||
[]config.Rule{
|
||||
{Alert: "baz"},
|
||||
{Record: "foo"},
|
||||
},
|
||||
},
|
||||
{
|
||||
"replace rule",
|
||||
[]config.Rule{{Alert: "foo1"}},
|
||||
[]config.Rule{{Alert: "foo2"}},
|
||||
},
|
||||
{
|
||||
"replace multiple rules",
|
||||
[]config.Rule{
|
||||
{Alert: "foo1"},
|
||||
{Record: "foo2"},
|
||||
{Alert: "foo3"},
|
||||
},
|
||||
[]config.Rule{
|
||||
{Alert: "foo3"},
|
||||
{Alert: "foo4"},
|
||||
{Record: "foo5"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
g := &Group{Name: "test"}
|
||||
for _, r := range tc.currentRules {
|
||||
r.ID = config.HashRule(r)
|
||||
g.Rules = append(g.Rules, g.newRule(r))
|
||||
}
|
||||
|
||||
ng := &Group{Name: "test"}
|
||||
for _, r := range tc.newRules {
|
||||
r.ID = config.HashRule(r)
|
||||
ng.Rules = append(ng.Rules, ng.newRule(r))
|
||||
}
|
||||
|
||||
err := g.updateWith(ng)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(g.Rules) != len(tc.newRules) {
|
||||
t.Fatalf("expected to have %d rules; got: %d",
|
||||
len(g.Rules), len(tc.newRules))
|
||||
}
|
||||
sort.Slice(g.Rules, func(i, j int) bool {
|
||||
return g.Rules[i].ID() < g.Rules[j].ID()
|
||||
})
|
||||
sort.Slice(ng.Rules, func(i, j int) bool {
|
||||
return ng.Rules[i].ID() < ng.Rules[j].ID()
|
||||
})
|
||||
for i, r := range g.Rules {
|
||||
got, want := r, ng.Rules[i]
|
||||
if got.ID() != want.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want, got)
|
||||
}
|
||||
if err := compareRules(t, got, want); err != nil {
|
||||
t.Fatalf("comparsion error: %s", err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroupStart(t *testing.T) {
|
||||
// TODO: make parsing from string instead of file
|
||||
groups, err := config.Parse([]string{"config/testdata/rules1-good.rules"}, true, true)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], evalInterval)
|
||||
g.Concurrency = 2
|
||||
|
||||
fn := &fakeNotifier{}
|
||||
fs := &fakeQuerier{}
|
||||
|
||||
const inst1, inst2, job = "foo", "bar", "baz"
|
||||
m1 := metricWithLabels(t, "instance", inst1, "job", job)
|
||||
m2 := metricWithLabels(t, "instance", inst2, "job", job)
|
||||
|
||||
r := g.Rules[0].(*AlertingRule)
|
||||
alert1, err := r.newAlert(m1, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
alert1.State = notifier.StateFiring
|
||||
alert1.ID = hash(m1)
|
||||
|
||||
alert2, err := r.newAlert(m2, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
alert2.State = notifier.StateFiring
|
||||
alert2.ID = hash(m2)
|
||||
|
||||
finished := make(chan struct{})
|
||||
fs.add(m1)
|
||||
fs.add(m2)
|
||||
go func() {
|
||||
g.start(context.Background(), fs, []notifier.Notifier{fn}, nil)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
// wait for multiple evals
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts := fn.getAlerts()
|
||||
expectedAlerts := []notifier.Alert{*alert1, *alert2}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
// reset previous data
|
||||
fs.reset()
|
||||
// and set only one datapoint for response
|
||||
fs.add(m1)
|
||||
|
||||
// wait for multiple evals
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts = fn.getAlerts()
|
||||
expectedAlerts = []notifier.Alert{*alert1}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
g.close()
|
||||
<-finished
|
||||
}
|
||||
232
app/vmalert/helpers_test.go
Normal file
232
app/vmalert/helpers_test.go
Normal file
@@ -0,0 +1,232 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
type fakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []datasource.Metric
|
||||
err error
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) setErr(err error) {
|
||||
fq.Lock()
|
||||
fq.err = err
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) reset() {
|
||||
fq.Lock()
|
||||
fq.err = nil
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
return nil, fq.err
|
||||
}
|
||||
cp := make([]datasource.Metric, len(fq.metrics))
|
||||
copy(cp, fq.metrics)
|
||||
return cp, nil
|
||||
}
|
||||
|
||||
type fakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []notifier.Alert
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Value = value
|
||||
return m
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func compareGroups(t *testing.T, a, b *Group) {
|
||||
t.Helper()
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.File != b.File {
|
||||
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
|
||||
}
|
||||
if a.Interval != b.Interval {
|
||||
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
|
||||
}
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if a.ID() != b.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
|
||||
}
|
||||
if err := compareRules(t, want, got); err != nil {
|
||||
t.Fatalf("comparsion error: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func compareRules(t *testing.T, a, b Rule) error {
|
||||
t.Helper()
|
||||
switch v := a.(type) {
|
||||
case *AlertingRule:
|
||||
br, ok := b.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("rule %q supposed to be of type AlertingRule", b.ID())
|
||||
}
|
||||
return compareAlertingRules(t, v, br)
|
||||
case *RecordingRule:
|
||||
br, ok := b.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("rule %q supposed to be of type RecordingRule", b.ID())
|
||||
}
|
||||
return compareRecordingRules(t, v, br)
|
||||
default:
|
||||
return fmt.Errorf("unexpected rule type received %T", a)
|
||||
}
|
||||
}
|
||||
|
||||
func compareRecordingRules(t *testing.T, a, b *RecordingRule) error {
|
||||
t.Helper()
|
||||
if a.Expr != b.Expr {
|
||||
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
|
||||
t.Helper()
|
||||
if a.Expr != b.Expr {
|
||||
return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
|
||||
}
|
||||
if a.For != b.For {
|
||||
return fmt.Errorf("expected to have for %q; got %q", a.For, b.For)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
|
||||
return fmt.Errorf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
|
||||
t.Helper()
|
||||
if len(a) != len(b) {
|
||||
return fmt.Errorf("expected number of timeseries %d; got %d", len(a), len(b))
|
||||
}
|
||||
for i := range a {
|
||||
expTS, gotTS := a[i], b[i]
|
||||
if len(expTS.Samples) != len(gotTS.Samples) {
|
||||
return fmt.Errorf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
|
||||
}
|
||||
for i, exp := range expTS.Samples {
|
||||
got := gotTS.Samples[i]
|
||||
if got.Value != exp.Value {
|
||||
return fmt.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
|
||||
}
|
||||
// timestamp validation isn't always correct for now.
|
||||
// this must be improved with time mock.
|
||||
/*if got.Timestamp != exp.Timestamp {
|
||||
return fmt.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
|
||||
}*/
|
||||
}
|
||||
if len(expTS.Labels) != len(gotTS.Labels) {
|
||||
return fmt.Errorf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
|
||||
}
|
||||
for i, exp := range expTS.Labels {
|
||||
got := gotTS.Labels[i]
|
||||
if got.Name != exp.Name {
|
||||
return fmt.Errorf("expected label name %q; got %q", exp.Name, got.Name)
|
||||
}
|
||||
if got.Value != exp.Value {
|
||||
return fmt.Errorf("expected label value %q; got %q", exp.Value, got.Value)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
|
||||
t.Helper()
|
||||
if len(as) != len(bs) {
|
||||
t.Fatalf("expected to have length %d; got %d", len(as), len(bs))
|
||||
}
|
||||
sort.Slice(as, func(i, j int) bool {
|
||||
return as[i].ID < as[j].ID
|
||||
})
|
||||
sort.Slice(bs, func(i, j int) bool {
|
||||
return bs[i].ID < bs[j].ID
|
||||
})
|
||||
for i := range as {
|
||||
a, b := as[i], bs[i]
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected t have Name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.State != b.State {
|
||||
t.Fatalf("expected t have State %q; got %q", a.State, b.State)
|
||||
}
|
||||
if a.Value != b.Value {
|
||||
t.Fatalf("expected t have Value %f; got %f", a.Value, b.Value)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
|
||||
t.Fatalf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
|
||||
}
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
t.Fatalf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,131 +4,194 @@ import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/provider"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
configPath = flag.String("config", "config.yaml", "Path to alert configuration file")
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
||||
rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
|
||||
Supports patterns. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule /path/to/file. Path to a single file with alerting rules
|
||||
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.`)
|
||||
|
||||
datasourceURL = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter. e.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username to use for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password to use for -datasource.url")
|
||||
evaluationInterval = flag.Duration("evaluationInterval", 1*time.Minute, "How often to evaluate the rules. Default 1m")
|
||||
providerURL = flag.String("provider.url", "", "Prometheus alertmanager url. Required parameter. e.g. http://127.0.0.1:9093")
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
|
||||
|
||||
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
|
||||
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
|
||||
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
logger.Infof("reading alert rules configuration file from %s", *configPath)
|
||||
alertGroups, err := config.Parse(*configPath)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
manager, err := newManager(ctx)
|
||||
if err != nil {
|
||||
logger.Fatalf("Cannot parse configuration file %s", err)
|
||||
logger.Fatalf("failed to init: %s", err)
|
||||
}
|
||||
addr := getWebServerAddr(*httpListenAddr, false)
|
||||
w := &watchdog{
|
||||
storage: datasource.NewVMStorage(*datasourceURL, *basicAuthUsername, *basicAuthPassword, &http.Client{}),
|
||||
alertProvider: provider.NewAlertManager(*providerURL, func(group, name string) string {
|
||||
return addr + fmt.Sprintf("/%s/%s/status", group, name)
|
||||
}, &http.Client{}),
|
||||
}
|
||||
for id := range alertGroups {
|
||||
go func(group config.Group) {
|
||||
w.run(ctx, group, *evaluationInterval)
|
||||
}(alertGroups[id])
|
||||
if err := manager.start(ctx, *rulePath, *validateTemplates, *validateExpressions); err != nil {
|
||||
logger.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
go func() {
|
||||
httpserver.Serve(*httpListenAddr, func(w http.ResponseWriter, r *http.Request) bool {
|
||||
panic("not implemented")
|
||||
})
|
||||
// init reload metrics with positive values to improve alerting conditions
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
sigHup := procutil.NewSighupChan()
|
||||
for {
|
||||
<-sigHup
|
||||
configReloads.Inc()
|
||||
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
|
||||
if err := manager.update(ctx, *rulePath, *validateTemplates, *validateExpressions, false); err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("error while reloading rules: %s", err)
|
||||
continue
|
||||
}
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
||||
}
|
||||
}()
|
||||
|
||||
rh := &requestHandler{m: manager}
|
||||
go httpserver.Serve(*httpListenAddr, rh.handler)
|
||||
|
||||
sig := procutil.WaitForSigterm()
|
||||
logger.Infof("service received signal %s", sig)
|
||||
if err := httpserver.Stop(*httpListenAddr); err != nil {
|
||||
logger.Fatalf("cannot stop the webservice: %s", err)
|
||||
}
|
||||
cancel()
|
||||
w.stop()
|
||||
manager.close()
|
||||
}
|
||||
|
||||
type watchdog struct {
|
||||
storage *datasource.VMStorage
|
||||
alertProvider provider.AlertProvider
|
||||
}
|
||||
var (
|
||||
configReloads = metrics.NewCounter(`vmalert_config_last_reload_total`)
|
||||
configReloadErrors = metrics.NewCounter(`vmalert_config_last_reload_errors_total`)
|
||||
configSuccess = metrics.NewCounter(`vmalert_config_last_reload_successful`)
|
||||
configTimestamp = metrics.NewCounter(`vmalert_config_last_reload_success_timestamp_seconds`)
|
||||
)
|
||||
|
||||
func (w *watchdog) run(ctx context.Context, a config.Group, evaluationInterval time.Duration) {
|
||||
t := time.NewTicker(evaluationInterval)
|
||||
var metrics []datasource.Metric
|
||||
var err error
|
||||
var alerts []provider.Alert
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-t.C:
|
||||
for _, r := range a.Rules {
|
||||
if metrics, err = w.storage.Query(ctx, r.Expr); err != nil {
|
||||
logger.Errorf("error reading metrics %s", err)
|
||||
continue
|
||||
}
|
||||
// todo check for and calculate alert states
|
||||
if len(metrics) < 1 {
|
||||
continue
|
||||
}
|
||||
alerts = provider.AlertsFromMetrics(metrics, a.Name, r)
|
||||
// todo save to storage
|
||||
if err := w.alertProvider.Send(alerts); err != nil {
|
||||
logger.Errorf("error sending alerts %s", err)
|
||||
continue
|
||||
}
|
||||
// todo is alert still active/pending?
|
||||
}
|
||||
|
||||
case <-ctx.Done():
|
||||
logger.Infof("%s receive stop signal", a.Name)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getWebServerAddr(httpListenAddr string, isSecure bool) string {
|
||||
if strings.Index(httpListenAddr, ":") != 0 {
|
||||
if isSecure {
|
||||
return "https://" + httpListenAddr
|
||||
}
|
||||
return "http://" + httpListenAddr
|
||||
}
|
||||
|
||||
addrs, err := net.InterfaceAddrs()
|
||||
func newManager(ctx context.Context) (*manager, error) {
|
||||
q, err := datasource.Init()
|
||||
if err != nil {
|
||||
panic("error getting the interface addresses ")
|
||||
return nil, fmt.Errorf("failed to init datasource: %w", err)
|
||||
}
|
||||
for _, a := range addrs {
|
||||
if ipnet, ok := a.(*net.IPNet); ok && !ipnet.IP.IsLoopback() {
|
||||
if ipnet.IP.To4() != nil {
|
||||
return "http://" + ipnet.IP.String() + httpListenAddr
|
||||
}
|
||||
}
|
||||
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init `external.url`: %w", err)
|
||||
}
|
||||
// no loopback ip return internal address
|
||||
return "http://127.0.0.1" + httpListenAddr
|
||||
notifier.InitTemplateFunc(eu)
|
||||
aug, err := getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init `external.alert.source`: %w", err)
|
||||
}
|
||||
nts, err := notifier.Init(aug)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init notifier: %w", err)
|
||||
}
|
||||
|
||||
manager := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querier: q,
|
||||
notifiers: nts,
|
||||
}
|
||||
rw, err := remotewrite.Init(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
|
||||
}
|
||||
manager.rw = rw
|
||||
|
||||
rr, err := remoteread.Init()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init remoteRead: %w", err)
|
||||
}
|
||||
manager.rr = rr
|
||||
return manager, nil
|
||||
}
|
||||
|
||||
func (w *watchdog) stop() {
|
||||
panic("not implemented")
|
||||
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
|
||||
if externalURL != "" {
|
||||
return url.Parse(externalURL)
|
||||
}
|
||||
hname, err := os.Hostname()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
port := ""
|
||||
if ipport := strings.Split(httpListenAddr, ":"); len(ipport) > 1 {
|
||||
port = ":" + ipport[1]
|
||||
}
|
||||
schema := "http://"
|
||||
if isSecure {
|
||||
schema = "https://"
|
||||
}
|
||||
return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
|
||||
}
|
||||
|
||||
func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, validateTemplate bool) (notifier.AlertURLGenerator, error) {
|
||||
if externalAlertSource == "" {
|
||||
return func(alert notifier.Alert) string {
|
||||
return fmt.Sprintf("%s/api/v1/%s/%s/status", externalURL, strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10))
|
||||
}, nil
|
||||
}
|
||||
if validateTemplate {
|
||||
if err := notifier.ValidateTemplates(map[string]string{
|
||||
"tpl": externalAlertSource,
|
||||
}); err != nil {
|
||||
return nil, fmt.Errorf("error validating source template %s: %w", externalAlertSource, err)
|
||||
}
|
||||
}
|
||||
m := map[string]string{
|
||||
"tpl": externalAlertSource,
|
||||
}
|
||||
return func(alert notifier.Alert) string {
|
||||
templated, err := alert.ExecTemplate(m)
|
||||
if err != nil {
|
||||
logger.Errorf("can not exec source template %s", err)
|
||||
}
|
||||
return fmt.Sprintf("%s/%s", externalURL, templated["tpl"])
|
||||
}, nil
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmalert processes alerts and recording rules.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
53
app/vmalert/main_test.go
Normal file
53
app/vmalert/main_test.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestGetExternalURL(t *testing.T) {
|
||||
expURL := "https://vicotriametrics.com/path"
|
||||
u, err := getExternalURL(expURL, "", false)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if u.String() != expURL {
|
||||
t.Errorf("unexpected url want %s, got %s", expURL, u.String())
|
||||
}
|
||||
h, _ := os.Hostname()
|
||||
expURL = fmt.Sprintf("https://%s:4242", h)
|
||||
u, err = getExternalURL("", "0.0.0.0:4242", true)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if u.String() != expURL {
|
||||
t.Errorf("unexpected url want %s, got %s", expURL, u.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetAlertURLGenerator(t *testing.T) {
|
||||
testAlert := notifier.Alert{GroupID: 42, ID: 2, Value: 4}
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
fn, err := getAlertURLGenerator(u, "", false)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if exp := "https://victoriametrics.com/path/api/v1/42/2/status"; exp != fn(testAlert) {
|
||||
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
|
||||
}
|
||||
_, err = getAlertURLGenerator(nil, "foo?{{invalid}}", true)
|
||||
if err == nil {
|
||||
t.Errorf("exptected tempalte validation error got nil")
|
||||
}
|
||||
fn, err = getAlertURLGenerator(u, "foo?query={{$value}}", true)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if exp := "https://victoriametrics.com/path/foo?query=4"; exp != fn(testAlert) {
|
||||
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
|
||||
}
|
||||
}
|
||||
135
app/vmalert/manager.go
Normal file
135
app/vmalert/manager.go
Normal file
@@ -0,0 +1,135 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// manager controls group states
|
||||
type manager struct {
|
||||
querier datasource.Querier
|
||||
notifiers []notifier.Notifier
|
||||
|
||||
rw *remotewrite.Client
|
||||
rr datasource.Querier
|
||||
|
||||
wg sync.WaitGroup
|
||||
|
||||
groupsMu sync.RWMutex
|
||||
groups map[uint64]*Group
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
||||
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
|
||||
g, ok := m.groups[gID]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("can't find group with id %q", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
|
||||
return apiAlert, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
|
||||
}
|
||||
|
||||
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
|
||||
return m.update(ctx, path, validateTpl, validateExpr, true)
|
||||
}
|
||||
|
||||
func (m *manager) close() {
|
||||
if m.rw != nil {
|
||||
err := m.rw.Close()
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot stop the remotewrite: %s", err)
|
||||
}
|
||||
}
|
||||
m.wg.Wait()
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
|
||||
if restore && m.rr != nil {
|
||||
err := group.Restore(ctx, m.rr, *remoteReadLookBack)
|
||||
if err != nil {
|
||||
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
m.wg.Add(1)
|
||||
id := group.ID()
|
||||
go func() {
|
||||
group.start(ctx, m.querier, m.notifiers, m.rw)
|
||||
m.wg.Done()
|
||||
}()
|
||||
m.groups[id] = group
|
||||
}
|
||||
|
||||
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
|
||||
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
|
||||
groupsCfg, err := config.Parse(path, validateTpl, validateExpr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse configuration file: %w", err)
|
||||
}
|
||||
|
||||
groupsRegistry := make(map[uint64]*Group)
|
||||
for _, cfg := range groupsCfg {
|
||||
ng := newGroup(cfg, *evaluationInterval)
|
||||
groupsRegistry[ng.ID()] = ng
|
||||
}
|
||||
|
||||
m.groupsMu.Lock()
|
||||
for _, og := range m.groups {
|
||||
ng, ok := groupsRegistry[og.ID()]
|
||||
if !ok {
|
||||
// old group is not present in new list
|
||||
// and must be stopped and deleted
|
||||
og.close()
|
||||
delete(m.groups, og.ID())
|
||||
og = nil
|
||||
continue
|
||||
}
|
||||
og.updateCh <- ng
|
||||
delete(groupsRegistry, ng.ID())
|
||||
}
|
||||
|
||||
for _, ng := range groupsRegistry {
|
||||
m.startGroup(ctx, ng, restore)
|
||||
}
|
||||
m.groupsMu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) toAPI() APIGroup {
|
||||
ag := APIGroup{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
Name: g.Name,
|
||||
File: g.File,
|
||||
Interval: g.Interval.String(),
|
||||
Concurrency: g.Concurrency,
|
||||
}
|
||||
for _, r := range g.Rules {
|
||||
switch v := r.(type) {
|
||||
case *AlertingRule:
|
||||
ag.AlertingRules = append(ag.AlertingRules, v.RuleAPI())
|
||||
case *RecordingRule:
|
||||
ag.RecordingRules = append(ag.RecordingRules, v.RuleAPI())
|
||||
}
|
||||
}
|
||||
return ag
|
||||
}
|
||||
211
app/vmalert/manager_test.go
Normal file
211
app/vmalert/manager_test.go
Normal file
@@ -0,0 +1,211 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math/rand"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
notifier.InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestManagerUpdateError(t *testing.T) {
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
path := []string{"foo/bar"}
|
||||
err := m.update(context.Background(), path, true, true, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to have err; got nil instead")
|
||||
}
|
||||
expErr := "no groups found"
|
||||
if !strings.Contains(err.Error(), expErr) {
|
||||
t.Fatalf("expected to got err %s; got %s", expErr, err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestManagerUpdateConcurrent supposed to test concurrent
|
||||
// execution of configuration update.
|
||||
// Should be executed with -race flag
|
||||
func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querier: &fakeQuerier{},
|
||||
notifiers: []notifier.Notifier{&fakeNotifier{}},
|
||||
}
|
||||
paths := []string{
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
"config/testdata/dir/rules0-bad.rules",
|
||||
"config/testdata/dir/rules1-good.rules",
|
||||
"config/testdata/dir/rules1-bad.rules",
|
||||
"config/testdata/rules0-good.rules",
|
||||
"config/testdata/rules1-good.rules",
|
||||
"config/testdata/rules2-good.rules",
|
||||
}
|
||||
*evaluationInterval = time.Millisecond
|
||||
if err := m.start(context.Background(), []string{paths[0]}, true, true); err != nil {
|
||||
t.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
const workers = 500
|
||||
const iterations = 10
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < iterations; i++ {
|
||||
rnd := rand.Intn(len(paths))
|
||||
path := []string{paths[rnd]}
|
||||
_ = m.update(context.Background(), path, true, true, false)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// TestManagerUpdate tests sequential configuration
|
||||
// updates.
|
||||
func TestManagerUpdate(t *testing.T) {
|
||||
const defaultEvalInterval = time.Second * 30
|
||||
currentEvalInterval := *evaluationInterval
|
||||
*evaluationInterval = defaultEvalInterval
|
||||
defer func() {
|
||||
*evaluationInterval = currentEvalInterval
|
||||
}()
|
||||
|
||||
var (
|
||||
VMRows = &AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 10 * time.Second,
|
||||
Labels: map[string]string{
|
||||
"label": "bar",
|
||||
"host": "{{ $labels.instance }}",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value|humanize }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}
|
||||
Conns = &AlertingRule{
|
||||
Name: "Conns",
|
||||
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
|
||||
Annotations: map[string]string{
|
||||
"summary": "Too high connection number for {{$labels.instance}}",
|
||||
"description": "It is {{ $value }} connections for {{$labels.instance}}",
|
||||
},
|
||||
}
|
||||
ExampleAlertAlwaysFiring = &AlertingRule{
|
||||
Name: "ExampleAlertAlwaysFiring",
|
||||
Expr: "sum by(job) (up == 1)",
|
||||
}
|
||||
)
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
initPath string
|
||||
updatePath string
|
||||
want []*Group
|
||||
}{
|
||||
{
|
||||
name: "update good rules",
|
||||
initPath: "config/testdata/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules1-good.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "config/testdata/dir/rules1-good.rules",
|
||||
Name: "duplicatedGroupDiffFiles",
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{
|
||||
&AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 5 * time.Minute,
|
||||
Labels: map[string]string{"label": "bar"},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update good rules from 1 to 2 groups",
|
||||
initPath: "config/testdata/dir/rules1-good.rules",
|
||||
updatePath: "config/testdata/rules0-good.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Rules: []Rule{VMRows},
|
||||
Interval: defaultEvalInterval,
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update with one bad rule file",
|
||||
initPath: "config/testdata/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules2-bad.rules",
|
||||
want: []*Group{
|
||||
{
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.TODO())
|
||||
m := &manager{groups: make(map[uint64]*Group), querier: &fakeQuerier{}}
|
||||
path := []string{tc.initPath}
|
||||
if err := m.update(ctx, path, true, true, false); err != nil {
|
||||
t.Fatalf("failed to complete initial rules update: %s", err)
|
||||
}
|
||||
|
||||
path = []string{tc.updatePath}
|
||||
_ = m.update(ctx, path, true, true, false)
|
||||
if len(tc.want) != len(m.groups) {
|
||||
t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups))
|
||||
}
|
||||
|
||||
for _, wantG := range tc.want {
|
||||
gotG, ok := m.groups[wantG.ID()]
|
||||
if !ok {
|
||||
t.Fatalf("expected to have group %q", wantG.Name)
|
||||
}
|
||||
compareGroups(t, wantG, gotG)
|
||||
}
|
||||
|
||||
cancel()
|
||||
m.close()
|
||||
})
|
||||
}
|
||||
}
|
||||
109
app/vmalert/notifier/alert.go
Normal file
109
app/vmalert/notifier/alert.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
// Alert the triggered alert
|
||||
// TODO: Looks like alert name isn't unique
|
||||
type Alert struct {
|
||||
GroupID uint64
|
||||
Name string
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
State AlertState
|
||||
|
||||
Expr string
|
||||
Start time.Time
|
||||
End time.Time
|
||||
Value float64
|
||||
ID uint64
|
||||
}
|
||||
|
||||
// AlertState type indicates the Alert state
|
||||
type AlertState int
|
||||
|
||||
const (
|
||||
// StateInactive is the state of an alert that is neither firing nor pending.
|
||||
StateInactive AlertState = iota
|
||||
// StatePending is the state of an alert that has been active for less than
|
||||
// the configured threshold duration.
|
||||
StatePending
|
||||
// StateFiring is the state of an alert that has been active for longer than
|
||||
// the configured threshold duration.
|
||||
StateFiring
|
||||
)
|
||||
|
||||
// String stringer for AlertState
|
||||
func (as AlertState) String() string {
|
||||
switch as {
|
||||
case StateFiring:
|
||||
return "firing"
|
||||
case StatePending:
|
||||
return "pending"
|
||||
}
|
||||
return "inactive"
|
||||
}
|
||||
|
||||
type alertTplData struct {
|
||||
Labels map[string]string
|
||||
Value float64
|
||||
Expr string
|
||||
}
|
||||
|
||||
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}{{ $expr := .Expr }}`
|
||||
|
||||
// ExecTemplate executes the Alert template for give
|
||||
// map of annotations.
|
||||
func (a *Alert) ExecTemplate(annotations map[string]string) (map[string]string, error) {
|
||||
tplData := alertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
|
||||
return templateAnnotations(annotations, tplHeader, tplData)
|
||||
}
|
||||
|
||||
// ValidateTemplates validate annotations for possible template error, uses empty data for template population
|
||||
func ValidateTemplates(annotations map[string]string) error {
|
||||
_, err := templateAnnotations(annotations, tplHeader, alertTplData{
|
||||
Labels: map[string]string{},
|
||||
Value: 0,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
func templateAnnotations(annotations map[string]string, header string, data alertTplData) (map[string]string, error) {
|
||||
var builder strings.Builder
|
||||
var buf bytes.Buffer
|
||||
eg := new(utils.ErrGroup)
|
||||
r := make(map[string]string, len(annotations))
|
||||
for key, text := range annotations {
|
||||
r[key] = text
|
||||
buf.Reset()
|
||||
builder.Reset()
|
||||
builder.Grow(len(header) + len(text))
|
||||
builder.WriteString(header)
|
||||
builder.WriteString(text)
|
||||
if err := templateAnnotation(&buf, builder.String(), data); err != nil {
|
||||
eg.Add(fmt.Errorf("key %q, template %q: %w", key, text, err))
|
||||
continue
|
||||
}
|
||||
r[key] = buf.String()
|
||||
}
|
||||
return r, eg.Err()
|
||||
}
|
||||
|
||||
func templateAnnotation(dst io.Writer, text string, data alertTplData) error {
|
||||
tpl, err := template.New("").Funcs(tmplFunc).Option("missingkey=zero").Parse(text)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing annotation: %w", err)
|
||||
}
|
||||
if err = tpl.Execute(dst, data); err != nil {
|
||||
return fmt.Errorf("error evaluating annotation template: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
82
app/vmalert/notifier/alert_test.go
Normal file
82
app/vmalert/notifier/alert_test.go
Normal file
@@ -0,0 +1,82 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestAlert_ExecTemplate(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
alert *Alert
|
||||
annotations map[string]string
|
||||
expTpl map[string]string
|
||||
}{
|
||||
{
|
||||
name: "empty-alert",
|
||||
alert: &Alert{},
|
||||
annotations: map[string]string{},
|
||||
expTpl: map[string]string{},
|
||||
},
|
||||
{
|
||||
name: "no-template",
|
||||
alert: &Alert{
|
||||
Value: 1e4,
|
||||
Labels: map[string]string{
|
||||
"instance": "localhost",
|
||||
},
|
||||
},
|
||||
annotations: map[string]string{},
|
||||
expTpl: map[string]string{},
|
||||
},
|
||||
{
|
||||
name: "label-template",
|
||||
alert: &Alert{
|
||||
Value: 1e4,
|
||||
Labels: map[string]string{
|
||||
"job": "staging",
|
||||
"instance": "localhost",
|
||||
},
|
||||
},
|
||||
annotations: map[string]string{
|
||||
"summary": "Too high connection number for {{$labels.instance}} for job {{$labels.job}}",
|
||||
"description": "It is {{ $value }} connections for {{$labels.instance}}",
|
||||
},
|
||||
expTpl: map[string]string{
|
||||
"summary": "Too high connection number for localhost for job staging",
|
||||
"description": "It is 10000 connections for localhost",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "expression-template",
|
||||
alert: &Alert{
|
||||
Expr: `vm_rows{"label"="bar"}>0`,
|
||||
},
|
||||
annotations: map[string]string{
|
||||
"exprEscapedQuery": "{{ $expr|quotesEscape|queryEscape }}",
|
||||
"exprEscapedPath": "{{ $expr|quotesEscape|pathEscape }}",
|
||||
},
|
||||
expTpl: map[string]string{
|
||||
"exprEscapedQuery": "vm_rows%7B%5C%22label%5C%22%3D%5C%22bar%5C%22%7D%3E0",
|
||||
"exprEscapedPath": "vm_rows%7B%5C%22label%5C%22=%5C%22bar%5C%22%7D%3E0",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
tpl, err := tc.alert.ExecTemplate(tc.annotations)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(tpl) != len(tc.expTpl) {
|
||||
t.Fatalf("expected %d elements; got %d", len(tc.expTpl), len(tpl))
|
||||
}
|
||||
for k := range tc.expTpl {
|
||||
got, exp := tpl[k], tc.expTpl[k]
|
||||
if got != exp {
|
||||
t.Fatalf("expected %q=%q; got %q=%q", k, exp, k, got)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
68
app/vmalert/notifier/alertmanager.go
Normal file
68
app/vmalert/notifier/alertmanager.go
Normal file
@@ -0,0 +1,68 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// AlertManager represents integration provider with Prometheus alert manager
|
||||
// https://github.com/prometheus/alertmanager
|
||||
type AlertManager struct {
|
||||
alertURL string
|
||||
basicAuthUser string
|
||||
basicAuthPass string
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
// Send an alert or resolve message
|
||||
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
|
||||
b := &bytes.Buffer{}
|
||||
writeamRequest(b, alerts, am.argFunc)
|
||||
|
||||
req, err := http.NewRequest("POST", am.alertURL, b)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req = req.WithContext(ctx)
|
||||
if am.basicAuthPass != "" {
|
||||
req.SetBasicAuth(am.basicAuthUser, am.basicAuthPass)
|
||||
}
|
||||
resp, err := am.client.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read response from %q: %w", am.alertURL, err)
|
||||
}
|
||||
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.alertURL, string(body))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// AlertURLGenerator returns URL to single alert by given name
|
||||
type AlertURLGenerator func(Alert) string
|
||||
|
||||
const alertManagerPath = "/api/v2/alerts"
|
||||
|
||||
// NewAlertManager is a constructor for AlertManager
|
||||
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
|
||||
addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
|
||||
return &AlertManager{
|
||||
alertURL: addr,
|
||||
argFunc: fn,
|
||||
client: c,
|
||||
basicAuthUser: user,
|
||||
basicAuthPass: pass,
|
||||
}
|
||||
}
|
||||
@@ -3,19 +3,19 @@
|
||||
) %}
|
||||
{% stripspace %}
|
||||
|
||||
{% func amRequest(alerts []Alert, generatorURL func(string, string) string) %}
|
||||
{% func amRequest(alerts []Alert, generatorURL func(Alert) string) %}
|
||||
[
|
||||
{% for i, alert := range alerts %}
|
||||
{
|
||||
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
|
||||
"generatorURL": {%q= generatorURL(alert.Group, alert.Name) %},
|
||||
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
|
||||
"generatorURL": {%q= generatorURL(alert) %},
|
||||
{% if !alert.End.IsZero() %}
|
||||
"endsAt":{%q= alert.End.Format(time.RFC3339Nano) %},
|
||||
{% endif %}
|
||||
"labels": {
|
||||
"alertname":{%q= alert.Name %}
|
||||
{% for _,v := range alert.Labels %}
|
||||
,{%q= v.Name %}:{%q= v.Value %}
|
||||
{% for k,v := range alert.Labels %}
|
||||
,{%q= k %}:{%q= v %}
|
||||
{% endfor %}
|
||||
},
|
||||
"annotations": {
|
||||
130
app/vmalert/notifier/alertmanager_request.qtpl.go
Normal file
130
app/vmalert/notifier/alertmanager_request.qtpl.go
Normal file
@@ -0,0 +1,130 @@
|
||||
// Code generated by qtc from "alertmanager_request.qtpl". DO NOT EDIT.
|
||||
// See https://github.com/valyala/quicktemplate for details.
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:1
|
||||
package notifier
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:1
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
import (
|
||||
qtio422016 "io"
|
||||
|
||||
qt422016 "github.com/valyala/quicktemplate"
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
var (
|
||||
_ = qtio422016.Copy
|
||||
_ = qt422016.AcquireByteBuffer
|
||||
)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:6
|
||||
qw422016.N().S(`[`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:8
|
||||
for i, alert := range alerts {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:8
|
||||
qw422016.N().S(`{"startsAt":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:10
|
||||
qw422016.N().Q(alert.Start.Format(time.RFC3339Nano))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:10
|
||||
qw422016.N().S(`,"generatorURL":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
qw422016.N().Q(generatorURL(alert))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:11
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
if !alert.End.IsZero() {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:12
|
||||
qw422016.N().S(`"endsAt":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
qw422016.N().Q(alert.End.Format(time.RFC3339Nano))
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:13
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:14
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:14
|
||||
qw422016.N().S(`"labels": {"alertname":`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:16
|
||||
qw422016.N().Q(alert.Name)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:17
|
||||
for k, v := range alert.Labels {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:17
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().Q(k)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().S(`:`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:18
|
||||
qw422016.N().Q(v)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:19
|
||||
qw422016.N().S(`},"annotations": {`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:22
|
||||
c := len(alert.Annotations)
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:23
|
||||
for k, v := range alert.Annotations {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:24
|
||||
c = c - 1
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().Q(k)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().S(`:`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().Q(v)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
if c > 0 {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:25
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:26
|
||||
qw422016.N().S(`}}`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
if i != len(alerts)-1 {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:29
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
}
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:30
|
||||
qw422016.N().S(`]`)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
streamamRequest(qw422016, alerts, generatorURL)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qt422016.ReleaseWriter(qw422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
func amRequest(alerts []Alert, generatorURL func(Alert) string) string {
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
writeamRequest(qb422016, alerts, generatorURL)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qs422016 := string(qb422016.B)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
qt422016.ReleaseByteBuffer(qb422016)
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
return qs422016
|
||||
//line app/vmalert/notifier/alertmanager_request.qtpl:32
|
||||
}
|
||||
@@ -1,20 +1,31 @@
|
||||
package provider
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestAlertManager_Send(t *testing.T) {
|
||||
const baUser, baPass = "foo", "bar"
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
|
||||
t.Errorf("should not be called")
|
||||
})
|
||||
c := -1
|
||||
mux.HandleFunc(alertsPath, func(w http.ResponseWriter, r *http.Request) {
|
||||
mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) {
|
||||
user, pass, ok := r.BasicAuth()
|
||||
if !ok {
|
||||
t.Errorf("unauthorized request")
|
||||
}
|
||||
if user != baUser || pass != baPass {
|
||||
t.Errorf("wrong creds %q:%q; expected %q:%q",
|
||||
user, pass, baUser, baPass)
|
||||
}
|
||||
c++
|
||||
if r.Method != http.MethodPost {
|
||||
t.Errorf("expected POST method got %s", r.Method)
|
||||
@@ -40,8 +51,8 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
if len(a) != 1 {
|
||||
t.Errorf("expected 1 alert in array got %d", len(a))
|
||||
}
|
||||
if a[0].GeneratorURL != "group0alert0" {
|
||||
t.Errorf("exptected alert0 as generatorURL got %s", a[0].GeneratorURL)
|
||||
if a[0].GeneratorURL != "0/0" {
|
||||
t.Errorf("exptected 0/0 as generatorURL got %s", a[0].GeneratorURL)
|
||||
}
|
||||
if a[0].Labels["alertname"] != "alert0" {
|
||||
t.Errorf("exptected alert0 as alert name got %s", a[0].Labels["alertname"])
|
||||
@@ -56,17 +67,17 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
})
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
am := NewAlertManager(srv.URL, func(group, name string) string {
|
||||
return group + name
|
||||
am := NewAlertManager(srv.URL, baUser, baPass, func(alert Alert) string {
|
||||
return strconv.FormatUint(alert.GroupID, 10) + "/" + strconv.FormatUint(alert.ID, 10)
|
||||
}, srv.Client())
|
||||
if err := am.Send([]Alert{{}, {}}); err == nil {
|
||||
if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {
|
||||
t.Error("expected connection error got nil")
|
||||
}
|
||||
if err := am.Send([]Alert{}); err == nil {
|
||||
if err := am.Send(context.Background(), []Alert{}); err == nil {
|
||||
t.Error("expected wrong http code error got nil")
|
||||
}
|
||||
if err := am.Send([]Alert{{
|
||||
Group: "group0",
|
||||
if err := am.Send(context.Background(), []Alert{{
|
||||
GroupID: 0,
|
||||
Name: "alert0",
|
||||
Start: time.Now().UTC(),
|
||||
End: time.Now().UTC(),
|
||||
47
app/vmalert/notifier/init.go
Normal file
47
app/vmalert/notifier/init.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
)
|
||||
|
||||
var (
|
||||
addrs = flagutil.NewArray("notifier.url", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
|
||||
basicAuthUsername = flagutil.NewArray("notifier.basicAuth.username", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flagutil.NewArray("notifier.basicAuth.password", "Optional basic auth password for -datasource.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("notifier.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -notifier.url")
|
||||
tlsCertFile = flagutil.NewArray("notifier.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
|
||||
tlsKeyFile = flagutil.NewArray("notifier.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -notifier.url")
|
||||
tlsCAFile = flagutil.NewArray("notifier.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -notifier.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flagutil.NewArray("notifier.tlsServerName", "Optional TLS server name to use for connections to -notifier.url. "+
|
||||
"By default the server name from -notifier.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Notifier object based on provided flags.
|
||||
func Init(gen AlertURLGenerator) ([]Notifier, error) {
|
||||
if len(*addrs) == 0 {
|
||||
flag.PrintDefaults()
|
||||
return nil, fmt.Errorf("at least one `-notifier.url` must be set")
|
||||
}
|
||||
|
||||
var notifiers []Notifier
|
||||
for i, addr := range *addrs {
|
||||
cert, key := tlsCertFile.GetOptionalArg(i), tlsKeyFile.GetOptionalArg(i)
|
||||
ca, serverName := tlsCAFile.GetOptionalArg(i), tlsServerName.GetOptionalArg(i)
|
||||
tr, err := utils.Transport(addr, cert, key, ca, serverName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
user, pass := basicAuthUsername.GetOptionalArg(i), basicAuthPassword.GetOptionalArg(i)
|
||||
am := NewAlertManager(addr, user, pass, gen, &http.Client{Transport: tr})
|
||||
notifiers = append(notifiers, am)
|
||||
}
|
||||
|
||||
return notifiers, nil
|
||||
}
|
||||
8
app/vmalert/notifier/notifier.go
Normal file
8
app/vmalert/notifier/notifier.go
Normal file
@@ -0,0 +1,8 @@
|
||||
package notifier
|
||||
|
||||
import "context"
|
||||
|
||||
// Notifier is common interface for alert manager provider
|
||||
type Notifier interface {
|
||||
Send(ctx context.Context, alerts []Alert) error
|
||||
}
|
||||
13
app/vmalert/notifier/package_test.go
Normal file
13
app/vmalert/notifier/package_test.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
u, _ := url.Parse("https://victoriametrics.com/path")
|
||||
InitTemplateFunc(u)
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
180
app/vmalert/notifier/template_func.go
Normal file
180
app/vmalert/notifier/template_func.go
Normal file
@@ -0,0 +1,180 @@
|
||||
// Copyright 2013 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
html_template "html/template"
|
||||
"math"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
text_template "text/template"
|
||||
"time"
|
||||
)
|
||||
|
||||
var tmplFunc text_template.FuncMap
|
||||
|
||||
// InitTemplateFunc returns template helper functions
|
||||
func InitTemplateFunc(externalURL *url.URL) {
|
||||
tmplFunc = text_template.FuncMap{
|
||||
"args": func(args ...interface{}) map[string]interface{} {
|
||||
result := make(map[string]interface{})
|
||||
for i, a := range args {
|
||||
result[fmt.Sprintf("arg%d", i)] = a
|
||||
}
|
||||
return result
|
||||
},
|
||||
"reReplaceAll": func(pattern, repl, text string) string {
|
||||
re := regexp.MustCompile(pattern)
|
||||
return re.ReplaceAllString(text, repl)
|
||||
},
|
||||
"safeHtml": func(text string) html_template.HTML {
|
||||
return html_template.HTML(text)
|
||||
},
|
||||
"match": regexp.MatchString,
|
||||
"title": strings.Title,
|
||||
"toUpper": strings.ToUpper,
|
||||
"toLower": strings.ToLower,
|
||||
"humanize": func(v float64) string {
|
||||
if v == 0 || math.IsNaN(v) || math.IsInf(v, 0) {
|
||||
return fmt.Sprintf("%.4g", v)
|
||||
}
|
||||
if math.Abs(v) >= 1 {
|
||||
prefix := ""
|
||||
for _, p := range []string{"k", "M", "G", "T", "P", "E", "Z", "Y"} {
|
||||
if math.Abs(v) < 1000 {
|
||||
break
|
||||
}
|
||||
prefix = p
|
||||
v /= 1000
|
||||
}
|
||||
return fmt.Sprintf("%.4g%s", v, prefix)
|
||||
}
|
||||
prefix := ""
|
||||
for _, p := range []string{"m", "u", "n", "p", "f", "a", "z", "y"} {
|
||||
if math.Abs(v) >= 1 {
|
||||
break
|
||||
}
|
||||
prefix = p
|
||||
v *= 1000
|
||||
}
|
||||
return fmt.Sprintf("%.4g%s", v, prefix)
|
||||
},
|
||||
"humanize1024": func(v float64) string {
|
||||
if math.Abs(v) <= 1 || math.IsNaN(v) || math.IsInf(v, 0) {
|
||||
return fmt.Sprintf("%.4g", v)
|
||||
}
|
||||
prefix := ""
|
||||
for _, p := range []string{"ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"} {
|
||||
if math.Abs(v) < 1024 {
|
||||
break
|
||||
}
|
||||
prefix = p
|
||||
v /= 1024
|
||||
}
|
||||
return fmt.Sprintf("%.4g%s", v, prefix)
|
||||
},
|
||||
"humanizeDuration": func(v float64) string {
|
||||
if math.IsNaN(v) || math.IsInf(v, 0) {
|
||||
return fmt.Sprintf("%.4g", v)
|
||||
}
|
||||
if v == 0 {
|
||||
return fmt.Sprintf("%.4gs", v)
|
||||
}
|
||||
if math.Abs(v) >= 1 {
|
||||
sign := ""
|
||||
if v < 0 {
|
||||
sign = "-"
|
||||
v = -v
|
||||
}
|
||||
seconds := int64(v) % 60
|
||||
minutes := (int64(v) / 60) % 60
|
||||
hours := (int64(v) / 60 / 60) % 24
|
||||
days := int64(v) / 60 / 60 / 24
|
||||
// For days to minutes, we display seconds as an integer.
|
||||
if days != 0 {
|
||||
return fmt.Sprintf("%s%dd %dh %dm %ds", sign, days, hours, minutes, seconds)
|
||||
}
|
||||
if hours != 0 {
|
||||
return fmt.Sprintf("%s%dh %dm %ds", sign, hours, minutes, seconds)
|
||||
}
|
||||
if minutes != 0 {
|
||||
return fmt.Sprintf("%s%dm %ds", sign, minutes, seconds)
|
||||
}
|
||||
// For seconds, we display 4 significant digits.
|
||||
return fmt.Sprintf("%s%.4gs", sign, v)
|
||||
}
|
||||
prefix := ""
|
||||
for _, p := range []string{"m", "u", "n", "p", "f", "a", "z", "y"} {
|
||||
if math.Abs(v) >= 1 {
|
||||
break
|
||||
}
|
||||
prefix = p
|
||||
v *= 1000
|
||||
}
|
||||
return fmt.Sprintf("%.4g%ss", v, prefix)
|
||||
},
|
||||
"humanizePercentage": func(v float64) string {
|
||||
return fmt.Sprintf("%.4g%%", v*100)
|
||||
},
|
||||
"humanizeTimestamp": func(v float64) string {
|
||||
if math.IsNaN(v) || math.IsInf(v, 0) {
|
||||
return fmt.Sprintf("%.4g", v)
|
||||
}
|
||||
t := TimeFromUnixNano(int64(v * 1e9)).Time().UTC()
|
||||
return fmt.Sprint(t)
|
||||
},
|
||||
"pathPrefix": func() string {
|
||||
return externalURL.Path
|
||||
},
|
||||
"externalURL": func() string {
|
||||
return externalURL.String()
|
||||
},
|
||||
"pathEscape": func(u string) string {
|
||||
return url.PathEscape(u)
|
||||
},
|
||||
"queryEscape": func(q string) string {
|
||||
return url.QueryEscape(q)
|
||||
},
|
||||
"quotesEscape": func(q string) string {
|
||||
return strings.Replace(q, `"`, `\"`, -1)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Time is the number of milliseconds since the epoch
|
||||
// (1970-01-01 00:00 UTC) excluding leap seconds.
|
||||
type Time int64
|
||||
|
||||
// TimeFromUnixNano returns the Time equivalent to the Unix Time
|
||||
// t provided in nanoseconds.
|
||||
func TimeFromUnixNano(t int64) Time {
|
||||
return Time(t / nanosPerTick)
|
||||
}
|
||||
|
||||
// The number of nanoseconds per minimum tick.
|
||||
const nanosPerTick = int64(minimumTick / time.Nanosecond)
|
||||
|
||||
// MinimumTick is the minimum supported time resolution. This has to be
|
||||
// at least time.Second in order for the code below to work.
|
||||
const minimumTick = time.Millisecond
|
||||
|
||||
// second is the Time duration equivalent to one second.
|
||||
const second = int64(time.Second / minimumTick)
|
||||
|
||||
// Time returns the time.Time representation of t.
|
||||
func (t Time) Time() time.Time {
|
||||
return time.Unix(int64(t)/second, (int64(t)%second)*nanosPerTick)
|
||||
}
|
||||
@@ -1,130 +0,0 @@
|
||||
// Code generated by qtc from "alert_manager_request.qtpl". DO NOT EDIT.
|
||||
// See https://github.com/valyala/quicktemplate for details.
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:1
|
||||
package provider
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:1
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:6
|
||||
import (
|
||||
qtio422016 "io"
|
||||
|
||||
qt422016 "github.com/valyala/quicktemplate"
|
||||
)
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:6
|
||||
var (
|
||||
_ = qtio422016.Copy
|
||||
_ = qt422016.AcquireByteBuffer
|
||||
)
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:6
|
||||
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:6
|
||||
qw422016.N().S(`[`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:8
|
||||
for i, alert := range alerts {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:8
|
||||
qw422016.N().S(`{"startsAt":`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:10
|
||||
qw422016.N().Q(alert.Start.Format(time.RFC3339Nano))
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:10
|
||||
qw422016.N().S(`,"generatorURL":`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:11
|
||||
qw422016.N().Q(generatorURL(alert.Group, alert.Name))
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:11
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:12
|
||||
if !alert.End.IsZero() {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:12
|
||||
qw422016.N().S(`"endsAt":`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:13
|
||||
qw422016.N().Q(alert.End.Format(time.RFC3339Nano))
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:13
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:14
|
||||
}
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:14
|
||||
qw422016.N().S(`"labels": {"alertname":`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:16
|
||||
qw422016.N().Q(alert.Name)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:17
|
||||
for _, v := range alert.Labels {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:17
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:18
|
||||
qw422016.N().Q(v.Name)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:18
|
||||
qw422016.N().S(`:`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:18
|
||||
qw422016.N().Q(v.Value)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:19
|
||||
}
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:19
|
||||
qw422016.N().S(`},"annotations": {`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:22
|
||||
c := len(alert.Annotations)
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:23
|
||||
for k, v := range alert.Annotations {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:24
|
||||
c = c - 1
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:25
|
||||
qw422016.N().Q(k)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:25
|
||||
qw422016.N().S(`:`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:25
|
||||
qw422016.N().Q(v)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:25
|
||||
if c > 0 {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:25
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:25
|
||||
}
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:26
|
||||
}
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:26
|
||||
qw422016.N().S(`}}`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:29
|
||||
if i != len(alerts)-1 {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:29
|
||||
qw422016.N().S(`,`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:29
|
||||
}
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:30
|
||||
}
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:30
|
||||
qw422016.N().S(`]`)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
}
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
streamamRequest(qw422016, alerts, generatorURL)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
qt422016.ReleaseWriter(qw422016)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
}
|
||||
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
func amRequest(alerts []Alert, generatorURL func(string, string) string) string {
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
writeamRequest(qb422016, alerts, generatorURL)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
qs422016 := string(qb422016.B)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
qt422016.ReleaseByteBuffer(qb422016)
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
return qs422016
|
||||
//line app/vmalert/provider/alert_manager_request.qtpl:32
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
package provider
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
const alertsPath = "/api/v2/alerts"
|
||||
|
||||
var pool = sync.Pool{New: func() interface{} {
|
||||
return &bytes.Buffer{}
|
||||
}}
|
||||
|
||||
// AlertManager represents integration provider with Prometheus alert manager
|
||||
type AlertManager struct {
|
||||
alertURL string
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
// AlertURLGenerator returns URL to single alert by given name
|
||||
type AlertURLGenerator func(group, name string) string
|
||||
|
||||
// NewAlertManager is a constructor for AlertManager
|
||||
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, c *http.Client) *AlertManager {
|
||||
return &AlertManager{
|
||||
alertURL: strings.TrimSuffix(alertManagerURL, "/") + alertsPath,
|
||||
argFunc: fn,
|
||||
client: c,
|
||||
}
|
||||
}
|
||||
|
||||
// Send an alert or resolve message
|
||||
func (am *AlertManager) Send(alerts []Alert) error {
|
||||
b := pool.Get().(*bytes.Buffer)
|
||||
b.Reset()
|
||||
defer pool.Put(b)
|
||||
writeamRequest(b, alerts, am.argFunc)
|
||||
resp, err := am.client.Post(am.alertURL, "application/json", b)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
b.Reset()
|
||||
if _, err := io.Copy(b, resp.Body); err != nil {
|
||||
logger.Errorf("unable to copy error response body to buffer %s", err)
|
||||
}
|
||||
return fmt.Errorf("invalid response from alertmanager %s", b)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1,65 +0,0 @@
|
||||
package provider
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
)
|
||||
|
||||
// AlertProvider is common interface for alert manager provider
|
||||
type AlertProvider interface {
|
||||
Send(alerts []Alert) error
|
||||
}
|
||||
|
||||
// Alert the triggered alert
|
||||
type Alert struct {
|
||||
Group string
|
||||
Name string
|
||||
Labels []datasource.Label
|
||||
Annotations map[string]string
|
||||
|
||||
Start time.Time
|
||||
End time.Time
|
||||
Value float64
|
||||
}
|
||||
|
||||
// AlertsFromMetrics converts metrics to alerts by alert Rule
|
||||
func AlertsFromMetrics(metrics []datasource.Metric, group string, rule config.Rule) []Alert {
|
||||
alerts := make([]Alert, 0, len(metrics))
|
||||
for i, m := range metrics {
|
||||
a := Alert{
|
||||
Group: group,
|
||||
Name: rule.Name,
|
||||
Labels: metrics[i].Labels,
|
||||
// todo eval template in annotations
|
||||
Annotations: rule.Annotations,
|
||||
Start: time.Unix(m.Timestamp, 0),
|
||||
}
|
||||
for k, v := range rule.Labels {
|
||||
a.Labels = append(a.Labels, datasource.Label{
|
||||
Name: k,
|
||||
Value: v,
|
||||
})
|
||||
}
|
||||
a.Labels = removeDuplicated(a.Labels)
|
||||
alerts = append(alerts, a)
|
||||
}
|
||||
return alerts
|
||||
}
|
||||
|
||||
func removeDuplicated(l []datasource.Label) []datasource.Label {
|
||||
sort.Slice(l, func(i, j int) bool {
|
||||
return l[i].Name < l[j].Name
|
||||
})
|
||||
j := 0
|
||||
for i := 1; i < len(l); i++ {
|
||||
if l[j] == l[i] {
|
||||
continue
|
||||
}
|
||||
j++
|
||||
l[j] = l[i]
|
||||
}
|
||||
return l[:j+1]
|
||||
}
|
||||
149
app/vmalert/recording.go
Normal file
149
app/vmalert/recording.go
Normal file
@@ -0,0 +1,149 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// RecordingRule is a Rule that supposed
|
||||
// to evaluate configured Expression and
|
||||
// return TimeSeries as result.
|
||||
type RecordingRule struct {
|
||||
RuleID uint64
|
||||
Name string
|
||||
Expr string
|
||||
Labels map[string]string
|
||||
GroupID uint64
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
func (rr *RecordingRule) String() string {
|
||||
return rr.Name
|
||||
}
|
||||
|
||||
// ID returns unique Rule ID
|
||||
// within the parent Group.
|
||||
func (rr *RecordingRule) ID() uint64 {
|
||||
return rr.RuleID
|
||||
}
|
||||
|
||||
func newRecordingRule(gID uint64, cfg config.Rule) *RecordingRule {
|
||||
return &RecordingRule{
|
||||
RuleID: cfg.ID,
|
||||
Name: cfg.Record,
|
||||
Expr: cfg.Expr,
|
||||
Labels: cfg.Labels,
|
||||
GroupID: gID,
|
||||
}
|
||||
}
|
||||
|
||||
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")
|
||||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
if !series {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
qMetrics, err := q.Query(ctx, rr.Expr)
|
||||
|
||||
rr.mu.Lock()
|
||||
defer rr.mu.Unlock()
|
||||
|
||||
rr.lastExecTime = time.Now()
|
||||
rr.lastExecError = err
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
}
|
||||
|
||||
duplicates := make(map[uint64]prompbmarshal.TimeSeries, len(qMetrics))
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, r := range qMetrics {
|
||||
ts := rr.toTimeSeries(r, rr.lastExecTime)
|
||||
h := hashTimeSeries(ts)
|
||||
if _, ok := duplicates[h]; ok {
|
||||
rr.lastExecError = errDuplicate
|
||||
return nil, errDuplicate
|
||||
}
|
||||
duplicates[h] = ts
|
||||
tss = append(tss, ts)
|
||||
}
|
||||
return tss, nil
|
||||
}
|
||||
|
||||
func hashTimeSeries(ts prompbmarshal.TimeSeries) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := ts.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for _, l := range m.Labels {
|
||||
labels[l.Name] = l.Value
|
||||
}
|
||||
labels["__name__"] = rr.Name
|
||||
// override existing labels with configured ones
|
||||
for k, v := range rr.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
return newTimeSeries(m.Value, labels, timestamp)
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
|
||||
}
|
||||
rr.Expr = nr.Expr
|
||||
rr.Labels = nr.Labels
|
||||
return nil
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIRecordingRule
|
||||
func (rr *RecordingRule) RuleAPI() APIRecordingRule {
|
||||
var lastErr string
|
||||
if rr.lastExecError != nil {
|
||||
lastErr = rr.lastExecError.Error()
|
||||
}
|
||||
return APIRecordingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
Name: rr.Name,
|
||||
Expression: rr.Expr,
|
||||
LastError: lastErr,
|
||||
LastExec: rr.lastExecTime,
|
||||
Labels: rr.Labels,
|
||||
}
|
||||
}
|
||||
121
app/vmalert/recording_test.go
Normal file
121
app/vmalert/recording_test.go
Normal file
@@ -0,0 +1,121 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *RecordingRule
|
||||
metrics []datasource.Metric
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
&RecordingRule{Name: "foo"},
|
||||
[]datasource.Metric{metricWithValueAndLabels(t, 10,
|
||||
"__name__", "bar",
|
||||
)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(10, map[string]string{
|
||||
"__name__": "foo",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "foobarbaz"},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
|
||||
metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "foo",
|
||||
}, timestamp),
|
||||
newTimeSeries(2, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "bar",
|
||||
}, timestamp),
|
||||
newTimeSeries(3, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "baz",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(2, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "foo",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
newTimeSeries(1, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "bar",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
tss, err := tc.rule.Exec(context.TODO(), fq, true)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
|
||||
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"job": "test",
|
||||
}}
|
||||
|
||||
fq := &fakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
|
||||
_, err := rr.Exec(context.TODO(), fq, true)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), expErr) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
|
||||
}
|
||||
|
||||
fq.reset()
|
||||
|
||||
// add metrics which differs only by `job` label
|
||||
// which will be overridden by rule
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.Exec(context.TODO(), fq, true)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), errDuplicate.Error()) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
|
||||
}
|
||||
}
|
||||
39
app/vmalert/remoteread/init.go
Normal file
39
app/vmalert/remoteread/init.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package remoteread
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
|
||||
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
|
||||
basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
|
||||
tlsCertFile = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
|
||||
tlsKeyFile = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
|
||||
tlsCAFile = flag.String("remoteRead.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
|
||||
"By default the server name from -remoteRead.url is used")
|
||||
)
|
||||
|
||||
// Init creates a Querier from provided flag values.
|
||||
// Returns nil if addr flag wasn't set.
|
||||
func Init() (datasource.Querier, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &http.Client{Transport: tr}
|
||||
return datasource.NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, c), nil
|
||||
}
|
||||
54
app/vmalert/remotewrite/init.go
Normal file
54
app/vmalert/remotewrite/init.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
|
||||
" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
|
||||
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
|
||||
|
||||
maxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
|
||||
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines defines max number of timeseries to be flushed at once")
|
||||
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
|
||||
flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
|
||||
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
|
||||
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
|
||||
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
|
||||
"By default the server name from -remoteWrite.url is used")
|
||||
)
|
||||
|
||||
// Init creates Client object from given flags.
|
||||
// Returns nil if addr flag wasn't set.
|
||||
func Init(ctx context.Context) (*Client, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
|
||||
return NewClient(ctx, Config{
|
||||
Addr: *addr,
|
||||
Concurrency: *concurrency,
|
||||
MaxQueueSize: *maxQueueSize,
|
||||
MaxBatchSize: *maxBatchSize,
|
||||
FlushInterval: *flushInterval,
|
||||
BasicAuthUser: *basicAuthUsername,
|
||||
BasicAuthPass: *basicAuthPassword,
|
||||
Transport: t,
|
||||
})
|
||||
}
|
||||
206
app/vmalert/remotewrite/remotewrite.go
Normal file
206
app/vmalert/remotewrite/remotewrite.go
Normal file
@@ -0,0 +1,206 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
// Client is an asynchronous HTTP client for writing
|
||||
// timeseries via remote write protocol.
|
||||
type Client struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
input chan prompbmarshal.TimeSeries
|
||||
baUser, baPass string
|
||||
flushInterval time.Duration
|
||||
maxBatchSize int
|
||||
maxQueueSize int
|
||||
|
||||
wg sync.WaitGroup
|
||||
doneCh chan struct{}
|
||||
}
|
||||
|
||||
// Config is config for remote write.
|
||||
type Config struct {
|
||||
// Addr of remote storage
|
||||
Addr string
|
||||
|
||||
BasicAuthUser string
|
||||
BasicAuthPass string
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// WriteTimeout defines timeout for HTTP write request
|
||||
// to remote storage
|
||||
WriteTimeout time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
const (
|
||||
defaultConcurrency = 4
|
||||
defaultMaxBatchSize = 1e3
|
||||
defaultMaxQueueSize = 1e5
|
||||
defaultFlushInterval = time.Second
|
||||
defaultWriteTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
const writePath = "/api/v1/write"
|
||||
|
||||
// NewClient returns asynchronous client for
|
||||
// writing timeseries via remotewrite protocol.
|
||||
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
if cfg.Addr == "" {
|
||||
return nil, fmt.Errorf("config.Addr can't be empty")
|
||||
}
|
||||
if cfg.MaxBatchSize == 0 {
|
||||
cfg.MaxBatchSize = defaultMaxBatchSize
|
||||
}
|
||||
if cfg.MaxQueueSize == 0 {
|
||||
cfg.MaxQueueSize = defaultMaxQueueSize
|
||||
}
|
||||
if cfg.FlushInterval == 0 {
|
||||
cfg.FlushInterval = defaultFlushInterval
|
||||
}
|
||||
if cfg.WriteTimeout == 0 {
|
||||
cfg.WriteTimeout = defaultWriteTimeout
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: cfg.WriteTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/") + writePath,
|
||||
baUser: cfg.BasicAuthUser,
|
||||
baPass: cfg.BasicAuthPass,
|
||||
flushInterval: cfg.FlushInterval,
|
||||
maxBatchSize: cfg.MaxBatchSize,
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push adds timeseries into queue for writing into remote storage.
|
||||
// Push returns and error if client is stopped or if queue is full.
|
||||
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
return fmt.Errorf("client is closed")
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Close stops the client and waits for all goroutines
|
||||
// to exit.
|
||||
func (c *Client) Close() error {
|
||||
if c.doneCh == nil {
|
||||
return fmt.Errorf("client is already closed")
|
||||
}
|
||||
close(c.input)
|
||||
close(c.doneCh)
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) run(ctx context.Context) {
|
||||
ticker := time.NewTicker(c.flushInterval)
|
||||
wr := prompbmarshal.WriteRequest{}
|
||||
shutdown := func() {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
shutdown()
|
||||
return
|
||||
case <-ctx.Done():
|
||||
shutdown()
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.flush(ctx, wr)
|
||||
wr = prompbmarshal.WriteRequest{}
|
||||
case ts := <-c.input:
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
if len(wr.Timeseries) >= c.maxBatchSize {
|
||||
c.flush(ctx, wr)
|
||||
wr = prompbmarshal.WriteRequest{}
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (c *Client) flush(ctx context.Context, wr prompbmarshal.WriteRequest) {
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
logger.Errorf("failed to marshal WriteRequest: %s", err)
|
||||
return
|
||||
}
|
||||
req, err := http.NewRequest("POST", c.addr, bytes.NewReader(snappy.Encode(nil, data)))
|
||||
if err != nil {
|
||||
logger.Errorf("failed to create new HTTP request: %s", err)
|
||||
return
|
||||
}
|
||||
if c.baPass != "" {
|
||||
req.SetBasicAuth(c.baUser, c.baPass)
|
||||
}
|
||||
resp, err := c.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
logger.Errorf("error getting response from %s:%s", req.URL, err)
|
||||
return
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != http.StatusNoContent {
|
||||
body, _ := ioutil.ReadAll(resp.Body)
|
||||
logger.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
|
||||
return
|
||||
}
|
||||
}
|
||||
24
app/vmalert/rule.go
Normal file
24
app/vmalert/rule.go
Normal file
@@ -0,0 +1,24 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// Returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// Exec executes the rule with given context
|
||||
// and Querier. If returnSeries is true, Exec
|
||||
// may return TimeSeries as result of execution
|
||||
Exec(ctx context.Context, q datasource.Querier, returnSeries bool) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
package storage
|
||||
28
app/vmalert/utils.go
Normal file
28
app/vmalert/utils.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{}
|
||||
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
||||
Value: value,
|
||||
Timestamp: timestamp.UnixNano() / 1e6,
|
||||
})
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
||||
Name: key,
|
||||
Value: labels[key],
|
||||
})
|
||||
}
|
||||
return ts
|
||||
}
|
||||
43
app/vmalert/utils/err_group.go
Normal file
43
app/vmalert/utils/err_group.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ErrGroup accumulates multiple errors
|
||||
// and produces single error message.
|
||||
type ErrGroup struct {
|
||||
errs []error
|
||||
}
|
||||
|
||||
// Add adds a new error to group.
|
||||
// Isn't thread-safe.
|
||||
func (eg *ErrGroup) Add(err error) {
|
||||
eg.errs = append(eg.errs, err)
|
||||
}
|
||||
|
||||
// Err checks if group contains at least
|
||||
// one error.
|
||||
func (eg *ErrGroup) Err() error {
|
||||
if eg == nil || len(eg.errs) == 0 {
|
||||
return nil
|
||||
}
|
||||
return eg
|
||||
}
|
||||
|
||||
// Error satisfies Error interface
|
||||
func (eg *ErrGroup) Error() string {
|
||||
if len(eg.errs) == 0 {
|
||||
return ""
|
||||
}
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "errors(%d): ", len(eg.errs))
|
||||
for i, err := range eg.errs {
|
||||
b.WriteString(err.Error())
|
||||
if i != len(eg.errs)-1 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
38
app/vmalert/utils/err_group_test.go
Normal file
38
app/vmalert/utils/err_group_test.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestErrGroup(t *testing.T) {
|
||||
testCases := []struct {
|
||||
errs []error
|
||||
exp string
|
||||
}{
|
||||
{nil, ""},
|
||||
{[]error{errors.New("timeout")}, "errors(1): timeout"},
|
||||
{
|
||||
[]error{errors.New("timeout"), errors.New("deadline")},
|
||||
"errors(2): timeout\ndeadline",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
eg := new(ErrGroup)
|
||||
for _, err := range tc.errs {
|
||||
eg.Add(err)
|
||||
}
|
||||
if len(tc.errs) == 0 {
|
||||
if eg.Err() != nil {
|
||||
t.Fatalf("expected to get nil error")
|
||||
}
|
||||
continue
|
||||
}
|
||||
if eg.Err() == nil {
|
||||
t.Fatalf("expected to get non-nil error")
|
||||
}
|
||||
if eg.Error() != tc.exp {
|
||||
t.Fatalf("expected to have: \n%q\ngot:\n%q", tc.exp, eg.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
58
app/vmalert/utils/tls.go
Normal file
58
app/vmalert/utils/tls.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Transport creates http.Transport object based on provided URL.
|
||||
// Returns Transport with TLS configuration if URL contains `https` prefix
|
||||
func Transport(URL, certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*http.Transport, error) {
|
||||
t := http.DefaultTransport.(*http.Transport).Clone()
|
||||
if !strings.HasPrefix(URL, "https") {
|
||||
return t, nil
|
||||
}
|
||||
tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
t.TLSClientConfig = tlsCfg
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// TLSConfig creates tls.Config object from provided arguments
|
||||
func TLSConfig(certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*tls.Config, error) {
|
||||
var certs []tls.Certificate
|
||||
if certFile != "" {
|
||||
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load TLS certificate from `cert_file`=%q, `key_file`=%q: %w", certFile, keyFile, err)
|
||||
}
|
||||
|
||||
certs = []tls.Certificate{cert}
|
||||
}
|
||||
|
||||
var rootCAs *x509.CertPool
|
||||
if CAFile != "" {
|
||||
pem, err := ioutil.ReadFile(CAFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read `ca_file` %q: %w", CAFile, err)
|
||||
}
|
||||
|
||||
rootCAs = x509.NewCertPool()
|
||||
if !rootCAs.AppendCertsFromPEM(pem) {
|
||||
return nil, fmt.Errorf("cannot parse data from `ca_file` %q", CAFile)
|
||||
}
|
||||
}
|
||||
|
||||
return &tls.Config{
|
||||
Certificates: certs,
|
||||
InsecureSkipVerify: insecureSkipVerify,
|
||||
RootCAs: rootCAs,
|
||||
ServerName: serverName,
|
||||
}, nil
|
||||
}
|
||||
52
app/vmalert/utils/tls_test.go
Normal file
52
app/vmalert/utils/tls_test.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package utils
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestTLSConfig(t *testing.T) {
|
||||
var certFile, keyFile, CAFile, serverName string
|
||||
var insecureSkipVerify bool
|
||||
serverName = "test"
|
||||
insecureSkipVerify = true
|
||||
tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if tlsCfg == nil {
|
||||
t.Errorf("expected tlsConfig to be set, got nil")
|
||||
}
|
||||
if tlsCfg.ServerName != serverName {
|
||||
t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)
|
||||
}
|
||||
if tlsCfg.InsecureSkipVerify != insecureSkipVerify {
|
||||
t.Errorf("unexpected InsecureSkipVerify, want %v, got %v", insecureSkipVerify, tlsCfg.InsecureSkipVerify)
|
||||
}
|
||||
certFile = "/path/to/nonexisting/cert/file"
|
||||
_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err == nil {
|
||||
t.Errorf("expected keypair error, got nil")
|
||||
}
|
||||
certFile = ""
|
||||
CAFile = "/path/to/nonexisting/cert/file"
|
||||
_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err == nil {
|
||||
t.Errorf("expected read error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransport(t *testing.T) {
|
||||
var certFile, keyFile, CAFile, serverName string
|
||||
var insecureSkipVerify bool
|
||||
URL := "http://victoriametrics.com"
|
||||
_, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
URL = "https://victoriametrics.com"
|
||||
tr, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error %s", err)
|
||||
}
|
||||
if tr.TLSClientConfig == nil {
|
||||
t.Errorf("expected TLSClientConfig to be set, got nil")
|
||||
}
|
||||
}
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 25 KiB |
180
app/vmalert/web.go
Normal file
180
app/vmalert/web.go
Normal file
@@ -0,0 +1,180 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
)
|
||||
|
||||
type requestHandler struct {
|
||||
m *manager
|
||||
}
|
||||
|
||||
var pathList = [][]string{
|
||||
{"/api/v1/groups", "list all loaded groups and rules"},
|
||||
{"/api/v1/alerts", "list all active alerts"},
|
||||
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
|
||||
// /metrics is served by httpserver by default
|
||||
{"/metrics", "list of application metrics"},
|
||||
{"/-/reload", "reload configuration"},
|
||||
}
|
||||
|
||||
func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
resph := responseHandler{w}
|
||||
switch r.URL.Path {
|
||||
case "/":
|
||||
for _, path := range pathList {
|
||||
p, doc := path[0], path[1]
|
||||
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
|
||||
}
|
||||
return true
|
||||
case "/api/v1/groups":
|
||||
resph.handle(rh.listGroups())
|
||||
return true
|
||||
case "/api/v1/alerts":
|
||||
resph.handle(rh.listAlerts())
|
||||
return true
|
||||
case "/-/reload":
|
||||
logger.Infof("api config reload was called, sending sighup")
|
||||
procutil.SelfSIGHUP()
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return true
|
||||
default:
|
||||
// /api/v1/<groupName>/<alertID>/status
|
||||
if strings.HasSuffix(r.URL.Path, "/status") {
|
||||
resph.handle(rh.alert(r.URL.Path))
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
type listGroupsResponse struct {
|
||||
Data struct {
|
||||
Groups []APIGroup `json:"groups"`
|
||||
} `json:"data"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) listGroups() ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listGroupsResponse{Status: "success"}
|
||||
for _, g := range rh.m.groups {
|
||||
lr.Data.Groups = append(lr.Data.Groups, g.toAPI())
|
||||
}
|
||||
|
||||
// sort list of alerts for deterministic output
|
||||
sort.Slice(lr.Data.Groups, func(i, j int) bool {
|
||||
return lr.Data.Groups[i].Name < lr.Data.Groups[j].Name
|
||||
})
|
||||
|
||||
b, err := json.Marshal(lr)
|
||||
if err != nil {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
|
||||
StatusCode: http.StatusInternalServerError,
|
||||
}
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
type listAlertsResponse struct {
|
||||
Data struct {
|
||||
Alerts []*APIAlert `json:"alerts"`
|
||||
} `json:"data"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) listAlerts() ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listAlertsResponse{Status: "success"}
|
||||
for _, g := range rh.m.groups {
|
||||
for _, r := range g.Rules {
|
||||
a, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsAPI()...)
|
||||
}
|
||||
}
|
||||
|
||||
// sort list of alerts for deterministic output
|
||||
sort.Slice(lr.Data.Alerts, func(i, j int) bool {
|
||||
return lr.Data.Alerts[i].ID < lr.Data.Alerts[j].ID
|
||||
})
|
||||
|
||||
b, err := json.Marshal(lr)
|
||||
if err != nil {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
|
||||
StatusCode: http.StatusInternalServerError,
|
||||
}
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (rh *requestHandler) alert(path string) ([]byte, error) {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
parts := strings.SplitN(strings.TrimPrefix(path, "/api/v1/"), "/", 3)
|
||||
if len(parts) != 3 {
|
||||
return nil, &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf(`path %q cointains /status suffix but doesn't match pattern "/group/alert/status"`, path),
|
||||
StatusCode: http.StatusBadRequest,
|
||||
}
|
||||
}
|
||||
|
||||
groupID, err := uint64FromPath(parts[0])
|
||||
if err != nil {
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %w`, err))
|
||||
}
|
||||
alertID, err := uint64FromPath(parts[1])
|
||||
if err != nil {
|
||||
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %w`, err))
|
||||
}
|
||||
resp, err := rh.m.AlertAPI(groupID, alertID)
|
||||
if err != nil {
|
||||
return nil, errResponse(err, http.StatusNotFound)
|
||||
}
|
||||
return json.Marshal(resp)
|
||||
}
|
||||
|
||||
// responseHandler wrapper on http.ResponseWriter with sugar
|
||||
type responseHandler struct{ http.ResponseWriter }
|
||||
|
||||
func (w responseHandler) handle(b []byte, err error) {
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, "%s", err)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.Write(b)
|
||||
}
|
||||
|
||||
func uint64FromPath(path string) (uint64, error) {
|
||||
s := strings.TrimRight(path, "/")
|
||||
return strconv.ParseUint(s, 10, 0)
|
||||
}
|
||||
|
||||
func badRequest(err error) *httpserver.ErrorWithStatusCode {
|
||||
return errResponse(err, http.StatusBadRequest)
|
||||
}
|
||||
|
||||
func errResponse(err error, sc int) *httpserver.ErrorWithStatusCode {
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: err,
|
||||
StatusCode: sc,
|
||||
}
|
||||
}
|
||||
81
app/vmalert/web_test.go
Normal file
81
app/vmalert/web_test.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
func TestHandler(t *testing.T) {
|
||||
ar := &AlertingRule{
|
||||
Name: "alert",
|
||||
alerts: map[uint64]*notifier.Alert{
|
||||
0: {},
|
||||
},
|
||||
}
|
||||
g := &Group{
|
||||
Name: "group",
|
||||
Rules: []Rule{ar},
|
||||
}
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m.groups[0] = g
|
||||
rh := &requestHandler{m: m}
|
||||
|
||||
getResp := func(url string, to interface{}, code int) {
|
||||
t.Helper()
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected err %s", err)
|
||||
}
|
||||
if code != resp.StatusCode {
|
||||
t.Errorf("unexpected status code %d want %d", resp.StatusCode, code)
|
||||
}
|
||||
defer func() {
|
||||
if err := resp.Body.Close(); err != nil {
|
||||
t.Errorf("err closing body %s", err)
|
||||
}
|
||||
}()
|
||||
if to != nil {
|
||||
if err = json.NewDecoder(resp.Body).Decode(to); err != nil {
|
||||
t.Errorf("unexpected err %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rh.handler(w, r) }))
|
||||
defer ts.Close()
|
||||
t.Run("/api/v1/alerts", func(t *testing.T) {
|
||||
lr := listAlertsResponse{}
|
||||
getResp(ts.URL+"/api/v1/alerts", &lr, 200)
|
||||
if length := len(lr.Data.Alerts); length != 1 {
|
||||
t.Errorf("expected 1 alert got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/groups", func(t *testing.T) {
|
||||
lr := listGroupsResponse{}
|
||||
getResp(ts.URL+"/api/v1/groups", &lr, 200)
|
||||
if length := len(lr.Data.Groups); length != 1 {
|
||||
t.Errorf("expected 1 group got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/0/0/status", func(t *testing.T) {
|
||||
alert := &APIAlert{}
|
||||
getResp(ts.URL+"/api/v1/0/0/status", alert, 200)
|
||||
expAlert := ar.newAlertAPI(*ar.alerts[0])
|
||||
if !reflect.DeepEqual(alert, expAlert) {
|
||||
t.Errorf("expected %v is equal to %v", alert, expAlert)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/0/1/status", func(t *testing.T) {
|
||||
getResp(ts.URL+"/api/v1/0/1/status", nil, 404)
|
||||
})
|
||||
t.Run("/api/v1/1/0/status", func(t *testing.T) {
|
||||
getResp(ts.URL+"/api/v1/1/0/status", nil, 404)
|
||||
})
|
||||
t.Run("/", func(t *testing.T) {
|
||||
getResp(ts.URL, nil, 200)
|
||||
})
|
||||
}
|
||||
54
app/vmalert/web_types.go
Normal file
54
app/vmalert/web_types.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// APIAlert represents an notifier.AlertingRule state
|
||||
// for WEB view
|
||||
type APIAlert struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
State string `json:"state"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
ActiveAt time.Time `json:"activeAt"`
|
||||
}
|
||||
|
||||
// APIGroup represents Group for WEB view
|
||||
type APIGroup struct {
|
||||
Name string `json:"name"`
|
||||
ID string `json:"id"`
|
||||
File string `json:"file"`
|
||||
Interval string `json:"interval"`
|
||||
Concurrency int `json:"concurrency"`
|
||||
AlertingRules []APIAlertingRule `json:"alerting_rules"`
|
||||
RecordingRules []APIRecordingRule `json:"recording_rules"`
|
||||
}
|
||||
|
||||
// APIAlertingRule represents AlertingRule for WEB view
|
||||
type APIAlertingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
For string `json:"for"`
|
||||
LastError string `json:"last_error"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
}
|
||||
|
||||
// APIRecordingRule represents RecordingRule for WEB view
|
||||
type APIRecordingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
LastError string `json:"last_error"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
}
|
||||
76
app/vmauth/Makefile
Normal file
76
app/vmauth/Makefile
Normal file
@@ -0,0 +1,76 @@
|
||||
# All these commands must run from repository root.
|
||||
|
||||
vmauth:
|
||||
APP_NAME=vmauth $(MAKE) app-local
|
||||
|
||||
vmauth-race:
|
||||
APP_NAME=vmauth RACE=-race $(MAKE) app-local
|
||||
|
||||
vmauth-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker
|
||||
|
||||
vmauth-pure-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-pure
|
||||
|
||||
vmauth-amd64-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-amd64
|
||||
|
||||
vmauth-arm-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-arm
|
||||
|
||||
vmauth-arm64-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-arm64
|
||||
|
||||
vmauth-ppc64le-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-ppc64le
|
||||
|
||||
vmauth-386-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-386
|
||||
|
||||
package-vmauth:
|
||||
APP_NAME=vmauth $(MAKE) package-via-docker
|
||||
|
||||
package-vmauth-pure:
|
||||
APP_NAME=vmauth $(MAKE) package-via-docker-pure
|
||||
|
||||
package-vmauth-amd64:
|
||||
APP_NAME=vmauth $(MAKE) package-via-docker-amd64
|
||||
|
||||
package-vmauth-arm:
|
||||
APP_NAME=vmauth $(MAKE) package-via-docker-arm
|
||||
|
||||
package-vmauth-arm64:
|
||||
APP_NAME=vmauth $(MAKE) package-via-docker-arm64
|
||||
|
||||
package-vmauth-ppc64le:
|
||||
APP_NAME=vmauth $(MAKE) package-via-docker-ppc64le
|
||||
|
||||
package-vmauth-386:
|
||||
APP_NAME=vmauth $(MAKE) package-via-docker-386
|
||||
|
||||
publish-vmauth:
|
||||
APP_NAME=vmauth $(MAKE) publish-via-docker
|
||||
|
||||
run-vmauth:
|
||||
APP_NAME=vmauth \
|
||||
DOCKER_OPTS='-v $(shell pwd)/app/vmauth/:/app/vmauth' \
|
||||
ARGS='-auth.config=app/vmauth/example_config.yml' \
|
||||
$(MAKE) run-via-docker
|
||||
|
||||
vmauth-amd64:
|
||||
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-amd64 ./app/vmauth
|
||||
|
||||
vmauth-arm:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-arm ./app/vmauth
|
||||
|
||||
vmauth-arm64:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-arm64 ./app/vmauth
|
||||
|
||||
vmauth-ppc64le:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-ppc64le ./app/vmauth
|
||||
|
||||
vmauth-386:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-386 ./app/vmauth
|
||||
|
||||
vmauth-pure:
|
||||
APP_NAME=vmauth $(MAKE) app-local-pure
|
||||
139
app/vmauth/README.md
Normal file
139
app/vmauth/README.md
Normal file
@@ -0,0 +1,139 @@
|
||||
## vmauth
|
||||
|
||||
`vmauth` is a simple auth proxy and router for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It reads username and password from [Basic Auth headers](https://en.wikipedia.org/wiki/Basic_access_authentication),
|
||||
matches them against configs pointed by `-auth.config` command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
|
||||
|
||||
|
||||
### Quick start
|
||||
|
||||
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
|
||||
and pass the following flag to `vmauth` binary in order to start authorizing and routing requests:
|
||||
|
||||
```
|
||||
/path/to/vmauth -auth.config=/path/to/auth/config.yml
|
||||
```
|
||||
|
||||
After that `vmauth` starts accepting HTTP requests on port `8427` and routing them according to the provided [-auth.config](#auth-config).
|
||||
The port can be modified via `-httpListenAddr` command-line flag.
|
||||
|
||||
The auth config can be reloaded by passing `SIGHUP` signal to `vmauth`.
|
||||
|
||||
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
|
||||
|
||||
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
|
||||
|
||||
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML, accounting, limits, etc.
|
||||
|
||||
|
||||
### Auth config
|
||||
|
||||
Auth config is represented in the following simple `yml` format:
|
||||
|
||||
```yml
|
||||
|
||||
# Arbitrary number of usernames may be put here.
|
||||
# Usernames must be unique.
|
||||
|
||||
users:
|
||||
|
||||
# The user for querying local single-node VictoriaMetrics.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vmselect:8481/select/123/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
|
||||
- username: "cluster-select-account-123"
|
||||
password: "***"
|
||||
url_prefix: "http://vmselect:8481/select/123/prometheus"
|
||||
|
||||
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the reuqests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vminsert:8480/insert/42/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "cluster-insert-account-42"
|
||||
password: "***"
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
```
|
||||
|
||||
|
||||
### Security
|
||||
|
||||
Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable https. This can be done by passing the following `-tls*` command-line flags to `vmauth`:
|
||||
|
||||
```
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
```
|
||||
|
||||
Alternatively, [https termination proxy](https://en.wikipedia.org/wiki/TLS_termination_proxy) may be put in front of `vmauth`.
|
||||
|
||||
|
||||
### Monitoring
|
||||
|
||||
`vmauth` exports various metrics in Prometheus exposition format at `http://vmauth-host:8427/metrics` page. It is recommended setting up regular scraping of this page
|
||||
either via [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) or via Prometheus, so the exported metrics could be analyzed later.
|
||||
|
||||
|
||||
### How to build from sources
|
||||
|
||||
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmauth` is located in `vmutils-*` archives there.
|
||||
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmauth` from the root folder of the repository.
|
||||
It builds `vmauth` binary and puts it into the `bin` folder.
|
||||
|
||||
#### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmauth-prod` from the root folder of the repository.
|
||||
It builds `vmauth-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
#### Building docker images
|
||||
|
||||
Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker image locally.
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
|
||||
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=scratch make package-vmauth
|
||||
```
|
||||
|
||||
|
||||
### Profiling
|
||||
|
||||
`vmauth` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
* Memory profile. It can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmauth-host>:8427/debug/pprof/heap > mem.pprof
|
||||
```
|
||||
|
||||
* CPU profile. It can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmauth-host>:8427/debug/pprof/profile > cpu.pprof
|
||||
```
|
||||
|
||||
The command for collecting CPU profile waits for 30 seconds before returning.
|
||||
|
||||
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).
|
||||
129
app/vmauth/auth_config.go
Normal file
129
app/vmauth/auth_config.go
Normal file
@@ -0,0 +1,129 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
var (
|
||||
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md "+
|
||||
"for details on the format of this auth config")
|
||||
)
|
||||
|
||||
// AuthConfig represents auth config.
|
||||
type AuthConfig struct {
|
||||
Users []UserInfo `yaml:"users"`
|
||||
}
|
||||
|
||||
// UserInfo is user information read from authConfigPath
|
||||
type UserInfo struct {
|
||||
Username string `yaml:"username"`
|
||||
Password string `yaml:"password"`
|
||||
URLPrefix string `yaml:"url_prefix"`
|
||||
|
||||
requests *metrics.Counter
|
||||
}
|
||||
|
||||
func initAuthConfig() {
|
||||
if len(*authConfigPath) == 0 {
|
||||
logger.Fatalf("missing required `-auth.config` command-line flag")
|
||||
}
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
|
||||
}
|
||||
authConfig.Store(m)
|
||||
stopCh = make(chan struct{})
|
||||
authConfigWG.Add(1)
|
||||
go func() {
|
||||
defer authConfigWG.Done()
|
||||
authConfigReloader()
|
||||
}()
|
||||
}
|
||||
|
||||
func stopAuthConfig() {
|
||||
close(stopCh)
|
||||
authConfigWG.Wait()
|
||||
}
|
||||
|
||||
func authConfigReloader() {
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-sighupCh:
|
||||
logger.Infof("SIGHUP received; loading -auth.config=%q", *authConfigPath)
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Errorf("failed to load -auth.config=%q; using the last successfully loaded config; error: %s", *authConfigPath, err)
|
||||
continue
|
||||
}
|
||||
authConfig.Store(m)
|
||||
logger.Infof("Successfully reloaded -auth.config=%q", *authConfigPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var authConfig atomic.Value
|
||||
var authConfigWG sync.WaitGroup
|
||||
var stopCh chan struct{}
|
||||
|
||||
func readAuthConfig(path string) (map[string]*UserInfo, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read %q: %w", path, err)
|
||||
}
|
||||
m, err := parseAuthConfig(data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse %q: %w", path, err)
|
||||
}
|
||||
logger.Infof("Loaded information about %d users from %q", len(m), path)
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
|
||||
var ac AuthConfig
|
||||
if err := yaml.UnmarshalStrict(data, &ac); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %w", err)
|
||||
}
|
||||
uis := ac.Users
|
||||
if len(uis) == 0 {
|
||||
return nil, fmt.Errorf("`users` section cannot be empty in AuthConfig")
|
||||
}
|
||||
m := make(map[string]*UserInfo, len(uis))
|
||||
for i := range uis {
|
||||
ui := &uis[i]
|
||||
if m[ui.Username] != nil {
|
||||
return nil, fmt.Errorf("duplicate username found; username: %q", ui.Username)
|
||||
}
|
||||
urlPrefix := ui.URLPrefix
|
||||
// Remove trailing '/' from urlPrefix
|
||||
for strings.HasSuffix(urlPrefix, "/") {
|
||||
urlPrefix = urlPrefix[:len(urlPrefix)-1]
|
||||
}
|
||||
// Validate urlPrefix
|
||||
target, err := url.Parse(urlPrefix)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid `url_prefix: %q`: %w", urlPrefix, err)
|
||||
}
|
||||
if target.Scheme != "http" && target.Scheme != "https" {
|
||||
return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, target.Scheme)
|
||||
}
|
||||
|
||||
ui.URLPrefix = urlPrefix
|
||||
ui.requests = metrics.GetOrCreateCounter(fmt.Sprintf(`vmauth_user_requests_total{username=%q}`, ui.Username))
|
||||
m[ui.Username] = ui
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
112
app/vmauth/auth_config_test.go
Normal file
112
app/vmauth/auth_config_test.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseAuthConfigFailure(t *testing.T) {
|
||||
f := func(s string) {
|
||||
t.Helper()
|
||||
_, err := parseAuthConfig([]byte(s))
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
}
|
||||
|
||||
// Empty config
|
||||
f(``)
|
||||
|
||||
// Invalid entry
|
||||
f(`foobar`)
|
||||
f(`foobar: baz`)
|
||||
|
||||
// Empty users
|
||||
f(`users: []`)
|
||||
|
||||
// Missing url_prefix
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
`)
|
||||
|
||||
// Invalid url_prefix
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
url_prefix: bar
|
||||
`)
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
url_prefix: ftp://bar
|
||||
`)
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
url_prefix: //bar
|
||||
`)
|
||||
|
||||
// Duplicate users
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
url_prefix: http://foo.bar
|
||||
- username: bar
|
||||
url_prefix: http://xxx.yyy
|
||||
- username: foo
|
||||
url_prefix: https://sss.sss
|
||||
`)
|
||||
}
|
||||
|
||||
func TestParseAuthConfigSuccess(t *testing.T) {
|
||||
f := func(s string, expectedAuthConfig map[string]*UserInfo) {
|
||||
t.Helper()
|
||||
m, err := parseAuthConfig([]byte(s))
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
removeMetrics(m)
|
||||
if !reflect.DeepEqual(m, expectedAuthConfig) {
|
||||
t.Fatalf("unexpected auth config\ngot\n%v\nwant\n%v", m, expectedAuthConfig)
|
||||
}
|
||||
}
|
||||
|
||||
// Single user
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
password: bar
|
||||
url_prefix: http://aaa:343/bbb
|
||||
`, map[string]*UserInfo{
|
||||
"foo": {
|
||||
Username: "foo",
|
||||
Password: "bar",
|
||||
URLPrefix: "http://aaa:343/bbb",
|
||||
},
|
||||
})
|
||||
|
||||
// Multiple users
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
url_prefix: http://foo
|
||||
- username: bar
|
||||
url_prefix: https://bar/x///
|
||||
`, map[string]*UserInfo{
|
||||
"foo": {
|
||||
Username: "foo",
|
||||
URLPrefix: "http://foo",
|
||||
},
|
||||
"bar": {
|
||||
Username: "bar",
|
||||
URLPrefix: "https://bar/x",
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func removeMetrics(m map[string]*UserInfo) {
|
||||
for _, info := range m {
|
||||
info.requests = nil
|
||||
}
|
||||
}
|
||||
8
app/vmauth/deployment/Dockerfile
Normal file
8
app/vmauth/deployment/Dockerfile
Normal file
@@ -0,0 +1,8 @@
|
||||
ARG base_image
|
||||
FROM $base_image
|
||||
|
||||
EXPOSE 8427
|
||||
|
||||
ENTRYPOINT ["/vmauth-prod"]
|
||||
ARG src_binary
|
||||
COPY $src_binary ./vmauth-prod
|
||||
31
app/vmauth/example_config.yml
Normal file
31
app/vmauth/example_config.yml
Normal file
@@ -0,0 +1,31 @@
|
||||
# Arbitrary number of usernames may be put here.
|
||||
# Usernames must be unique.
|
||||
|
||||
users:
|
||||
|
||||
# The user for querying local single-node VictoriaMetrics.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vmselect:8481/select/123/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
|
||||
- username: "cluster-select-account-123"
|
||||
password: "***"
|
||||
url_prefix: "http://vmselect:8481/select/123/prometheus"
|
||||
|
||||
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
|
||||
# All the reuqests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vminsert:8480/insert/42/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "cluster-insert-account-42"
|
||||
password: "***"
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
|
||||
104
app/vmauth/main.go
Normal file
104
app/vmauth/main.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"net/url"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
)
|
||||
|
||||
var (
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections")
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
logger.Infof("starting vmauth at %q...", *httpListenAddr)
|
||||
startTime := time.Now()
|
||||
initAuthConfig()
|
||||
go httpserver.Serve(*httpListenAddr, requestHandler)
|
||||
logger.Infof("started vmauth in %.3f seconds", time.Since(startTime).Seconds())
|
||||
|
||||
sig := procutil.WaitForSigterm()
|
||||
logger.Infof("received signal %s", sig)
|
||||
|
||||
startTime = time.Now()
|
||||
logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
|
||||
if err := httpserver.Stop(*httpListenAddr); err != nil {
|
||||
logger.Fatalf("cannot stop the webservice: %s", err)
|
||||
}
|
||||
logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds())
|
||||
stopAuthConfig()
|
||||
logger.Infof("successfully stopped vmauth in %.3f seconds", time.Since(startTime).Seconds())
|
||||
}
|
||||
|
||||
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
username, password, ok := r.BasicAuth()
|
||||
if !ok {
|
||||
httpserver.Errorf(w, "Missing `Authorization: Basic *` header")
|
||||
return true
|
||||
}
|
||||
ac := authConfig.Load().(map[string]*UserInfo)
|
||||
info := ac[username]
|
||||
if info == nil || info.Password != password {
|
||||
httpserver.Errorf(w, "Cannot find the provided username %q or password in config", username)
|
||||
return true
|
||||
}
|
||||
info.requests.Inc()
|
||||
|
||||
targetURL := createTargetURL(info.URLPrefix, r.URL)
|
||||
if _, err := url.Parse(targetURL); err != nil {
|
||||
httpserver.Errorf(w, "Invalid targetURL=%q: %s", targetURL, err)
|
||||
return true
|
||||
}
|
||||
r.Header.Set("vm-target-url", targetURL)
|
||||
reverseProxy.ServeHTTP(w, r)
|
||||
return true
|
||||
}
|
||||
|
||||
var reverseProxy = &httputil.ReverseProxy{
|
||||
Director: func(r *http.Request) {
|
||||
targetURL := r.Header.Get("vm-target-url")
|
||||
target, err := url.Parse(targetURL)
|
||||
if err != nil {
|
||||
logger.Panicf("BUG: unexpected error when parsing targetURL=%q: %s", targetURL, err)
|
||||
}
|
||||
r.URL = target
|
||||
},
|
||||
Transport: func() *http.Transport {
|
||||
tr := http.DefaultTransport.(*http.Transport).Clone()
|
||||
// Automatic compression must be disabled in order to fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
|
||||
tr.DisableCompression = true
|
||||
// Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this).
|
||||
tr.ForceAttemptHTTP2 = false
|
||||
return tr
|
||||
}(),
|
||||
FlushInterval: time.Second,
|
||||
ErrorLog: logger.StdErrorLogger(),
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
|
||||
|
||||
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md .
|
||||
`
|
||||
|
||||
f := flag.CommandLine.Output()
|
||||
fmt.Fprintf(f, "%s\n", s)
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
16
app/vmauth/target_url.go
Normal file
16
app/vmauth/target_url.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"path"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func createTargetURL(prefix string, u *url.URL) string {
|
||||
// Prevent from attacks with using `..` in r.URL.Path
|
||||
u.Path = path.Clean(u.Path)
|
||||
if !strings.HasPrefix(u.Path, "/") {
|
||||
u.Path = "/" + u.Path
|
||||
}
|
||||
return prefix + u.RequestURI()
|
||||
}
|
||||
26
app/vmauth/target_url_test.go
Normal file
26
app/vmauth/target_url_test.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestCreateTargetURL(t *testing.T) {
|
||||
f := func(prefix, requestURI, expectedTarget string) {
|
||||
t.Helper()
|
||||
u, err := url.Parse(requestURI)
|
||||
if err != nil {
|
||||
t.Fatalf("cannot parse %q: %s", requestURI, err)
|
||||
}
|
||||
target := createTargetURL(prefix, u)
|
||||
if target != expectedTarget {
|
||||
t.Fatalf("unexpected target; got %q; want %q", target, expectedTarget)
|
||||
}
|
||||
}
|
||||
f("http://foo.bar", "", "http://foo.bar/.")
|
||||
f("http://foo.bar", "/", "http://foo.bar/")
|
||||
f("http://foo.bar", "a/b?c=d", "http://foo.bar/a/b?c=d")
|
||||
f("https://sss:3894/x/y", "/z", "https://sss:3894/x/y/z")
|
||||
f("https://sss:3894/x/y", "/../../aaa", "https://sss:3894/x/y/aaa")
|
||||
f("https://sss:3894/x/y", "/./asd/../../aaa?a=d&s=s/../d", "https://sss:3894/x/y/aaa?a=d&s=s/../d")
|
||||
}
|
||||
@@ -19,6 +19,9 @@ Backed up data can be restored with [vmrestore](https://github.com/VictoriaMetri
|
||||
|
||||
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
|
||||
|
||||
See also [vmbackuper](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) tool built on top of `vmbackup`. This tool simplifies
|
||||
creation of hourly, daily, weekly and monthly backups.
|
||||
|
||||
|
||||
### Use cases
|
||||
|
||||
@@ -86,6 +89,8 @@ or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when
|
||||
|
||||
Do not forget removing old snapshots and backups when they are no longer needed for saving storage costs.
|
||||
|
||||
See also [vmbackuper tool](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for automating smart backups.
|
||||
|
||||
|
||||
### How does it work?
|
||||
|
||||
@@ -118,6 +123,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
|
||||
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
|
||||
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
|
||||
* Backups created from [single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md) cannot be restored
|
||||
at [cluster VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md) and vice versa.
|
||||
|
||||
|
||||
### Advanced usage
|
||||
@@ -140,14 +147,28 @@ Run `vmbackup -help` in order to see all the available options:
|
||||
-dst string
|
||||
Where to put the backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, ERROR, FATAL, PANIC (default "INFO")
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-maxBytesPerSecond int
|
||||
The maximum upload speed. There is no limit if it is set to 0
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
|
||||
-origin string
|
||||
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
|
||||
-snapshot.createURL string
|
||||
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup.Example: http://victoriametrics:8428/snaphsot/create
|
||||
-snapshot.deleteURL string
|
||||
VictoriaMetrics delete snapshot url. Optional. Will be generated from snapshotCreateURL if not provided. All created snaphosts will be automatically deleted.Example: http://victoriametrics:8428/snaphsot/delete
|
||||
-snapshotName string
|
||||
Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots
|
||||
-storageDataPath string
|
||||
@@ -164,7 +185,7 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmbackup` from the root folder of the repository.
|
||||
It builds `vmbackup` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -179,3 +200,10 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` docker image locally.
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
|
||||
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=scratch make package-vmbackup
|
||||
```
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
ARG certs_image
|
||||
FROM $certs_image AS certs
|
||||
FROM scratch
|
||||
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||
ARG base_image
|
||||
FROM $base_image
|
||||
|
||||
ENTRYPOINT ["/vmbackup-prod"]
|
||||
ARG src_binary
|
||||
COPY $src_binary ./vmbackup-prod
|
||||
ENTRYPOINT ["/vmbackup-prod"]
|
||||
|
||||
@@ -4,7 +4,9 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmbackup/snapshot"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fslocal"
|
||||
@@ -14,9 +16,13 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
|
||||
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots")
|
||||
dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
|
||||
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
|
||||
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots")
|
||||
snapshotCreateURL = flag.String("snapshot.createURL", "", "VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup."+
|
||||
"Example: http://victoriametrics:8428/snaphsot/create")
|
||||
snapshotDeleteURL = flag.String("snapshot.deleteURL", "", "VictoriaMetrics delete snapshot url. Optional. Will be generated from snapshotCreateURL if not provided. All created snaphosts will be automatically deleted."+
|
||||
"Example: http://victoriametrics:8428/snaphsot/delete")
|
||||
dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
|
||||
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir\n"+
|
||||
"-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded")
|
||||
origin = flag.String("origin", "", "Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups")
|
||||
@@ -25,10 +31,40 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
|
||||
if len(*snapshotCreateURL) > 0 {
|
||||
logger.Infof("%s", "Snapshots enabled")
|
||||
logger.Infof("Snapshot create url %s", *snapshotCreateURL)
|
||||
if len(*snapshotDeleteURL) <= 0 {
|
||||
err := flag.Set("snapshot.deleteURL", strings.Replace(*snapshotCreateURL, "/create", "/delete", 1))
|
||||
if err != nil {
|
||||
logger.Fatalf("Failed to set snapshot.deleteURL flag: %v", err)
|
||||
}
|
||||
}
|
||||
logger.Infof("Snapshot delete url %s", *snapshotDeleteURL)
|
||||
|
||||
name, err := snapshot.Create(*snapshotCreateURL)
|
||||
if err != nil {
|
||||
logger.Fatalf("%s", err)
|
||||
}
|
||||
err = flag.Set("snapshotName", name)
|
||||
if err != nil {
|
||||
logger.Fatalf("Failed to set snapshotName flag: %v", err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
err := snapshot.Delete(*snapshotDeleteURL, name)
|
||||
if err != nil {
|
||||
logger.Fatalf("%s", err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
srcFS, err := newSrcFS()
|
||||
if err != nil {
|
||||
logger.Fatalf("%s", err)
|
||||
@@ -67,19 +103,19 @@ See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/a
|
||||
|
||||
func newSrcFS() (*fslocal.FS, error) {
|
||||
if len(*snapshotName) == 0 {
|
||||
return nil, fmt.Errorf("`-snapshotName` cannot be empty")
|
||||
return nil, fmt.Errorf("`-snapshotName` or `-snapshot.createURL` must be provided")
|
||||
}
|
||||
snapshotPath := *storageDataPath + "/snapshots/" + *snapshotName
|
||||
|
||||
// Verify the snapshot exists.
|
||||
f, err := os.Open(snapshotPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open snapshot at %q: %s", snapshotPath, err)
|
||||
return nil, fmt.Errorf("cannot open snapshot at %q: %w", snapshotPath, err)
|
||||
}
|
||||
fi, err := f.Stat()
|
||||
_ = f.Close()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot stat %q: %s", snapshotPath, err)
|
||||
return nil, fmt.Errorf("cannot stat %q: %w", snapshotPath, err)
|
||||
}
|
||||
if !fi.IsDir() {
|
||||
return nil, fmt.Errorf("snapshot %q must be a directory", snapshotPath)
|
||||
@@ -90,7 +126,7 @@ func newSrcFS() (*fslocal.FS, error) {
|
||||
MaxBytesPerSecond: *maxBytesPerSecond,
|
||||
}
|
||||
if err := fs.Init(); err != nil {
|
||||
return nil, fmt.Errorf("cannot initialize fs: %s", err)
|
||||
return nil, fmt.Errorf("cannot initialize fs: %w", err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -98,7 +134,7 @@ func newSrcFS() (*fslocal.FS, error) {
|
||||
func newDstFS() (common.RemoteFS, error) {
|
||||
fs, err := actions.NewRemoteFS(*dst)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-dst`=%q: %s", *dst, err)
|
||||
return nil, fmt.Errorf("cannot parse `-dst`=%q: %w", *dst, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -109,7 +145,7 @@ func newOriginFS() (common.RemoteFS, error) {
|
||||
}
|
||||
fs, err := actions.NewRemoteFS(*origin)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-origin`=%q: %s", *origin, err)
|
||||
return nil, fmt.Errorf("cannot parse `-origin`=%q: %w", *origin, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
|
||||
91
app/vmbackup/snapshot/snapshot.go
Normal file
91
app/vmbackup/snapshot/snapshot.go
Normal file
@@ -0,0 +1,91 @@
|
||||
package snapshot
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/url"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
type snapshot struct {
|
||||
Status string `json:"status"`
|
||||
Snapshot string `json:"snapshot"`
|
||||
Msg string `json:"msg"`
|
||||
}
|
||||
|
||||
// Create creates a snapshot and the provided api endpoint and returns
|
||||
// the snapshot name
|
||||
func Create(createSnapshotURL string) (string, error) {
|
||||
logger.Infof("%s", "Creating snapshot")
|
||||
u, err := url.Parse(createSnapshotURL)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
resp, err := http.Get(u.String())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
snap := snapshot{}
|
||||
err = json.Unmarshal(body, &snap)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if snap.Status == "ok" {
|
||||
logger.Infof("Snapshot %s created", snap.Snapshot)
|
||||
return snap.Snapshot, nil
|
||||
} else if snap.Status == "error" {
|
||||
return "", errors.New(snap.Msg)
|
||||
} else {
|
||||
return "", fmt.Errorf("Unkown status: %v", snap.Status)
|
||||
}
|
||||
}
|
||||
|
||||
// Delete deletes a snapshot and the provided api endpoint returns any failure
|
||||
func Delete(deleteSnapshotURL string, snapshotName string) error {
|
||||
logger.Infof("Deleting snapshot %s", snapshotName)
|
||||
formData := url.Values{
|
||||
"snapshot": {snapshotName},
|
||||
}
|
||||
|
||||
u, err := url.Parse(deleteSnapshotURL)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
resp, err := http.PostForm(u.String(), formData)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
snap := snapshot{}
|
||||
err = json.Unmarshal(body, &snap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if snap.Status == "ok" {
|
||||
logger.Infof("Snapshot %s deleted", snapshotName)
|
||||
return nil
|
||||
} else if snap.Status == "error" {
|
||||
return errors.New(snap.Msg)
|
||||
} else {
|
||||
return fmt.Errorf("Unkown status: %v", snap.Status)
|
||||
}
|
||||
}
|
||||
106
app/vmbackup/snapshot/snapshot_test.go
Normal file
106
app/vmbackup/snapshot/snapshot_test.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package snapshot
|
||||
|
||||
import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestCreateSnapshot(t *testing.T) {
|
||||
handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/snapshot/create" {
|
||||
_, err := io.WriteString(w, `{"status":"ok","snapshot":"mysnapshot"}`)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to write response output: %v", err)
|
||||
}
|
||||
} else {
|
||||
t.Fatalf("Invalid path, got %v", r.URL.Path)
|
||||
}
|
||||
})
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(handler))
|
||||
defer server.Close()
|
||||
|
||||
snapshotName, err := Create(server.URL + "/snapshot/create")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed taking snapshot: %v", err)
|
||||
}
|
||||
|
||||
if snapshotName != "mysnapshot" {
|
||||
t.Fatalf("Snapshot name is not correct, got %v", snapshotName)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSnapshotFailed(t *testing.T) {
|
||||
handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/snapshot/create" {
|
||||
_, err := io.WriteString(w, `{"status":"error","msg":"I am unwell"}`)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to write response output: %v", err)
|
||||
}
|
||||
} else {
|
||||
t.Fatalf("Invalid path, got %v", r.URL.Path)
|
||||
}
|
||||
})
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(handler))
|
||||
defer server.Close()
|
||||
|
||||
snapshotName, err := Create(server.URL + "/snapshot/create")
|
||||
if err == nil {
|
||||
t.Fatalf("Snapshot did not fail, got snapshot: %v", snapshotName)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeleteSnapshot(t *testing.T) {
|
||||
snapshotName := "mysnapshot"
|
||||
|
||||
handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/snapshot/delete" {
|
||||
_, err := io.WriteString(w, `{"status":"ok"}`)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to write response output: %v", err)
|
||||
}
|
||||
} else {
|
||||
t.Fatalf("Invalid path, got %v", r.URL.Path)
|
||||
}
|
||||
if r.FormValue("snapshot") != snapshotName {
|
||||
t.Fatalf("Invalid snapshot name, got %v", snapshotName)
|
||||
}
|
||||
})
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(handler))
|
||||
defer server.Close()
|
||||
|
||||
err := Delete(server.URL+"/snapshot/delete", snapshotName)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to delete snapshot: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeleteSnapshotFailed(t *testing.T) {
|
||||
snapshotName := "mysnapshot"
|
||||
|
||||
handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/snapshot/delete" {
|
||||
_, err := io.WriteString(w, `{"status":"error", "msg":"failed to delete"}`)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to write response output: %v", err)
|
||||
}
|
||||
} else {
|
||||
t.Fatalf("Invalid path, got %v", r.URL.Path)
|
||||
}
|
||||
if r.FormValue("snapshot") != snapshotName {
|
||||
t.Fatalf("Invalid snapshot name, got %v", snapshotName)
|
||||
}
|
||||
})
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(handler))
|
||||
defer server.Close()
|
||||
|
||||
err := Delete(server.URL+"/snapshot/delete", snapshotName)
|
||||
if err == nil {
|
||||
t.Fatalf("Snapshot should have failed, got: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -122,7 +122,7 @@ func (ctx *InsertCtx) AddLabel(name, value string) {
|
||||
func (ctx *InsertCtx) FlushBufs() error {
|
||||
if err := vmstorage.AddRows(ctx.mrs); err != nil {
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf("cannot store metrics: %s", err),
|
||||
Err: fmt.Errorf("cannot store metrics: %w", err),
|
||||
StatusCode: http.StatusServiceUnavailable,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/csvimport"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite"
|
||||
@@ -19,6 +20,7 @@ import (
|
||||
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
|
||||
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
|
||||
opentsdbhttpserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdbhttp"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
|
||||
@@ -130,6 +132,11 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
promscrape.WriteHumanReadableTargetsStatus(w)
|
||||
return true
|
||||
case "/-/reload":
|
||||
promscrapeConfigReloadRequests.Inc()
|
||||
procutil.SelfSIGHUP()
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
return true
|
||||
default:
|
||||
// This is not our link
|
||||
return false
|
||||
@@ -152,4 +159,16 @@ var (
|
||||
influxQueryRequests = metrics.NewCounter(`vm_http_requests_total{path="/query", protocol="influx"}`)
|
||||
|
||||
promscrapeTargetsRequests = metrics.NewCounter(`vm_http_requests_total{path="/targets"}`)
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vm_http_requests_total{path="/-/reload"}`)
|
||||
|
||||
_ = metrics.NewGauge(`vm_metrics_with_dropped_labels_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.MetricsWithDroppedLabels))
|
||||
})
|
||||
_ = metrics.NewGauge(`vm_too_long_label_names_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.TooLongLabelNames))
|
||||
})
|
||||
_ = metrics.NewGauge(`vm_too_long_label_values_total`, func() float64 {
|
||||
return float64(atomic.LoadUint64(&storage.TooLongLabelValues))
|
||||
})
|
||||
)
|
||||
|
||||
@@ -21,7 +21,8 @@ vmrestore -src=gcs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/r
|
||||
* `<local/path/to/restore>` is the path to folder where data will be restored. This folder must be passed
|
||||
to VictoriaMetrics in `-storageDataPath` command-line flag after the restore process is complete.
|
||||
|
||||
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup.
|
||||
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup,
|
||||
i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/questions/476041/how-do-i-make-rsync-delete-files-that-have-been-deleted-from-the-source-folder).
|
||||
|
||||
|
||||
### Troubleshooting
|
||||
@@ -47,17 +48,29 @@ Run `vmrestore -help` in order to see all the available options:
|
||||
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
-customS3Endpoint string
|
||||
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot data files bigger than 2^32 bytes in memory
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, ERROR, FATAL, PANIC (default "INFO")
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-maxBytesPerSecond int
|
||||
The maximum download speed. There is no limit if it is set to 0
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
|
||||
-skipBackupCompleteCheck
|
||||
Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file
|
||||
-src string
|
||||
Source path with backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
-storageDataPath string
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup (default "victoria-metrics-data")
|
||||
-version
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir is synchronized with -src contents, i.e. it works like 'rsync --delete' (default "victoria-metrics-data")
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
@@ -69,7 +82,7 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
|
||||
#### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmrestore` from the root folder of the repository.
|
||||
It builds `vmrestore` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -84,3 +97,10 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
Run `make package-vmrestore`. It builds `victoriametrics/vmrestore:<PKG_TAG>` docker image locally.
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmrestore`.
|
||||
|
||||
By default the image is built on top of [alpine](https://hub.docker.com/_/alpine) image. It is possible to build the package on top of any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```bash
|
||||
ROOT_IMAGE=scratch make package-vmrestore
|
||||
```
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
ARG certs_image
|
||||
FROM $certs_image AS certs
|
||||
FROM scratch
|
||||
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||
ARG base_image
|
||||
FROM $base_image
|
||||
|
||||
ENTRYPOINT ["/vmrestore-prod"]
|
||||
ARG src_binary
|
||||
COPY $src_binary ./vmrestore-prod
|
||||
ENTRYPOINT ["/vmrestore-prod"]
|
||||
|
||||
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
|
||||
@@ -16,13 +17,16 @@ var (
|
||||
src = flag.String("src", "", "Source path with backup on the remote storage. "+
|
||||
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir")
|
||||
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Destination path where backup must be restored. "+
|
||||
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup")
|
||||
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir "+
|
||||
"is synchronized with -src contents, i.e. it works like 'rsync --delete'")
|
||||
concurrency = flag.Int("concurrency", 10, "The number of concurrent workers. Higher concurrency may reduce restore duration")
|
||||
maxBytesPerSecond = flag.Int("maxBytesPerSecond", 0, "The maximum download speed. There is no limit if it is set to 0")
|
||||
skipBackupCompleteCheck = flag.Bool("skipBackupCompleteCheck", false, "Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file")
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Write flags and help message to stdout, since it is easier to grep or pipe.
|
||||
flag.CommandLine.SetOutput(os.Stdout)
|
||||
flag.Usage = usage
|
||||
envflag.Parse()
|
||||
buildinfo.Init()
|
||||
@@ -67,7 +71,7 @@ func newDstFS() (*fslocal.FS, error) {
|
||||
MaxBytesPerSecond: *maxBytesPerSecond,
|
||||
}
|
||||
if err := fs.Init(); err != nil {
|
||||
return nil, fmt.Errorf("cannot initialize local fs: %s", err)
|
||||
return nil, fmt.Errorf("cannot initialize local fs: %w", err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
@@ -75,7 +79,7 @@ func newDstFS() (*fslocal.FS, error) {
|
||||
func newSrcFS() (common.RemoteFS, error) {
|
||||
fs, err := actions.NewRemoteFS(*src)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot parse `-src`=%q: %s", *src, err)
|
||||
return nil, fmt.Errorf("cannot parse `-src`=%q: %w", *src, err)
|
||||
}
|
||||
return fs, nil
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package vmselect
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
@@ -8,11 +9,9 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
|
||||
@@ -24,7 +23,7 @@ var (
|
||||
maxConcurrentRequests = flag.Int("search.maxConcurrentRequests", getDefaultMaxConcurrentRequests(), "The maximum number of concurrent search requests. "+
|
||||
"It shouldn't be high, since a single request can saturate all the CPU cores. See also -search.maxQueueDuration")
|
||||
maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached")
|
||||
resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetCache call")
|
||||
resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call")
|
||||
)
|
||||
|
||||
func getDefaultMaxConcurrentRequests() int {
|
||||
@@ -43,9 +42,6 @@ func getDefaultMaxConcurrentRequests() int {
|
||||
|
||||
// Init initializes vmselect
|
||||
func Init() {
|
||||
tmpDirPath := *vmstorage.DataPath + "/tmp"
|
||||
fs.RemoveDirContents(tmpDirPath)
|
||||
netstorage.InitTmpBlocksDir(tmpDirPath)
|
||||
promql.InitRollupResultCache(*vmstorage.DataPath + "/cache/rollupResult")
|
||||
|
||||
concurrencyCh = make(chan struct{}, *maxConcurrentRequests)
|
||||
@@ -179,6 +175,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
return true
|
||||
}
|
||||
return true
|
||||
case "/api/v1/status/tsdb":
|
||||
tsdbStatusRequests.Inc()
|
||||
if err := prometheus.TSDBStatusHandler(startTime, w, r); err != nil {
|
||||
tsdbStatusErrors.Inc()
|
||||
sendPrometheusError(w, r, err)
|
||||
return true
|
||||
}
|
||||
return true
|
||||
case "/api/v1/export":
|
||||
exportRequests.Inc()
|
||||
if err := prometheus.ExportHandler(startTime, w, r); err != nil {
|
||||
@@ -191,7 +195,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
federateRequests.Inc()
|
||||
if err := prometheus.FederateHandler(startTime, w, r); err != nil {
|
||||
federateErrors.Inc()
|
||||
httpserver.Errorf(w, "error int %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
|
||||
return true
|
||||
}
|
||||
return true
|
||||
@@ -233,11 +237,12 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
|
||||
func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) {
|
||||
logger.Errorf("error in %q: %s", r.RequestURI, err)
|
||||
logger.Warnf("error in %q: %s", r.RequestURI, err)
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
statusCode := http.StatusUnprocessableEntity
|
||||
if esc, ok := err.(*httpserver.ErrorWithStatusCode); ok {
|
||||
var esc *httpserver.ErrorWithStatusCode
|
||||
if errors.As(err, &esc) {
|
||||
statusCode = esc.StatusCode
|
||||
}
|
||||
w.WriteHeader(statusCode)
|
||||
@@ -266,6 +271,9 @@ var (
|
||||
labelsCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/labels/count"}`)
|
||||
labelsCountErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/labels/count"}`)
|
||||
|
||||
tsdbStatusRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/tsdb"}`)
|
||||
tsdbStatusErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/tsdb"}`)
|
||||
|
||||
deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/admin/tsdb/delete_series"}`)
|
||||
deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/admin/tsdb/delete_series"}`)
|
||||
|
||||
|
||||
@@ -7,13 +7,12 @@ import (
|
||||
"runtime"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
@@ -53,9 +52,8 @@ type Results struct {
|
||||
fetchData bool
|
||||
deadline Deadline
|
||||
|
||||
tbf *tmpBlocksFile
|
||||
|
||||
packedTimeseries []packedTimeseries
|
||||
sr *storage.Search
|
||||
}
|
||||
|
||||
// Len returns the number of results in rss.
|
||||
@@ -65,8 +63,56 @@ func (rss *Results) Len() int {
|
||||
|
||||
// Cancel cancels rss work.
|
||||
func (rss *Results) Cancel() {
|
||||
putTmpBlocksFile(rss.tbf)
|
||||
rss.tbf = nil
|
||||
rss.mustClose()
|
||||
}
|
||||
|
||||
func (rss *Results) mustClose() {
|
||||
putStorageSearch(rss.sr)
|
||||
rss.sr = nil
|
||||
}
|
||||
|
||||
var timeseriesWorkCh = make(chan *timeseriesWork, gomaxprocs)
|
||||
|
||||
type timeseriesWork struct {
|
||||
rss *Results
|
||||
pts *packedTimeseries
|
||||
f func(rs *Result, workerID uint)
|
||||
doneCh chan error
|
||||
|
||||
rowsProcessed int
|
||||
}
|
||||
|
||||
func init() {
|
||||
for i := 0; i < gomaxprocs; i++ {
|
||||
go timeseriesWorker(uint(i))
|
||||
}
|
||||
}
|
||||
|
||||
func timeseriesWorker(workerID uint) {
|
||||
var rs Result
|
||||
var rsLastResetTime uint64
|
||||
for tsw := range timeseriesWorkCh {
|
||||
rss := tsw.rss
|
||||
if time.Until(rss.deadline.Deadline) < 0 {
|
||||
tsw.doneCh <- fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
|
||||
continue
|
||||
}
|
||||
if err := tsw.pts.Unpack(&rs, rss.tr, rss.fetchData); err != nil {
|
||||
tsw.doneCh <- fmt.Errorf("error during time series unpacking: %w", err)
|
||||
continue
|
||||
}
|
||||
if len(rs.Timestamps) > 0 || !rss.fetchData {
|
||||
tsw.f(&rs, workerID)
|
||||
}
|
||||
tsw.rowsProcessed = len(rs.Values)
|
||||
tsw.doneCh <- nil
|
||||
currentTime := fasttime.UnixTimestamp()
|
||||
if cap(rs.Values) > 1024*1024 && 4*len(rs.Values) < cap(rs.Values) && currentTime-rsLastResetTime > 10 {
|
||||
// Reset rs in order to preseve memory usage after processing big time series with millions of rows.
|
||||
rs = Result{}
|
||||
rsLastResetTime = currentTime
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RunParallel runs in parallel f for all the results from rss.
|
||||
@@ -76,77 +122,38 @@ func (rss *Results) Cancel() {
|
||||
//
|
||||
// rss becomes unusable after the call to RunParallel.
|
||||
func (rss *Results) RunParallel(f func(rs *Result, workerID uint)) error {
|
||||
defer func() {
|
||||
putTmpBlocksFile(rss.tbf)
|
||||
rss.tbf = nil
|
||||
}()
|
||||
|
||||
workersCount := 1 + len(rss.packedTimeseries)/32
|
||||
if workersCount > gomaxprocs {
|
||||
workersCount = gomaxprocs
|
||||
}
|
||||
if workersCount == 0 {
|
||||
logger.Panicf("BUG: workersCount cannot be zero")
|
||||
}
|
||||
workCh := make(chan *packedTimeseries, workersCount)
|
||||
doneCh := make(chan error)
|
||||
|
||||
// Start workers.
|
||||
rowsProcessedTotal := uint64(0)
|
||||
for i := 0; i < workersCount; i++ {
|
||||
go func(workerID uint) {
|
||||
rs := getResult()
|
||||
defer putResult(rs)
|
||||
maxWorkersCount := gomaxprocs / workersCount
|
||||
|
||||
var err error
|
||||
rowsProcessed := 0
|
||||
for pts := range workCh {
|
||||
if time.Until(rss.deadline.Deadline) < 0 {
|
||||
err = fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
|
||||
break
|
||||
}
|
||||
if err = pts.Unpack(rss.tbf, rs, rss.tr, rss.fetchData, maxWorkersCount); err != nil {
|
||||
break
|
||||
}
|
||||
if len(rs.Timestamps) == 0 && rss.fetchData {
|
||||
// Skip empty blocks.
|
||||
continue
|
||||
}
|
||||
rowsProcessed += len(rs.Values)
|
||||
f(rs, workerID)
|
||||
}
|
||||
atomic.AddUint64(&rowsProcessedTotal, uint64(rowsProcessed))
|
||||
// Drain the remaining work
|
||||
for range workCh {
|
||||
}
|
||||
doneCh <- err
|
||||
}(uint(i))
|
||||
}
|
||||
defer rss.mustClose()
|
||||
|
||||
// Feed workers with work.
|
||||
tsws := make([]*timeseriesWork, len(rss.packedTimeseries))
|
||||
for i := range rss.packedTimeseries {
|
||||
workCh <- &rss.packedTimeseries[i]
|
||||
tsw := ×eriesWork{
|
||||
rss: rss,
|
||||
pts: &rss.packedTimeseries[i],
|
||||
f: f,
|
||||
doneCh: make(chan error, 1),
|
||||
}
|
||||
timeseriesWorkCh <- tsw
|
||||
tsws[i] = tsw
|
||||
}
|
||||
seriesProcessedTotal := len(rss.packedTimeseries)
|
||||
rss.packedTimeseries = rss.packedTimeseries[:0]
|
||||
close(workCh)
|
||||
|
||||
// Wait until workers finish.
|
||||
var errors []error
|
||||
for i := 0; i < workersCount; i++ {
|
||||
if err := <-doneCh; err != nil {
|
||||
errors = append(errors, err)
|
||||
// Wait until work is complete.
|
||||
var firstErr error
|
||||
rowsProcessedTotal := 0
|
||||
for _, tsw := range tsws {
|
||||
if err := <-tsw.doneCh; err != nil && firstErr == nil {
|
||||
// Return just the first error, since other errors
|
||||
// are likely duplicate the first error.
|
||||
firstErr = err
|
||||
}
|
||||
rowsProcessedTotal += tsw.rowsProcessed
|
||||
}
|
||||
|
||||
perQueryRowsProcessed.Update(float64(rowsProcessedTotal))
|
||||
perQuerySeriesProcessed.Update(float64(seriesProcessedTotal))
|
||||
if len(errors) > 0 {
|
||||
// Return just the first error, since other errors
|
||||
// is likely duplicate the first error.
|
||||
return errors[0]
|
||||
}
|
||||
return nil
|
||||
return firstErr
|
||||
}
|
||||
|
||||
var perQueryRowsProcessed = metrics.NewHistogram(`vm_per_query_rows_processed_count`)
|
||||
@@ -156,73 +163,77 @@ var gomaxprocs = runtime.GOMAXPROCS(-1)
|
||||
|
||||
type packedTimeseries struct {
|
||||
metricName string
|
||||
addrs []tmpBlockAddr
|
||||
brs []storage.BlockRef
|
||||
}
|
||||
|
||||
var unpackWorkCh = make(chan *unpackWork, gomaxprocs)
|
||||
|
||||
type unpackWork struct {
|
||||
br storage.BlockRef
|
||||
tr storage.TimeRange
|
||||
fetchData bool
|
||||
doneCh chan error
|
||||
sb *sortBlock
|
||||
}
|
||||
|
||||
func init() {
|
||||
for i := 0; i < gomaxprocs; i++ {
|
||||
go unpackWorker()
|
||||
}
|
||||
}
|
||||
|
||||
func unpackWorker() {
|
||||
for upw := range unpackWorkCh {
|
||||
sb := getSortBlock()
|
||||
if err := sb.unpackFrom(upw.br, upw.tr, upw.fetchData); err != nil {
|
||||
putSortBlock(sb)
|
||||
upw.doneCh <- fmt.Errorf("cannot unpack block: %w", err)
|
||||
continue
|
||||
}
|
||||
upw.sb = sb
|
||||
upw.doneCh <- nil
|
||||
}
|
||||
}
|
||||
|
||||
// Unpack unpacks pts to dst.
|
||||
func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage.TimeRange, fetchData bool, maxWorkersCount int) error {
|
||||
func (pts *packedTimeseries) Unpack(dst *Result, tr storage.TimeRange, fetchData bool) error {
|
||||
dst.reset()
|
||||
|
||||
if err := dst.MetricName.Unmarshal(bytesutil.ToUnsafeBytes(pts.metricName)); err != nil {
|
||||
return fmt.Errorf("cannot unmarshal metricName %q: %s", pts.metricName, err)
|
||||
}
|
||||
|
||||
workersCount := 1 + len(pts.addrs)/32
|
||||
if workersCount > maxWorkersCount {
|
||||
workersCount = maxWorkersCount
|
||||
}
|
||||
if workersCount == 0 {
|
||||
logger.Panicf("BUG: workersCount cannot be zero")
|
||||
}
|
||||
|
||||
sbs := make([]*sortBlock, 0, len(pts.addrs))
|
||||
var sbsLock sync.Mutex
|
||||
|
||||
workCh := make(chan tmpBlockAddr, workersCount)
|
||||
doneCh := make(chan error)
|
||||
|
||||
// Start workers
|
||||
for i := 0; i < workersCount; i++ {
|
||||
go func() {
|
||||
var err error
|
||||
for addr := range workCh {
|
||||
sb := getSortBlock()
|
||||
if err = sb.unpackFrom(tbf, addr, tr, fetchData); err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
sbsLock.Lock()
|
||||
sbs = append(sbs, sb)
|
||||
sbsLock.Unlock()
|
||||
}
|
||||
|
||||
// Drain the remaining work
|
||||
for range workCh {
|
||||
}
|
||||
doneCh <- err
|
||||
}()
|
||||
return fmt.Errorf("cannot unmarshal metricName %q: %w", pts.metricName, err)
|
||||
}
|
||||
|
||||
// Feed workers with work
|
||||
for _, addr := range pts.addrs {
|
||||
workCh <- addr
|
||||
upws := make([]*unpackWork, len(pts.brs))
|
||||
for i, br := range pts.brs {
|
||||
upw := &unpackWork{
|
||||
br: br,
|
||||
tr: tr,
|
||||
fetchData: fetchData,
|
||||
doneCh: make(chan error, 1),
|
||||
}
|
||||
unpackWorkCh <- upw
|
||||
upws[i] = upw
|
||||
}
|
||||
pts.addrs = pts.addrs[:0]
|
||||
close(workCh)
|
||||
pts.brs = pts.brs[:0]
|
||||
|
||||
// Wait until workers finish
|
||||
var errors []error
|
||||
for i := 0; i < workersCount; i++ {
|
||||
if err := <-doneCh; err != nil {
|
||||
errors = append(errors, err)
|
||||
// Wait until work is complete
|
||||
sbs := make([]*sortBlock, 0, len(pts.brs))
|
||||
var firstErr error
|
||||
for _, upw := range upws {
|
||||
if err := <-upw.doneCh; err != nil && firstErr == nil {
|
||||
// Return the first error only, since other errors are likely the same.
|
||||
firstErr = err
|
||||
}
|
||||
if firstErr == nil {
|
||||
sbs = append(sbs, upw.sb)
|
||||
} else {
|
||||
putSortBlock(upw.sb)
|
||||
}
|
||||
}
|
||||
if len(errors) > 0 {
|
||||
// Return the first error only, since other errors are likely the same.
|
||||
return errors[0]
|
||||
if firstErr != nil {
|
||||
return firstErr
|
||||
}
|
||||
|
||||
// Merge blocks
|
||||
mergeSortBlocks(dst, sbs)
|
||||
return nil
|
||||
}
|
||||
@@ -314,11 +325,11 @@ func (sb *sortBlock) reset() {
|
||||
sb.NextIdx = 0
|
||||
}
|
||||
|
||||
func (sb *sortBlock) unpackFrom(tbf *tmpBlocksFile, addr tmpBlockAddr, tr storage.TimeRange, fetchData bool) error {
|
||||
tbf.MustReadBlockAt(&sb.b, addr)
|
||||
func (sb *sortBlock) unpackFrom(br storage.BlockRef, tr storage.TimeRange, fetchData bool) error {
|
||||
br.MustReadBlock(&sb.b, fetchData)
|
||||
if fetchData {
|
||||
if err := sb.b.UnmarshalData(); err != nil {
|
||||
return fmt.Errorf("cannot unmarshal block: %s", err)
|
||||
return fmt.Errorf("cannot unmarshal block: %w", err)
|
||||
}
|
||||
}
|
||||
timestamps := sb.b.Timestamps()
|
||||
@@ -387,7 +398,7 @@ func DeleteSeries(sq *storage.SearchQuery) (int, error) {
|
||||
func GetLabels(deadline Deadline) ([]string, error) {
|
||||
labels, err := vmstorage.SearchTagKeys(*maxTagKeysPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during labels search: %s", err)
|
||||
return nil, fmt.Errorf("error during labels search: %w", err)
|
||||
}
|
||||
|
||||
// Substitute "" with "__name__"
|
||||
@@ -413,7 +424,7 @@ func GetLabelValues(labelName string, deadline Deadline) ([]string, error) {
|
||||
// Search for tag values
|
||||
labelValues, err := vmstorage.SearchTagValues([]byte(labelName), *maxTagValuesPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during label values search for labelName=%q: %s", labelName, err)
|
||||
return nil, fmt.Errorf("error during label values search for labelName=%q: %w", labelName, err)
|
||||
}
|
||||
|
||||
// Sort labelValues like Prometheus does
|
||||
@@ -426,7 +437,7 @@ func GetLabelValues(labelName string, deadline Deadline) ([]string, error) {
|
||||
func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
|
||||
labelEntries, err := vmstorage.SearchTagEntries(*maxTagKeysPerSearch, *maxTagValuesPerSearch)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during label entries request: %s", err)
|
||||
return nil, fmt.Errorf("error during label entries request: %w", err)
|
||||
}
|
||||
|
||||
// Substitute "" with "__name__"
|
||||
@@ -449,11 +460,20 @@ func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
|
||||
return labelEntries, nil
|
||||
}
|
||||
|
||||
// GetTSDBStatusForDate returns tsdb status according to https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats
|
||||
func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TSDBStatus, error) {
|
||||
status, err := vmstorage.GetTSDBStatusForDate(date, topN)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error during tsdb status request: %w", err)
|
||||
}
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// GetSeriesCount returns the number of unique series.
|
||||
func GetSeriesCount(deadline Deadline) (uint64, error) {
|
||||
n, err := vmstorage.GetSeriesCount()
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("error during series count request: %s", err)
|
||||
return 0, fmt.Errorf("error during series count request: %w", err)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
@@ -474,6 +494,8 @@ func putStorageSearch(sr *storage.Search) {
|
||||
var ssPool sync.Pool
|
||||
|
||||
// ProcessSearchQuery performs sq on storage nodes until the given deadline.
|
||||
//
|
||||
// Results.RunParallel or Results.Cancel must be called on the returned Results.
|
||||
func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadline) (*Results, error) {
|
||||
// Setup search.
|
||||
tfss, err := setupTfss(sq.TagFilterss)
|
||||
@@ -484,87 +506,51 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
|
||||
MinTimestamp: sq.MinTimestamp,
|
||||
MaxTimestamp: sq.MaxTimestamp,
|
||||
}
|
||||
if err := vmstorage.CheckTimeRange(tr); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
vmstorage.WG.Add(1)
|
||||
defer vmstorage.WG.Done()
|
||||
|
||||
sr := getStorageSearch()
|
||||
defer putStorageSearch(sr)
|
||||
sr.Init(vmstorage.Storage, tfss, tr, fetchData, *maxMetricsPerSearch)
|
||||
sr.Init(vmstorage.Storage, tfss, tr, *maxMetricsPerSearch)
|
||||
|
||||
tbf := getTmpBlocksFile()
|
||||
m := make(map[string][]tmpBlockAddr)
|
||||
m := make(map[string][]storage.BlockRef)
|
||||
var orderedMetricNames []string
|
||||
blocksRead := 0
|
||||
bb := tmpBufPool.Get()
|
||||
defer tmpBufPool.Put(bb)
|
||||
for sr.NextMetricBlock() {
|
||||
blocksRead++
|
||||
bb.B = storage.MarshalBlock(bb.B[:0], sr.MetricBlock.Block)
|
||||
addr, err := tbf.WriteBlockData(bb.B)
|
||||
if err != nil {
|
||||
putTmpBlocksFile(tbf)
|
||||
return nil, fmt.Errorf("cannot write data block #%d to temporary blocks file: %s", blocksRead, err)
|
||||
}
|
||||
if time.Until(deadline.Deadline) < 0 {
|
||||
putTmpBlocksFile(tbf)
|
||||
return nil, fmt.Errorf("timeout exceeded while fetching data block #%d from storage: %s", blocksRead, deadline.String())
|
||||
}
|
||||
metricName := sr.MetricBlock.MetricName
|
||||
m[string(metricName)] = append(m[string(metricName)], addr)
|
||||
metricName := sr.MetricBlockRef.MetricName
|
||||
brs := m[string(metricName)]
|
||||
if len(brs) == 0 {
|
||||
orderedMetricNames = append(orderedMetricNames, string(metricName))
|
||||
}
|
||||
m[string(metricName)] = append(brs, *sr.MetricBlockRef.BlockRef)
|
||||
}
|
||||
if err := sr.Error(); err != nil {
|
||||
putTmpBlocksFile(tbf)
|
||||
return nil, fmt.Errorf("search error after reading %d data blocks: %s", blocksRead, err)
|
||||
}
|
||||
if err := tbf.Finalize(); err != nil {
|
||||
putTmpBlocksFile(tbf)
|
||||
return nil, fmt.Errorf("cannot finalize temporary blocks file with %d blocks: %s", blocksRead, err)
|
||||
return nil, fmt.Errorf("search error after reading %d data blocks: %w", blocksRead, err)
|
||||
}
|
||||
|
||||
var rss Results
|
||||
rss.packedTimeseries = make([]packedTimeseries, len(m))
|
||||
rss.tr = tr
|
||||
rss.fetchData = fetchData
|
||||
rss.deadline = deadline
|
||||
rss.tbf = tbf
|
||||
i := 0
|
||||
for metricName, addrs := range m {
|
||||
pts := &rss.packedTimeseries[i]
|
||||
i++
|
||||
pts.metricName = metricName
|
||||
pts.addrs = addrs
|
||||
pts := make([]packedTimeseries, len(orderedMetricNames))
|
||||
for i, metricName := range orderedMetricNames {
|
||||
pts[i] = packedTimeseries{
|
||||
metricName: metricName,
|
||||
brs: m[metricName],
|
||||
}
|
||||
}
|
||||
|
||||
// Sort rss.packedTimeseries by the first addr offset in order
|
||||
// to reduce the number of disk seeks during unpacking in RunParallel.
|
||||
// In this case tmpBlocksFile must be read almost sequentially.
|
||||
sort.Slice(rss.packedTimeseries, func(i, j int) bool {
|
||||
pts := rss.packedTimeseries
|
||||
return pts[i].addrs[0].offset < pts[j].addrs[0].offset
|
||||
})
|
||||
|
||||
rss.packedTimeseries = pts
|
||||
rss.sr = sr
|
||||
return &rss, nil
|
||||
}
|
||||
|
||||
func getResult() *Result {
|
||||
v := rsPool.Get()
|
||||
if v == nil {
|
||||
return &Result{}
|
||||
}
|
||||
return v.(*Result)
|
||||
}
|
||||
|
||||
func putResult(rs *Result) {
|
||||
if len(rs.Values) > 8192 {
|
||||
// Do not pool big results, since they may occupy too much memory.
|
||||
return
|
||||
}
|
||||
rs.reset()
|
||||
rsPool.Put(rs)
|
||||
}
|
||||
|
||||
var rsPool sync.Pool
|
||||
|
||||
func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error) {
|
||||
tfss := make([]*storage.TagFilters, 0, len(tagFilterss))
|
||||
for _, tagFilters := range tagFilterss {
|
||||
@@ -572,10 +558,11 @@ func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error)
|
||||
for i := range tagFilters {
|
||||
tf := &tagFilters[i]
|
||||
if err := tfs.Add(tf.Key, tf.Value, tf.IsNegative, tf.IsRegexp); err != nil {
|
||||
return nil, fmt.Errorf("cannot parse tag filter %s: %s", tf, err)
|
||||
return nil, fmt.Errorf("cannot parse tag filter %s: %w", tf, err)
|
||||
}
|
||||
}
|
||||
tfss = append(tfss, tfs)
|
||||
tfss = append(tfss, tfs.Finalize()...)
|
||||
}
|
||||
return tfss, nil
|
||||
}
|
||||
|
||||
@@ -1,185 +0,0 @@
|
||||
package netstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// InitTmpBlocksDir initializes directory to store temporary search results.
|
||||
//
|
||||
// It stores data in system-defined temporary directory if tmpDirPath is empty.
|
||||
func InitTmpBlocksDir(tmpDirPath string) {
|
||||
if len(tmpDirPath) == 0 {
|
||||
tmpDirPath = os.TempDir()
|
||||
}
|
||||
tmpBlocksDir = tmpDirPath + "/searchResults"
|
||||
fs.MustRemoveAll(tmpBlocksDir)
|
||||
if err := fs.MkdirAllIfNotExist(tmpBlocksDir); err != nil {
|
||||
logger.Panicf("FATAL: cannot create %q: %s", tmpBlocksDir, err)
|
||||
}
|
||||
}
|
||||
|
||||
var tmpBlocksDir string
|
||||
|
||||
func maxInmemoryTmpBlocksFile() int {
|
||||
mem := memory.Allowed()
|
||||
maxLen := mem / 1024
|
||||
if maxLen < 64*1024 {
|
||||
return 64 * 1024
|
||||
}
|
||||
if maxLen > 4*1024*1024 {
|
||||
return 4 * 1024 * 1024
|
||||
}
|
||||
return maxLen
|
||||
}
|
||||
|
||||
var _ = metrics.NewGauge(`vm_tmp_blocks_max_inmemory_file_size_bytes`, func() float64 {
|
||||
return float64(maxInmemoryTmpBlocksFile())
|
||||
})
|
||||
|
||||
type tmpBlocksFile struct {
|
||||
buf []byte
|
||||
|
||||
f *os.File
|
||||
r *fs.ReaderAt
|
||||
|
||||
offset uint64
|
||||
}
|
||||
|
||||
func getTmpBlocksFile() *tmpBlocksFile {
|
||||
v := tmpBlocksFilePool.Get()
|
||||
if v == nil {
|
||||
return &tmpBlocksFile{
|
||||
buf: make([]byte, 0, maxInmemoryTmpBlocksFile()),
|
||||
}
|
||||
}
|
||||
return v.(*tmpBlocksFile)
|
||||
}
|
||||
|
||||
func putTmpBlocksFile(tbf *tmpBlocksFile) {
|
||||
tbf.MustClose()
|
||||
tbf.buf = tbf.buf[:0]
|
||||
tbf.f = nil
|
||||
tbf.r = nil
|
||||
tbf.offset = 0
|
||||
tmpBlocksFilePool.Put(tbf)
|
||||
}
|
||||
|
||||
var tmpBlocksFilePool sync.Pool
|
||||
|
||||
type tmpBlockAddr struct {
|
||||
offset uint64
|
||||
size int
|
||||
}
|
||||
|
||||
func (addr tmpBlockAddr) String() string {
|
||||
return fmt.Sprintf("offset %d, size %d", addr.offset, addr.size)
|
||||
}
|
||||
|
||||
var tmpBlocksFilesCreated = metrics.NewCounter(`vm_tmp_blocks_files_created_total`)
|
||||
|
||||
// WriteBlockData writes b to tbf.
|
||||
//
|
||||
// It returns errors since the operation may fail on space shortage
|
||||
// and this must be handled.
|
||||
func (tbf *tmpBlocksFile) WriteBlockData(b []byte) (tmpBlockAddr, error) {
|
||||
var addr tmpBlockAddr
|
||||
addr.offset = tbf.offset
|
||||
addr.size = len(b)
|
||||
tbf.offset += uint64(addr.size)
|
||||
if len(tbf.buf)+len(b) <= cap(tbf.buf) {
|
||||
// Fast path - the data fits tbf.buf
|
||||
tbf.buf = append(tbf.buf, b...)
|
||||
return addr, nil
|
||||
}
|
||||
|
||||
// Slow path: flush the data from tbf.buf to file.
|
||||
if tbf.f == nil {
|
||||
f, err := ioutil.TempFile(tmpBlocksDir, "")
|
||||
if err != nil {
|
||||
return addr, err
|
||||
}
|
||||
tbf.f = f
|
||||
tmpBlocksFilesCreated.Inc()
|
||||
}
|
||||
_, err := tbf.f.Write(tbf.buf)
|
||||
tbf.buf = append(tbf.buf[:0], b...)
|
||||
if err != nil {
|
||||
return addr, fmt.Errorf("cannot write block to %q: %s", tbf.f.Name(), err)
|
||||
}
|
||||
return addr, nil
|
||||
}
|
||||
|
||||
func (tbf *tmpBlocksFile) Finalize() error {
|
||||
if tbf.f == nil {
|
||||
return nil
|
||||
}
|
||||
fname := tbf.f.Name()
|
||||
if _, err := tbf.f.Write(tbf.buf); err != nil {
|
||||
return fmt.Errorf("cannot write the remaining %d bytes to %q: %s", len(tbf.buf), fname, err)
|
||||
}
|
||||
tbf.buf = tbf.buf[:0]
|
||||
r, err := fs.OpenReaderAt(fname)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot open %q: %s", fname, err)
|
||||
}
|
||||
// Hint the OS that the file is read almost sequentiallly.
|
||||
// This should reduce the number of disk seeks, which is important
|
||||
// for HDDs.
|
||||
r.MustFadviseSequentialRead(true)
|
||||
tbf.r = r
|
||||
return nil
|
||||
}
|
||||
|
||||
func (tbf *tmpBlocksFile) MustReadBlockAt(dst *storage.Block, addr tmpBlockAddr) {
|
||||
var buf []byte
|
||||
if tbf.f == nil {
|
||||
buf = tbf.buf[addr.offset : addr.offset+uint64(addr.size)]
|
||||
} else {
|
||||
bb := tmpBufPool.Get()
|
||||
defer tmpBufPool.Put(bb)
|
||||
bb.B = bytesutil.Resize(bb.B, addr.size)
|
||||
tbf.r.MustReadAt(bb.B, int64(addr.offset))
|
||||
buf = bb.B
|
||||
}
|
||||
tail, err := storage.UnmarshalBlock(dst, buf)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot unmarshal data at %s: %s", addr, err)
|
||||
}
|
||||
if len(tail) > 0 {
|
||||
logger.Panicf("FATAL: unexpected non-empty tail left after unmarshaling data at %s; len(tail)=%d", addr, len(tail))
|
||||
}
|
||||
}
|
||||
|
||||
var tmpBufPool bytesutil.ByteBufferPool
|
||||
|
||||
func (tbf *tmpBlocksFile) MustClose() {
|
||||
if tbf.f == nil {
|
||||
return
|
||||
}
|
||||
if tbf.r != nil {
|
||||
// tbf.r could be nil if Finalize wasn't called.
|
||||
tbf.r.MustClose()
|
||||
}
|
||||
fname := tbf.f.Name()
|
||||
|
||||
// Remove the file at first, then close it.
|
||||
// This way the OS shouldn't try to flush file contents to storage
|
||||
// on close.
|
||||
if err := os.Remove(fname); err != nil {
|
||||
logger.Panicf("FATAL: cannot remove %q: %s", fname, err)
|
||||
}
|
||||
if err := tbf.f.Close(); err != nil {
|
||||
logger.Panicf("FATAL: cannot close %q: %s", fname, err)
|
||||
}
|
||||
tbf.f = nil
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user