mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-08 19:33:35 +03:00
Compare commits
443 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef690932ee | ||
|
|
a95b96979c | ||
|
|
a96eb16329 | ||
|
|
2b59fff526 | ||
|
|
57143e9435 | ||
|
|
7bad7133bc | ||
|
|
ad35068c3a | ||
|
|
5acd70109b | ||
|
|
569b0d444c | ||
|
|
50cf74ce4b | ||
|
|
077193d87c | ||
|
|
7da20a4b3f | ||
|
|
cde1e2ec93 | ||
|
|
319e910897 | ||
|
|
cae61c85d4 | ||
|
|
7ecb72648d | ||
|
|
29cebb3d95 | ||
|
|
4785d04312 | ||
|
|
0c0efc7781 | ||
|
|
4ecb86c179 | ||
|
|
d4f14f4879 | ||
|
|
d011446f6f | ||
|
|
70bb0d2708 | ||
|
|
43df19a742 | ||
|
|
3d3b9e3b59 | ||
|
|
19ecc4b2c3 | ||
|
|
f47d67d836 | ||
|
|
4aa5f70f21 | ||
|
|
73789b333f | ||
|
|
72fd976cb3 | ||
|
|
f166f80f15 | ||
|
|
0fd4c48568 | ||
|
|
e2b1097545 | ||
|
|
f977ca8eaf | ||
|
|
1c38ff6f48 | ||
|
|
a9b6cf53a2 | ||
|
|
1354e6d712 | ||
|
|
0989649ad0 | ||
|
|
0123295d50 | ||
|
|
56de8f0356 | ||
|
|
e210384f7e | ||
|
|
cb878d50fc | ||
|
|
3a2a60cb08 | ||
|
|
2ea540a5aa | ||
|
|
c8d29ed78e | ||
|
|
afc2e73948 | ||
|
|
9a88c1a91e | ||
|
|
6e364e19ef | ||
|
|
a462b97859 | ||
|
|
1fa0f3ec89 | ||
|
|
7ce40d74d7 | ||
|
|
7377163659 | ||
|
|
c46d9be108 | ||
|
|
f8dfc22350 | ||
|
|
76a477c609 | ||
|
|
23d0fc220d | ||
|
|
c8f356a6a8 | ||
|
|
b421a1f57b | ||
|
|
a8de1ab000 | ||
|
|
e1311409db | ||
|
|
f36e8debc7 | ||
|
|
c8c6f5b15e | ||
|
|
f367ff086c | ||
|
|
5ab6c350ec | ||
|
|
a1e17e91f8 | ||
|
|
82659ab5b6 | ||
|
|
87d356348b | ||
|
|
e78f3ac8ac | ||
|
|
ec03dec72d | ||
|
|
620b605786 | ||
|
|
20bb5e703c | ||
|
|
e3a10b327c | ||
|
|
2ae3a9a8a3 | ||
|
|
88605a7ea2 | ||
|
|
5d9b9b88b9 | ||
|
|
39aabdbadc | ||
|
|
563d76dedb | ||
|
|
27f87d4797 | ||
|
|
06a8a981c3 | ||
|
|
c0808a4146 | ||
|
|
db781a9342 | ||
|
|
e5868b9c29 | ||
|
|
65afe3b141 | ||
|
|
bba5e62911 | ||
|
|
220d193244 | ||
|
|
6e49ed4af0 | ||
|
|
91b9b5a808 | ||
|
|
7e7d8abc4a | ||
|
|
d63bb52c0f | ||
|
|
09c6c7350b | ||
|
|
e6acd16daf | ||
|
|
11869a8307 | ||
|
|
11ae1ae924 | ||
|
|
8ae9825bb4 | ||
|
|
7bfb5efaef | ||
|
|
f30044cd5c | ||
|
|
54ec080bbc | ||
|
|
3eef1ddc7d | ||
|
|
370024c7ed | ||
|
|
e03e46af4e | ||
|
|
fb6eab03a2 | ||
|
|
918ed5cb32 | ||
|
|
894416b4ca | ||
|
|
0fa7effc4b | ||
|
|
565bd08c43 | ||
|
|
77e9992fee | ||
|
|
ef1afeed6c | ||
|
|
8b21f40217 | ||
|
|
74bb9ea734 | ||
|
|
227d5182af | ||
|
|
7717967d42 | ||
|
|
702aa4948b | ||
|
|
df088dd78a | ||
|
|
9576bd875a | ||
|
|
7a0e1e252f | ||
|
|
470cd639c6 | ||
|
|
3f8ab2e4be | ||
|
|
ce8d28f8f4 | ||
|
|
0a4aadffac | ||
|
|
c84a8b34cc | ||
|
|
7da4068f48 | ||
|
|
e8fdb27625 | ||
|
|
59877d9f32 | ||
|
|
e757ebc58b | ||
|
|
9fe2e4e2c2 | ||
|
|
0d79c8cbef | ||
|
|
7e99bbb967 | ||
|
|
8bf3fb917a | ||
|
|
a16f1ae565 | ||
|
|
af5bdb9254 | ||
|
|
0d47c23a03 | ||
|
|
3f49bdaeff | ||
|
|
d128a5bf99 | ||
|
|
fbac1a9dad | ||
|
|
62b46007c5 | ||
|
|
acbea6c1ee | ||
|
|
205d34eae6 | ||
|
|
0017814ad4 | ||
|
|
58ca52faf8 | ||
|
|
3e91e15d1c | ||
|
|
87b393fb91 | ||
|
|
df5b0067ca | ||
|
|
f2b711b976 | ||
|
|
b9381ccf8b | ||
|
|
6d277396c3 | ||
|
|
b0c8337618 | ||
|
|
ec86f2289c | ||
|
|
0a38542a45 | ||
|
|
75f3db1f5c | ||
|
|
1685e181ae | ||
|
|
f72b35665f | ||
|
|
ed12c60826 | ||
|
|
5d45ea1003 | ||
|
|
5808774e06 | ||
|
|
5a7f9d1cf4 | ||
|
|
69d1893f4c | ||
|
|
f620f159a5 | ||
|
|
14799e69d7 | ||
|
|
6f34934b41 | ||
|
|
909712846d | ||
|
|
eb4fb60ee1 | ||
|
|
979d89f5dd | ||
|
|
e5ebdb9b1a | ||
|
|
b6ed9afd6d | ||
|
|
affaf373ea | ||
|
|
e93e168bdc | ||
|
|
7cd371f08f | ||
|
|
ea86716d06 | ||
|
|
debe75f51c | ||
|
|
3ac3124eed | ||
|
|
10590bde47 | ||
|
|
2b87b4d183 | ||
|
|
71ef3155c8 | ||
|
|
c3affb0c4f | ||
|
|
3c3805865b | ||
|
|
3d19fa6932 | ||
|
|
9dd191b27c | ||
|
|
5366d9be73 | ||
|
|
6ff71474a6 | ||
|
|
b71be42d90 | ||
|
|
21c92d7ef1 | ||
|
|
88a2659f1a | ||
|
|
445edcc6ac | ||
|
|
5748139aa4 | ||
|
|
424121c126 | ||
|
|
55facde841 | ||
|
|
ee5da826e9 | ||
|
|
b3e1119592 | ||
|
|
2efa46a11c | ||
|
|
64720d3c03 | ||
|
|
6f685c8e43 | ||
|
|
d91c1d4eee | ||
|
|
0f9e107e36 | ||
|
|
ce5082912b | ||
|
|
ad6bdd78d0 | ||
|
|
e29b2b8444 | ||
|
|
0d3e00e512 | ||
|
|
ac502785b6 | ||
|
|
1215f51043 | ||
|
|
3d890e89f1 | ||
|
|
75e84144c7 | ||
|
|
1d7c877b7b | ||
|
|
93c2db5546 | ||
|
|
578a37aa14 | ||
|
|
c90c1c4d54 | ||
|
|
d924f4b7ba | ||
|
|
f3c1c2e2ec | ||
|
|
f10c38b827 | ||
|
|
96dce63dbd | ||
|
|
b1f94f7f0e | ||
|
|
e08b74fcd6 | ||
|
|
33fd30ff61 | ||
|
|
a56b77db5b | ||
|
|
d8ffbf55a2 | ||
|
|
ea153e5f90 | ||
|
|
cf1a8bce6b | ||
|
|
08428464e9 | ||
|
|
e3adcbec6e | ||
|
|
3cb72ccc2a | ||
|
|
4e7f7f3302 | ||
|
|
f9a17cb5fe | ||
|
|
a9bb22b213 | ||
|
|
8f2d03fdc7 | ||
|
|
480e40b344 | ||
|
|
4e722c459b | ||
|
|
db8c4054e5 | ||
|
|
4507b111a9 | ||
|
|
2455a988e4 | ||
|
|
af77f449da | ||
|
|
c1997889f8 | ||
|
|
a6a2c5324a | ||
|
|
fb5614ab5c | ||
|
|
a1b494ac91 | ||
|
|
481ce692c7 | ||
|
|
107b637aef | ||
|
|
a6e66b1f6f | ||
|
|
26e85e642b | ||
|
|
42fe13995e | ||
|
|
5ea197f300 | ||
|
|
0028b2c6d1 | ||
|
|
a8acad7453 | ||
|
|
e855b202df | ||
|
|
3e783aa2a1 | ||
|
|
9bb60ab00f | ||
|
|
a19e7f8c5b | ||
|
|
de26d1ff23 | ||
|
|
d0f785defd | ||
|
|
46bd2c4d6d | ||
|
|
e86b7cc9a5 | ||
|
|
c3d02ee75a | ||
|
|
cde4664f0d | ||
|
|
21bd204e81 | ||
|
|
8b36044c93 | ||
|
|
baab622db6 | ||
|
|
cf3a041c2f | ||
|
|
865f09ecbb | ||
|
|
ba1b3b8ef2 | ||
|
|
b5b3c585b3 | ||
|
|
2968779f16 | ||
|
|
96b7de6736 | ||
|
|
4b850c2a59 | ||
|
|
4ef32df4fa | ||
|
|
6530bcedec | ||
|
|
a6587ded51 | ||
|
|
55e3bbd4cc | ||
|
|
f57982eddc | ||
|
|
5da71eb685 | ||
|
|
2016a2c899 | ||
|
|
d4b09896fa | ||
|
|
9c62b25ad6 | ||
|
|
4bdd10ab90 | ||
|
|
e13ce2ee98 | ||
|
|
a8509c112a | ||
|
|
a8d22e1223 | ||
|
|
f50cf60534 | ||
|
|
ead66155ef | ||
|
|
e7f1ceeb84 | ||
|
|
15475a9d1f | ||
|
|
d2ac954fe1 | ||
|
|
3d8a4bf023 | ||
|
|
7edf8be3bc | ||
|
|
6a519896db | ||
|
|
02a1a39796 | ||
|
|
4477a2e513 | ||
|
|
53852e35d8 | ||
|
|
b8a47c6589 | ||
|
|
9226b9917a | ||
|
|
5ae9892f5f | ||
|
|
86a7a72400 | ||
|
|
96aa3761fc | ||
|
|
1999bbfe82 | ||
|
|
97947c5fcf | ||
|
|
f6899cc289 | ||
|
|
527bee4b1e | ||
|
|
2e59b17108 | ||
|
|
e02e0508da | ||
|
|
ac92d471a6 | ||
|
|
f0eb1f3749 | ||
|
|
74a2297dcc | ||
|
|
e3995572bb | ||
|
|
2ef3fabcb8 | ||
|
|
a41c34705e | ||
|
|
f4989edd96 | ||
|
|
91f2af2d7a | ||
|
|
285bb2bbec | ||
|
|
4c13bae1cf | ||
|
|
132425eb46 | ||
|
|
c3ea279080 | ||
|
|
b38c54a25e | ||
|
|
b4ec350a94 | ||
|
|
789bee8792 | ||
|
|
7d73bb4f40 | ||
|
|
c60b5d4f00 | ||
|
|
015eb6faa7 | ||
|
|
624107deae | ||
|
|
746ee191e8 | ||
|
|
f5f27a5fbf | ||
|
|
0d7374ad2f | ||
|
|
ceb1376267 | ||
|
|
ad5059f2d3 | ||
|
|
e46b7d33a7 | ||
|
|
ede93469ea | ||
|
|
5f84b17ed6 | ||
|
|
3ea054a52c | ||
|
|
adbb821eac | ||
|
|
eb4bd92fac | ||
|
|
00b7c97d2a | ||
|
|
ea87f21e23 | ||
|
|
9797c928ef | ||
|
|
145337792d | ||
|
|
84f6b3014c | ||
|
|
56168d8565 | ||
|
|
109363de49 | ||
|
|
ccf04239e6 | ||
|
|
98edeac7b7 | ||
|
|
d79a915583 | ||
|
|
8f5902dfcf | ||
|
|
bce7d7ac60 | ||
|
|
919ee73153 | ||
|
|
f2cc4e0436 | ||
|
|
0b0bf94c96 | ||
|
|
672fcba223 | ||
|
|
5a77c86e97 | ||
|
|
8bdc45ba00 | ||
|
|
70737ea4ac | ||
|
|
dcadec65b6 | ||
|
|
8e3f9c1fbb | ||
|
|
ca11def2a5 | ||
|
|
e933e3150d | ||
|
|
fcd33fc409 | ||
|
|
c2a3911bb5 | ||
|
|
dbfa1421ac | ||
|
|
74a4c29729 | ||
|
|
44f4c4f9ba | ||
|
|
ce602827e5 | ||
|
|
dc7b63a793 | ||
|
|
a5265e2a56 | ||
|
|
060f17d1d8 | ||
|
|
aba94ef4d6 | ||
|
|
e0314ad8ca | ||
|
|
fc76ecde13 | ||
|
|
1bdc71d917 | ||
|
|
f41846d002 | ||
|
|
96707223db | ||
|
|
d7d83d6d93 | ||
|
|
1d05444b33 | ||
|
|
4e84c38b70 | ||
|
|
831b93a755 | ||
|
|
80f03177c4 | ||
|
|
80f966b80c | ||
|
|
355a63733d | ||
|
|
c883c15878 | ||
|
|
9469696e46 | ||
|
|
4e7026320a | ||
|
|
7d5ed49d23 | ||
|
|
5c321c7178 | ||
|
|
17eb86a689 | ||
|
|
68a117a25a | ||
|
|
7a2c46d951 | ||
|
|
b434be3d2d | ||
|
|
bd9e30c054 | ||
|
|
90c844576e | ||
|
|
ae897372bc | ||
|
|
ad2fc75676 | ||
|
|
c2dbc642e7 | ||
|
|
2e7b537b68 | ||
|
|
f847efe621 | ||
|
|
77bfa8181d | ||
|
|
e47385d34a | ||
|
|
71fa1c8baf | ||
|
|
bdba50432b | ||
|
|
e4e36383e2 | ||
|
|
bc03ab6688 | ||
|
|
46c310f62f | ||
|
|
b8369e2f3e | ||
|
|
dd1b789c15 | ||
|
|
c70c064752 | ||
|
|
ae89b4e818 | ||
|
|
dbe592597f | ||
|
|
178dd87e26 | ||
|
|
38bf5fc136 | ||
|
|
e1d7cbfc77 | ||
|
|
ced5f2e5e7 | ||
|
|
60266078ca | ||
|
|
5ce94e1dd3 | ||
|
|
ac47733044 | ||
|
|
ceade70d4e | ||
|
|
89ff7b2465 | ||
|
|
042570584f | ||
|
|
8262372d72 | ||
|
|
6e75129c77 | ||
|
|
cbeaa000ef | ||
|
|
72d127e187 | ||
|
|
70bd94b50b | ||
|
|
b1b67169f1 | ||
|
|
a8d74e15dd | ||
|
|
7339645a29 | ||
|
|
12d0a59074 | ||
|
|
923bb42cb9 | ||
|
|
a6a39a2591 | ||
|
|
5721804047 | ||
|
|
498b166e5f | ||
|
|
cc63f80193 | ||
|
|
2104330d4c | ||
|
|
681a800086 | ||
|
|
f0c331c724 | ||
|
|
b5ce35dfc8 | ||
|
|
543bd0ea0c | ||
|
|
cbaa2af280 | ||
|
|
c7826ab36e | ||
|
|
8ff7da7202 | ||
|
|
f40b1e7e9f | ||
|
|
9e17b51d45 | ||
|
|
0f97c34204 | ||
|
|
06cf4e0f70 | ||
|
|
9bb7905d26 | ||
|
|
4b40acd964 | ||
|
|
ce333f28d8 | ||
|
|
3cfb90b227 | ||
|
|
34fdc8881b | ||
|
|
5dc9ab5829 | ||
|
|
d44cc14c6b | ||
|
|
ee17516afd |
@@ -4,3 +4,4 @@ gocache-for-docker
|
||||
victoria-metrics-data
|
||||
vmstorage-data
|
||||
vmselect-cache
|
||||
.vscode
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,6 +4,7 @@
|
||||
*.pprof
|
||||
/bin
|
||||
.idea
|
||||
.vscode
|
||||
*.test
|
||||
*.swp
|
||||
/gocache-for-docker
|
||||
|
||||
@@ -68,9 +68,9 @@ members of the project's leadership.
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
||||
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
|
||||
available at <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>
|
||||
|
||||
[homepage]: https://www.contributor-covenant.org
|
||||
|
||||
For answers to common questions about this code of conduct, see
|
||||
https://www.contributor-covenant.org/faq
|
||||
<https://www.contributor-covenant.org/faq>
|
||||
|
||||
@@ -79,7 +79,7 @@
|
||||
|
||||
**Последствия**: Предупреждение о последствиях в случае продолжающегося неуместного поведения.
|
||||
На определенное время не допускается взаимодействие с людьми, вовлеченными в инцидент,
|
||||
включая незапрошенное взаимодействие
|
||||
включая незапрошенное взаимодействие
|
||||
с теми, кто обеспечивает соблюдение Кодекса. Это включает в себя избегание взаимодействия
|
||||
в публичных пространствах, а так же во внешних каналах,
|
||||
таких как социальные сети. Нарушение этих правил влечет за собой временный или вечный бан.
|
||||
@@ -89,10 +89,10 @@
|
||||
**Общественное влияние**: Серьёзное нарушение стандартов сообщества,
|
||||
включая продолжительное неуместное поведение.
|
||||
|
||||
**Последствия**: Временный запрет (бан) на любое взаимодействие
|
||||
**Последствия**: Временный запрет (бан) на любое взаимодействие
|
||||
или публичное общение с сообществом на определенный период времени.
|
||||
На этот период не допускается публичное или личное взаимодействие с людьми,
|
||||
вовлеченными в инцидент, включая незапрошенное взаимодействие
|
||||
вовлеченными в инцидент, включая незапрошенное взаимодействие
|
||||
с теми, кто обеспечивает соблюдение Кодекса.
|
||||
Нарушение этих правил влечет за собой вечный бан.
|
||||
|
||||
@@ -108,7 +108,7 @@
|
||||
|
||||
Данный Кодекс Поведения основан на [Кодекс Поведения участника][homepage],
|
||||
версии 2.0, доступной по адресу
|
||||
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
|
||||
<https://www.contributor-covenant.org/version/2/0/code_of_conduct.html>.
|
||||
|
||||
Принципы Воздействия в Сообществе были вдохновлены [Mozilla's code of conduct
|
||||
enforcement ladder](https://github.com/mozilla/diversity).
|
||||
@@ -116,5 +116,5 @@ enforcement ladder](https://github.com/mozilla/diversity).
|
||||
[homepage]: https://www.contributor-covenant.org
|
||||
|
||||
Ответы на общие вопросы о данном кодексе поведения ищите на странице FAQ:
|
||||
https://www.contributor-covenant.org/faq. Переводы доступны по адресу
|
||||
https://www.contributor-covenant.org/translations.
|
||||
<https://www.contributor-covenant.org/faq>. Переводы доступны по адресу
|
||||
<https://www.contributor-covenant.org/translations>.
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -175,7 +175,7 @@
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
Copyright 2019-2021 VictoriaMetrics, Inc.
|
||||
Copyright 2019-2022 VictoriaMetrics, Inc.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
||||
86
Makefile
86
Makefile
@@ -100,66 +100,82 @@ release: \
|
||||
release-victoria-metrics: \
|
||||
release-victoria-metrics-amd64 \
|
||||
release-victoria-metrics-arm \
|
||||
release-victoria-metrics-arm64
|
||||
release-victoria-metrics-arm64 \
|
||||
release-victoria-metrics-darwin-amd64 \
|
||||
release-victoria-metrics-darwin-arm64
|
||||
|
||||
release-victoria-metrics-amd64:
|
||||
GOARCH=amd64 $(MAKE) release-victoria-metrics-generic
|
||||
OSARCH=amd64 $(MAKE) release-victoria-metrics-generic
|
||||
|
||||
release-victoria-metrics-arm:
|
||||
GOARCH=arm $(MAKE) release-victoria-metrics-generic
|
||||
OSARCH=arm $(MAKE) release-victoria-metrics-generic
|
||||
|
||||
release-victoria-metrics-arm64:
|
||||
GOARCH=arm64 $(MAKE) release-victoria-metrics-generic
|
||||
OSARCH=arm64 $(MAKE) release-victoria-metrics-generic
|
||||
|
||||
release-victoria-metrics-generic: victoria-metrics-$(GOARCH)-prod
|
||||
release-victoria-metrics-darwin-amd64:
|
||||
OSARCH=darwin-amd64 $(MAKE) release-victoria-metrics-generic
|
||||
|
||||
release-victoria-metrics-darwin-arm64:
|
||||
OSARCH=darwin-arm64 $(MAKE) release-victoria-metrics-generic
|
||||
|
||||
release-victoria-metrics-generic: victoria-metrics-$(OSARCH)-prod
|
||||
cd bin && \
|
||||
tar --transform="flags=r;s|-$(GOARCH)||" -czf victoria-metrics-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
victoria-metrics-$(GOARCH)-prod \
|
||||
&& sha256sum victoria-metrics-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
victoria-metrics-$(GOARCH)-prod \
|
||||
| sed s/-$(GOARCH)-prod/-prod/ > victoria-metrics-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
tar --transform="flags=r;s|-$(OSARCH)||" -czf victoria-metrics-$(OSARCH)-$(PKG_TAG).tar.gz \
|
||||
victoria-metrics-$(OSARCH)-prod \
|
||||
&& sha256sum victoria-metrics-$(OSARCH)-$(PKG_TAG).tar.gz \
|
||||
victoria-metrics-$(OSARCH)-prod \
|
||||
| sed s/-$(OSARCH)-prod/-prod/ > victoria-metrics-$(OSARCH)-$(PKG_TAG)_checksums.txt
|
||||
|
||||
release-vmutils: \
|
||||
release-vmutils-amd64 \
|
||||
release-vmutils-arm64 \
|
||||
release-vmutils-arm \
|
||||
release-vmutils-darwin-amd64 \
|
||||
release-vmutils-darwin-arm64 \
|
||||
release-vmutils-windows-amd64
|
||||
|
||||
release-vmutils-amd64:
|
||||
GOARCH=amd64 $(MAKE) release-vmutils-generic
|
||||
OSARCH=amd64 $(MAKE) release-vmutils-generic
|
||||
|
||||
release-vmutils-arm64:
|
||||
GOARCH=arm64 $(MAKE) release-vmutils-generic
|
||||
OSARCH=arm64 $(MAKE) release-vmutils-generic
|
||||
|
||||
release-vmutils-arm:
|
||||
GOARCH=arm $(MAKE) release-vmutils-generic
|
||||
OSARCH=arm $(MAKE) release-vmutils-generic
|
||||
|
||||
release-vmutils-darwin-amd64:
|
||||
OSARCH=darwin-amd64 $(MAKE) release-vmutils-generic
|
||||
|
||||
release-vmutils-darwin-arm64:
|
||||
OSARCH=darwin-arm64 $(MAKE) release-vmutils-generic
|
||||
|
||||
release-vmutils-windows-amd64:
|
||||
GOARCH=amd64 $(MAKE) release-vmutils-windows-generic
|
||||
|
||||
release-vmutils-generic: \
|
||||
vmagent-$(GOARCH)-prod \
|
||||
vmalert-$(GOARCH)-prod \
|
||||
vmauth-$(GOARCH)-prod \
|
||||
vmbackup-$(GOARCH)-prod \
|
||||
vmrestore-$(GOARCH)-prod \
|
||||
vmctl-$(GOARCH)-prod
|
||||
vmagent-$(OSARCH)-prod \
|
||||
vmalert-$(OSARCH)-prod \
|
||||
vmauth-$(OSARCH)-prod \
|
||||
vmbackup-$(OSARCH)-prod \
|
||||
vmrestore-$(OSARCH)-prod \
|
||||
vmctl-$(OSARCH)-prod
|
||||
cd bin && \
|
||||
tar --transform="flags=r;s|-$(GOARCH)||" -czf vmutils-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
vmagent-$(GOARCH)-prod \
|
||||
vmalert-$(GOARCH)-prod \
|
||||
vmauth-$(GOARCH)-prod \
|
||||
vmbackup-$(GOARCH)-prod \
|
||||
vmrestore-$(GOARCH)-prod \
|
||||
vmctl-$(GOARCH)-prod \
|
||||
&& sha256sum vmutils-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
vmagent-$(GOARCH)-prod \
|
||||
vmalert-$(GOARCH)-prod \
|
||||
vmauth-$(GOARCH)-prod \
|
||||
vmbackup-$(GOARCH)-prod \
|
||||
vmrestore-$(GOARCH)-prod \
|
||||
vmctl-$(GOARCH)-prod \
|
||||
| sed s/-$(GOARCH)-prod/-prod/ > vmutils-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
tar --transform="flags=r;s|-$(OSARCH)||" -czf vmutils-$(OSARCH)-$(PKG_TAG).tar.gz \
|
||||
vmagent-$(OSARCH)-prod \
|
||||
vmalert-$(OSARCH)-prod \
|
||||
vmauth-$(OSARCH)-prod \
|
||||
vmbackup-$(OSARCH)-prod \
|
||||
vmrestore-$(OSARCH)-prod \
|
||||
vmctl-$(OSARCH)-prod \
|
||||
&& sha256sum vmutils-$(OSARCH)-$(PKG_TAG).tar.gz \
|
||||
vmagent-$(OSARCH)-prod \
|
||||
vmalert-$(OSARCH)-prod \
|
||||
vmauth-$(OSARCH)-prod \
|
||||
vmbackup-$(OSARCH)-prod \
|
||||
vmrestore-$(OSARCH)-prod \
|
||||
vmctl-$(OSARCH)-prod \
|
||||
| sed s/-$(OSARCH)-prod/-prod/ > vmutils-$(OSARCH)-$(PKG_TAG)_checksums.txt
|
||||
|
||||
release-vmutils-windows-generic: \
|
||||
vmagent-windows-$(GOARCH)-prod \
|
||||
@@ -267,7 +283,7 @@ golangci-lint: install-golangci-lint
|
||||
golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m
|
||||
|
||||
install-golangci-lint:
|
||||
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.43.0
|
||||
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.45.1
|
||||
|
||||
install-wwhrd:
|
||||
which wwhrd || GO111MODULE=off go get github.com/frapposelli/wwhrd
|
||||
|
||||
@@ -27,6 +27,12 @@ victoria-metrics-ppc64le-prod:
|
||||
victoria-metrics-386-prod:
|
||||
APP_NAME=victoria-metrics $(MAKE) app-via-docker-386
|
||||
|
||||
victoria-metrics-darwin-amd64-prod:
|
||||
APP_NAME=victoria-metrics $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
victoria-metrics-darwin-arm64-prod:
|
||||
APP_NAME=victoria-metrics $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
package-victoria-metrics:
|
||||
APP_NAME=victoria-metrics $(MAKE) package-via-docker
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ var (
|
||||
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Leave only the first sample in every time series per each discrete interval "+
|
||||
"equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/#deduplication and https://docs.victoriametrics.com/#downsampling")
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only -promscrape.config and then exit. "+
|
||||
"Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse")
|
||||
"Unknown config entries aren't allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse=false command-line flag")
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -86,6 +86,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
if r.Method != "GET" {
|
||||
return false
|
||||
}
|
||||
w.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
fmt.Fprintf(w, "<h2>Single-node VictoriaMetrics</h2></br>")
|
||||
fmt.Fprintf(w, "See docs at <a href='https://docs.victoriametrics.com/'>https://docs.victoriametrics.com/</a></br>")
|
||||
fmt.Fprintf(w, "Useful endpoints:</br>")
|
||||
|
||||
@@ -27,6 +27,15 @@ vmagent-ppc64le-prod:
|
||||
vmagent-386-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-386
|
||||
|
||||
vmagent-darwin-amd64-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
vmagent-darwin-arm64-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
vmagent-windows-amd64-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
package-vmagent:
|
||||
APP_NAME=vmagent $(MAKE) package-via-docker
|
||||
|
||||
@@ -81,6 +90,3 @@ vmagent-pure:
|
||||
|
||||
vmagent-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmagent $(MAKE) app-local-windows-with-goarch
|
||||
|
||||
vmagent-windows-amd64-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
@@ -6,14 +6,12 @@ or any other Prometheus-compatible storage systems that support the `remote_writ
|
||||
|
||||
<img alt="vmagent" src="vmagent.png">
|
||||
|
||||
|
||||
## Motivation
|
||||
|
||||
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
|
||||
and RAM friendly to scrape metrics from Prometheus-compatible exporters into VictoriaMetrics.
|
||||
Also, we found that our user's infrastructure are like snowflakes in that no two are alike. Therefore we decided to add more flexibility
|
||||
to `vmagent` such as the ability to push metrics instead of pulling them. We did our best and will continue to improve vmagent.
|
||||
|
||||
to `vmagent` such as the ability to push metrics additionally to pulling them. We did our best and will continue to improve `vmagent`.
|
||||
|
||||
## Features
|
||||
|
||||
@@ -46,7 +44,7 @@ to `vmagent` such as the ability to push metrics instead of pulling them. We did
|
||||
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
|
||||
and configure the following flags to the `vmagent` binary in order to start scraping Prometheus targets:
|
||||
|
||||
* `-promscrape.config` with the path to Prometheus config file (usually located at `/etc/prometheus/prometheus.yml`). The path can point either to local file or to http url.
|
||||
* `-promscrape.config` with the path to Prometheus config file (usually located at `/etc/prometheus/prometheus.yml`). The path can point either to local file or to http url. `vmagent` doesn't support some sections of Prometheus config file, so you may need either to delete these sections or to run `vmagent` with `-promscrape.config.strictParse=false` additional command-line flag, so `vmagent` will ignore unsupported sections. See [the list of unsupported sections](#unsupported-prometheus-config-sections).
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics, the `-remoteWrite.url` argument can be specified multiple times to replicate data concurrently to an arbitrary number of remote storage systems.
|
||||
|
||||
Example command line:
|
||||
@@ -67,7 +65,6 @@ Then send InfluxDB data to `http://vmagent-host:8429`. See [these docs](https://
|
||||
|
||||
Pass `-help` to `vmagent` in order to see [the full list of supported command-line flags with their descriptions](#advanced-usage).
|
||||
|
||||
|
||||
## Configuration update
|
||||
|
||||
`vmagent` should be restarted in order to update config options set via command-line args.
|
||||
@@ -75,6 +72,7 @@ Pass `-help` to `vmagent` in order to see [the full list of supported command-li
|
||||
`vmagent` supports multiple approaches for reloading configs from updated config files such as `-promscrape.config`, `-remoteWrite.relabelConfig` and `-remoteWrite.urlRelabelConfig`:
|
||||
|
||||
* Sending `SUGHUP` signal to `vmagent` process:
|
||||
|
||||
```bash
|
||||
kill -SIGHUP `pidof vmagent`
|
||||
```
|
||||
@@ -83,10 +81,8 @@ Pass `-help` to `vmagent` in order to see [the full list of supported command-li
|
||||
|
||||
There is also `-promscrape.configCheckInterval` command-line option, which can be used for automatic reloading configs from updated `-promscrape.config` file.
|
||||
|
||||
|
||||
## Use cases
|
||||
|
||||
|
||||
### IoT and Edge monitoring
|
||||
|
||||
`vmagent` can run and collect metrics in IoT and industrial networks with unreliable or scheduled connections to their remote storage.
|
||||
@@ -97,28 +93,24 @@ The maximum buffer size can be limited with `-remoteWrite.maxDiskUsagePerURL`.
|
||||
`vmagent` works on various architectures from the IoT world - 32-bit arm, 64-bit arm, ppc64, 386, amd64.
|
||||
See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/Makefile) for details.
|
||||
|
||||
|
||||
### Drop-in replacement for Prometheus
|
||||
|
||||
If you use Prometheus only for scraping metrics from various targets and forwarding those metrics to remote storage
|
||||
then `vmagent` can replace Prometheus. Typically, `vmagent` requires lower amounts of RAM, CPU and network bandwidth compared with Prometheus.
|
||||
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
|
||||
|
||||
|
||||
### Replication and high availability
|
||||
|
||||
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
|
||||
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instance.
|
||||
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again and then it sends the buffered data to the remote storage in order to prevent data gaps.
|
||||
|
||||
|
||||
### Relabeling and filtering
|
||||
|
||||
`vmagent` can add, remove or update labels on the collected data before sending it to the remote storage. Additionally,
|
||||
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
Please see [these docs](#relabeling) for details.
|
||||
|
||||
|
||||
### Splitting data streams among multiple systems
|
||||
|
||||
`vmagent` supports splitting the collected data between muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
@@ -126,7 +118,6 @@ which is applied independently for each configured `-remoteWrite.url` destinatio
|
||||
data among long-term remote storage, short-term remote storage and a real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
|
||||
Note that each destination can receive it's own subset of the collected data due to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
|
||||
|
||||
|
||||
### Prometheus remote_write proxy
|
||||
|
||||
`vmagent` can be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via the `remote_write` API
|
||||
@@ -134,7 +125,6 @@ at the`/api/v1/write` endpoint. Then apply relabeling and filtering and proxy it
|
||||
The `vmagent` can be configured to encrypt the incoming `remote_write` requests with `-tls*` command-line flags.
|
||||
Also, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
|
||||
|
||||
|
||||
### remote_write for clustered version
|
||||
|
||||
While `vmagent` can accept data in several supported protocols (OpenTSDB, Influx, Prometheus, Graphite) and scrape data from various targets, writes are always peformed in Promethes remote_write protocol. Therefore for the [clustered version](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html), `-remoteWrite.url` the command-line flag should be configured as `<schema>://<vminsert-host>:8480/insert/<accountID>/prometheus/api/v1/write` according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format). There is also support for multitenant writes. See [these docs](#multitenancy).
|
||||
@@ -143,7 +133,6 @@ While `vmagent` can accept data in several supported protocols (OpenTSDB, Influx
|
||||
|
||||
By default `vmagent` collects the data without tenant identifiers and routes it to the configured `-remoteWrite.url`. But it can accept multitenant data if `-remoteWrite.multitenantURL` is set. In this case it accepts multitenant data at `http://vmagent:8429/insert/<accountID>/...` in the same way as cluster version of VictoriaMetrics does according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format) and routes it to `<-remoteWrite.multitenantURL>/insert/<accountID>/prometheus/api/v1/write`. If multiple `-remoteWrite.multitenantURL` command-line options are set, then `vmagent` replicates the collected data across all the configured urls. This allows using a single `vmagent` instance in front of VictoriaMetrics clusters for processing the data from all the tenants.
|
||||
|
||||
|
||||
## How to collect metrics in Prometheus format
|
||||
|
||||
Specify the path to `prometheus.yml` file via `-promscrape.config` command-line flag. `vmagent` takes into account the following
|
||||
@@ -211,7 +200,6 @@ entries to 60s. Run `vmagent -help` in order to see default values for the `-pro
|
||||
|
||||
The file pointed by `-promscrape.config` may contain `%{ENV_VAR}` placeholders which are substituted by the corresponding `ENV_VAR` environment variable values.
|
||||
|
||||
|
||||
## Loading scrape configs from multiple files
|
||||
|
||||
`vmagent` supports loading scrape configs from multiple files specified in the `scrape_config_files` section of `-promscrape.config` file. For example, the following `-promscrape.config` instructs `vmagent` loading scrape configs from all the `*.yml` files under `configs` directory, from `single_scrape_config.yml` local file and from `https://config-server/scrape_config.yml` url:
|
||||
@@ -236,6 +224,17 @@ Every referred file can contain arbitrary number of [supported scrape configs](#
|
||||
|
||||
`vmagent` dynamically reloads these files on `SIGHUP` signal or on the request to `http://vmagent:8429/-/reload`.
|
||||
|
||||
## Unsupported Prometheus config sections
|
||||
|
||||
`vmagent` doesn't support the following sections in Prometheus config file passed to `-promscrape.config` command-line flag:
|
||||
|
||||
* [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). This section is substituted with various `-remoteWrite*` command-line flags. See [the full list of flags](#advanced-usage). The `remote_write` section isn't supported in order to reduce possible confusion when `vmagent` is used for accepting incoming metrics via push protocols such as InfluxDB, Graphite, OpenTSDB, DataDog, etc. In this case the `-promscrape.config` file isn't needed. See [these docs](#features) for details.
|
||||
* `remote_read`. This section isn't supported at all.
|
||||
* `rule_files` and `alerting`. These sections are supported by [vmalert](https://docs.victoriametrics.com/vmalert.html).
|
||||
|
||||
The list of supported service discovery types is available [here](#how-to-collect-metrics-in-prometheus-format).
|
||||
|
||||
Additionally `vmagent` doesn't support `refresh_interval` option at service discovery sections. This option is substituted with `-promscrape.*CheckInterval` command-line options, which are specific per each service discovery type. See [the full list of command-line flags for vmagent](#advanced-usage).
|
||||
|
||||
## Adding labels to metrics
|
||||
|
||||
@@ -248,10 +247,9 @@ Labels can be added to metrics by the following mechanisms:
|
||||
/path/to/vmagent -remoteWrite.label=datacenter=foobar ...
|
||||
```
|
||||
|
||||
|
||||
## Relabeling
|
||||
|
||||
`vmagent` and VictoriaMetrics support Prometheus-compatible relabeling.
|
||||
VictoriaMetrics components (including `vmagent`) support Prometheus-compatible relabeling.
|
||||
They provide the following additional actions on top of actions from the [Prometheus relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config):
|
||||
|
||||
* `replace_all`: replaces all of the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the results in the `target_label`.
|
||||
@@ -276,6 +274,21 @@ The `regex` value can be split into multiple lines for improved readability and
|
||||
- "foo_.+"
|
||||
```
|
||||
|
||||
VictoriaMetrics components support an optional `if` filter, which can be used for conditional relabeling. The `if` filter may contain arbitrary [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors). For example, the following relabeling rule drops targets, which don't match `foo{bar="baz"}` series selector:
|
||||
|
||||
```yaml
|
||||
- action: keep
|
||||
if: 'foo{bar="baz"}'
|
||||
```
|
||||
|
||||
This is equivalent to less clear traditional relabeling rule:
|
||||
|
||||
```yaml
|
||||
- action: keep
|
||||
source_labels: [__name__, bar]
|
||||
regex: 'foo;baz'
|
||||
```
|
||||
|
||||
The relabeling can be defined in the following places:
|
||||
|
||||
* At the `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels. This relabeling can be debugged by passing `relabel_debug: true` option to the corresponding `scrape_config` section. In this case `vmagent` logs target labels before and after the relabeling and then drops the logged target.
|
||||
@@ -292,7 +305,6 @@ You can read more about relabeling in the following articles:
|
||||
* [Extracting labels from legacy metric names](https://www.robustperception.io/extracting-labels-from-legacy-metric-names)
|
||||
* [relabel_configs vs metric_relabel_configs](https://www.robustperception.io/relabel_configs-vs-metric_relabel_configs)
|
||||
|
||||
|
||||
## Prometheus staleness markers
|
||||
|
||||
`vmagent` sends [Prometheus staleness markers](https://www.robustperception.io/staleness-and-promql) to `-remoteWrite.url` in the following cases:
|
||||
@@ -301,20 +313,18 @@ You can read more about relabeling in the following articles:
|
||||
* If the metric disappears from the list of scraped metrics, then stale marker is sent to this particular metric.
|
||||
* If the scrape target becomes temporarily unavailable, then stale markers are sent for all the metrics scraped from this target.
|
||||
* If the scrape target is removed from the list of targets, then stale markers are sent for all the metrics scraped from this target.
|
||||
* Stale markers are sent for all the scraped metrics on graceful shutdown of `vmagent`.
|
||||
|
||||
Prometheus staleness markers' tracking needs additional memory, since it must store the previous response body per each scrape target in order to compare it to the current response body. The memory usage may be reduced by passing `-promscrape.noStaleMarkers` command-line flag to `vmagent`. This disables staleness tracking. This also disables tracking the number of new time series per each scrape with the auto-generated `scrape_series_added` metric. See [these docs](https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series) for details.
|
||||
|
||||
|
||||
## Stream parsing mode
|
||||
|
||||
By default `vmagent` reads the full response body from scrape target into memory, then parses it, applies [relabeling](#relabeling) and then pushes the resulting metrics to the configured `-remoteWrite.url`. This mode works good for the majority of cases when the scrape target exposes small number of metrics (e.g. less than 10 thousand). But this mode may take big amounts of memory when the scrape target exposes big number of metrics. In this case it is recommended enabling stream parsing mode. When this mode is enabled, then `vmagent` reads response from scrape target in chunks, then immediately processes every chunk and pushes the processed metrics to remote storage. This allows saving memory when scraping targets that expose millions of metrics.
|
||||
|
||||
Stream parsing mode is automatically enabled for scrape targets returning response bodies with sizes bigger than the `-promscrape.minResponseSizeForStreamParse` command-line flag value. Additionally, the stream parsing mode can be explicitly enabled in the following places:
|
||||
|
||||
- Via `-promscrape.streamParse` command-line flag. In this case all the scrape targets defined in the file pointed by `-promscrape.config` are scraped in stream parsing mode.
|
||||
- Via `stream_parse: true` option at `scrape_configs` section. In this case all the scrape targets defined in this section are scraped in stream parsing mode.
|
||||
- Via `__stream_parse__=true` label, which can be set via [relabeling](#relabeling) at `relabel_configs` section. In this case stream parsing mode is enabled for the corresponding scrape targets. Typical use case: to set the label via [Kubernetes annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) for targets exposing big number of metrics.
|
||||
* Via `-promscrape.streamParse` command-line flag. In this case all the scrape targets defined in the file pointed by `-promscrape.config` are scraped in stream parsing mode.
|
||||
* Via `stream_parse: true` option at `scrape_configs` section. In this case all the scrape targets defined in this section are scraped in stream parsing mode.
|
||||
* Via `__stream_parse__=true` label, which can be set via [relabeling](#relabeling) at `relabel_configs` section. In this case stream parsing mode is enabled for the corresponding scrape targets. Typical use case: to set the label via [Kubernetes annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) for targets exposing big number of metrics.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -334,7 +344,6 @@ scrape_configs:
|
||||
|
||||
Note that `sample_limit` and `series_limit` options cannot be used in stream parsing mode because the parsed data is pushed to remote storage as soon as it is parsed.
|
||||
|
||||
|
||||
## Scraping big number of targets
|
||||
|
||||
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
|
||||
@@ -362,7 +371,6 @@ start a cluster of three `vmagent` instances, where each target is scraped by tw
|
||||
If each target is scraped by multiple `vmagent` instances, then data deduplication must be enabled at remote storage pointed by `-remoteWrite.url`.
|
||||
See [these docs](https://docs.victoriametrics.com/#deduplication) for details.
|
||||
|
||||
|
||||
## Scraping targets via a proxy
|
||||
|
||||
`vmagent` supports scraping targets via http, https and socks5 proxies. Proxy address must be specified in `proxy_url` option. For example, the following scrape config instructs
|
||||
@@ -402,9 +410,9 @@ scrape_configs:
|
||||
|
||||
By default `vmagent` doesn't limit the number of time series each scrape target can expose. The limit can be enforced in the following places:
|
||||
|
||||
- Via `-promscrape.seriesLimitPerTarget` command-line option. This limit is applied individually to all the scrape targets defined in the file pointed by `-promscrape.config`.
|
||||
- Via `series_limit` config option at `scrape_config` section. This limit is applied individually to all the scrape targets defined in the given `scrape_config`.
|
||||
- Via `__series_limit__` label, which can be set with [relabeling](#relabeling) at `relabel_configs` section. This limit is applied to the corresponding scrape targets. Typical use case: to set the limit via [Kubernetes annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) for targets, which may expose too high number of time series.
|
||||
* Via `-promscrape.seriesLimitPerTarget` command-line option. This limit is applied individually to all the scrape targets defined in the file pointed by `-promscrape.config`.
|
||||
* Via `series_limit` config option at `scrape_config` section. This limit is applied individually to all the scrape targets defined in the given `scrape_config`.
|
||||
* Via `__series_limit__` label, which can be set with [relabeling](#relabeling) at `relabel_configs` section. This limit is applied to the corresponding scrape targets. Typical use case: to set the limit via [Kubernetes annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) for targets, which may expose too high number of time series.
|
||||
|
||||
All the scraped metrics are dropped for time series exceeding the given limit. The exceeded limit can be [monitored](#monitoring) via `promscrape_series_limit_rows_dropped_total` metric.
|
||||
|
||||
@@ -424,7 +432,6 @@ The exceeded limits can be [monitored](#monitoring) with the following metrics:
|
||||
|
||||
These limits are approximate, so `vmagent` can underflow/overflow the limit by a small percentage (usually less than 1%).
|
||||
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. We recommend setting up regular scraping of this page
|
||||
@@ -443,7 +450,6 @@ This information may be useful for debugging target relabeling.
|
||||
* `http://vmagent-host:8429/ready`. This handler returns http 200 status code when `vmagent` finishes it's initialization for all service_discovery configs.
|
||||
It may be useful to perform `vmagent` rolling update without any scrape loss.
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
* We recommend you [set up the official Grafana dashboard](#monitoring) in order to monitor the state of `vmagent'.
|
||||
@@ -507,12 +513,14 @@ It may be useful to perform `vmagent` rolling update without any scrape loss.
|
||||
See the available options below if you prefer fixing the root cause of the error:
|
||||
|
||||
The following relabeling rule may be added to `relabel_configs` section in order to filter out pods with unneeded ports:
|
||||
|
||||
```yml
|
||||
- action: keep_if_equal
|
||||
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
|
||||
```
|
||||
|
||||
The following relabeling rule may be added to `relabel_configs` section in order to filter out init container pods:
|
||||
|
||||
```yml
|
||||
- action: drop
|
||||
source_labels: [__meta_kubernetes_pod_container_init]
|
||||
@@ -521,17 +529,16 @@ It may be useful to perform `vmagent` rolling update without any scrape loss.
|
||||
|
||||
## Kafka integration
|
||||
|
||||
[Enterprise version](https://victoriametrics.com/enterprise.html) of `vmagent` can read and write metrics from / to Kafka:
|
||||
[Enterprise version](https://victoriametrics.com/products/enterprise/) of `vmagent` can read and write metrics from / to Kafka:
|
||||
|
||||
* [Reading metrics from Kafka](#reading-metrics-from-kafka)
|
||||
* [Writing metrics to Kafka](#writing-metrics-to-kafka)
|
||||
|
||||
The enterprise version of vmagent is available for evaluation at [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page in `vmutils-*-enteprise.tar.gz` archives and in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
|
||||
|
||||
|
||||
### Reading metrics from Kafka
|
||||
|
||||
[Enterprise version](https://victoriametrics.com/enterprise.html) of `vmagent` can read metrics in various formats from Kafka messages. These formats can be configured with `-kafka.consumer.topic.defaultFormat` or `-kafka.consumer.topic.format` command-line options. The following formats are supported:
|
||||
[Enterprise version](https://victoriametrics.com/products/enterprise/) of `vmagent` can read metrics in various formats from Kafka messages. These formats can be configured with `-kafka.consumer.topic.defaultFormat` or `-kafka.consumer.topic.format` command-line options. The following formats are supported:
|
||||
|
||||
* `promremotewrite` - [Prometheus remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). Messages in this format can be sent by vmagent - see [these docs](#writing-metrics-to-kafka).
|
||||
* `influx` - [InfluxDB line protocol format](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/).
|
||||
@@ -564,10 +571,9 @@ topic = "influx"
|
||||
data_format = "influx"
|
||||
```
|
||||
|
||||
|
||||
#### Command-line flags for Kafka consumer
|
||||
|
||||
These command-line flags are available only in [enterprise](https://victoriametrics.com/enterprise.html) version of `vmagent`, which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page (see `vmutils-*-enteprise.tar.gz` archives) and from [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
|
||||
These command-line flags are available only in [enterprise](https://victoriametrics.com/products/enterprise/) version of `vmagent`, which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page (see `vmutils-*-enteprise.tar.gz` archives) and from [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
|
||||
|
||||
```
|
||||
-kafka.consumer.topic array
|
||||
@@ -600,11 +606,10 @@ These command-line flags are available only in [enterprise](https://victoriametr
|
||||
|
||||
### Writing metrics to Kafka
|
||||
|
||||
[Enterprise version](https://victoriametrics.com/enterprise.html) of `vmagent` writes data to Kafka with `at-least-once` semantics if `-remoteWrite.url` contains e.g. Kafka url. For example, if `vmagent` is started with `-remoteWrite.url=kafka://localhost:9092/?topic=prom-rw`, then it would send Prometheus remote_write messages to Kafka bootstrap server at `localhost:9092` with the topic `prom-rw`. These messages can be read later from Kafka by another `vmagent` - see [these docs](#reading-metrics-from-kafka) for details.
|
||||
[Enterprise version](https://victoriametrics.com/products/enterprise/) of `vmagent` writes data to Kafka with `at-least-once` semantics if `-remoteWrite.url` contains e.g. Kafka url. For example, if `vmagent` is started with `-remoteWrite.url=kafka://localhost:9092/?topic=prom-rw`, then it would send Prometheus remote_write messages to Kafka bootstrap server at `localhost:9092` with the topic `prom-rw`. These messages can be read later from Kafka by another `vmagent` - see [these docs](#reading-metrics-from-kafka) for details.
|
||||
|
||||
Additional Kafka options can be passed as query params to `-remoteWrite.url`. For instance, `kafka://localhost:9092/?topic=prom-rw&client.id=my-favorite-id` sets `client.id` Kafka option to `my-favorite-id`. The full list of Kafka options is available [here](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md).
|
||||
|
||||
|
||||
#### Kafka broker authorization and authentication
|
||||
|
||||
Two types of auth are supported:
|
||||
@@ -621,15 +626,13 @@ Two types of auth are supported:
|
||||
./bin/vmagent -remoteWrite.url=kafka://localhost:9092/?topic=prom-rw&security.protocol=SSL -remoteWrite.tlsCAFile=/opt/ca.pem -remoteWrite.tlsCertFile=/opt/cert.pem -remoteWrite.tlsKeyFile=/opt/key.pem
|
||||
```
|
||||
|
||||
|
||||
## How to build from sources
|
||||
|
||||
We recommend using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in the `vmutils-*` archives .
|
||||
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.17.
|
||||
2. Run `make vmagent` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds the `vmagent` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -658,7 +661,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.17.
|
||||
2. Run `make vmagent-arm` or `make vmagent-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
It builds `vmagent-arm` or `vmagent-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
@@ -668,28 +671,34 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
2. Run `make vmagent-arm-prod` or `make vmagent-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmagent-arm-prod` or `vmagent-arm64-prod` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
|
||||
## Profiling
|
||||
|
||||
`vmagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
* Memory profile can be collected with the following command:
|
||||
* Memory profile can be collected with the following command (replace `0.0.0.0` with hostname if needed):
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```bash
|
||||
curl -s http://<vmagent-host>:8429/debug/pprof/heap > mem.pprof
|
||||
curl http://0.0.0.0:8429/debug/pprof/heap > mem.pprof
|
||||
```
|
||||
|
||||
* CPU profile can be collected with the following command:
|
||||
</div>
|
||||
|
||||
* CPU profile can be collected with the following command (replace `0.0.0.0` with hostname if needed):
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```bash
|
||||
curl -s http://<vmagent-host>:8429/debug/pprof/profile > cpu.pprof
|
||||
curl http://0.0.0.0:8429/debug/pprof/profile > cpu.pprof
|
||||
```
|
||||
|
||||
</div>
|
||||
|
||||
The command for collecting CPU profile waits for 30 seconds before returning.
|
||||
|
||||
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).
|
||||
|
||||
|
||||
## Advanced usage
|
||||
|
||||
`vmagent` can be fine-tuned with various command-line flags. Run `./vmagent -help` in order to see the full list of these flags with their desciptions and default values:
|
||||
@@ -702,284 +711,314 @@ vmagent collects metrics data via popular data ingestion protocols and routes th
|
||||
See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
|
||||
-configAuthKey string
|
||||
Authorization key for accessing /config page. It must be passed via authKey query arg
|
||||
Authorization key for accessing /config page. It must be passed via authKey query arg
|
||||
-csvTrimTimestamp duration
|
||||
Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
-datadog.maxInsertRequestSize size
|
||||
The maximum size in bytes of a single DataDog POST request to /api/v1/series
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 67108864)
|
||||
The maximum size in bytes of a single DataDog POST request to /api/v1/series
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 67108864)
|
||||
-dryRun
|
||||
Whether to check only config files without running vmagent. The following files are checked: -promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse
|
||||
Whether to check only config files without running vmagent. The following files are checked: -promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . Unknown config entries aren't allowed in -promscrape.config by default. This can be changed by passing -promscrape.config.strictParse=false command-line flag
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-graphiteListenAddr string
|
||||
TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty
|
||||
TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty
|
||||
-graphiteTrimTimestamp duration
|
||||
Trim timestamps for Graphite data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
|
||||
Trim timestamps for Graphite data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
TCP address to listen for http connections. Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. Note that /targets and /metrics pages aren't available if -httpListenAddr='' (default ":8429")
|
||||
TCP address to listen for http connections. Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. Note that /targets and /metrics pages aren't available if -httpListenAddr='' (default ":8429")
|
||||
-import.maxLineLen size
|
||||
The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with 'max_rows_per_line' query arg passed to /api/v1/export
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 104857600)
|
||||
The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with 'max_rows_per_line' query arg passed to /api/v1/export
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 104857600)
|
||||
-influx.databaseNames array
|
||||
Comma-separated list of database names to return from /query and /influx/query API. This can be needed for accepting data from Telegraf plugins such as https://github.com/fangli/fluent-plugin-influxdb
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Comma-separated list of database names to return from /query and /influx/query API. This can be needed for accepting data from Telegraf plugins such as https://github.com/fangli/fluent-plugin-influxdb
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-influx.maxLineSize size
|
||||
The maximum size in bytes for a single InfluxDB line during parsing
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 262144)
|
||||
The maximum size in bytes for a single InfluxDB line during parsing
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 262144)
|
||||
-influxDBLabel string
|
||||
Default label for the DB name sent over '?db={db_name}' query parameter (default "db")
|
||||
-influxListenAddr string
|
||||
TCP and UDP address to listen for InfluxDB line protocol data. Usually :8189 must be set. Doesn't work if empty. This flag isn't needed when ingesting data over HTTP - just send it to http://<vmagent>:8429/write
|
||||
TCP and UDP address to listen for InfluxDB line protocol data. Usually :8189 must be set. Doesn't work if empty. This flag isn't needed when ingesting data over HTTP - just send it to http://<vmagent>:8429/write
|
||||
-influxMeasurementFieldSeparator string
|
||||
Separator for '{measurement}{separator}{field_name}' metric name when inserted via InfluxDB line protocol (default "_")
|
||||
Separator for '{measurement}{separator}{field_name}' metric name when inserted via InfluxDB line protocol (default "_")
|
||||
-influxSkipMeasurement
|
||||
Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'
|
||||
Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'
|
||||
-influxSkipSingleField
|
||||
Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if InfluxDB line contains only a single field
|
||||
Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if InfluxDB line contains only a single field
|
||||
-influxTrimTimestamp duration
|
||||
Trim timestamps for InfluxDB line protocol data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
Trim timestamps for InfluxDB line protocol data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
-insert.maxQueueDuration duration
|
||||
The maximum duration for waiting in the queue for insert requests due to -maxConcurrentInserts (default 1m0s)
|
||||
The maximum duration for waiting in the queue for insert requests due to -maxConcurrentInserts (default 1m0s)
|
||||
-kafka.consumer.topic array
|
||||
Kafka topic names for data consumption.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.basicAuth.password array
|
||||
Optional basic auth password for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN'
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.basicAuth.username array
|
||||
Optional basic auth username for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN'
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.brokers array
|
||||
List of brokers to connect for given topic, e.g. -kafka.consumer.topic.broker=host-1:9092;host-2:9092
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.defaultFormat string
|
||||
Expected data format in the topic if -kafka.consumer.topic.format is skipped. (default "promremotewrite")
|
||||
-kafka.consumer.topic.format array
|
||||
data format for corresponding kafka topic. Valid formats: influx, prometheus, promremotewrite, graphite, jsonline
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.groupID array
|
||||
Defines group.id for topic
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.isGzipped array
|
||||
Enables gzip setting for topic messages payload. Only prometheus, jsonline and influx formats accept gzipped messages.
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.options array
|
||||
Optional key=value;key1=value2 settings for topic consumer. See full configuration options at https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxConcurrentInserts int
|
||||
The maximum number of concurrent inserts. Default value should work for most cases, since it minimizes the overhead for concurrent inserts. This option is tigthly coupled with -insert.maxQueueDuration (default 16)
|
||||
The maximum number of concurrent inserts. Default value should work for most cases, since it minimizes the overhead for concurrent inserts. This option is tigthly coupled with -insert.maxQueueDuration (default 16)
|
||||
-maxInsertRequestSize size
|
||||
The maximum size in bytes of a single Prometheus remote_write API request
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
The maximum size in bytes of a single Prometheus remote_write API request
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-opentsdbHTTPListenAddr string
|
||||
TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty
|
||||
TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty
|
||||
-opentsdbListenAddr string
|
||||
TCP and UDP address to listen for OpentTSDB metrics. Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. Usually :4242 must be set. Doesn't work if empty
|
||||
TCP and UDP address to listen for OpentTSDB metrics. Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. Usually :4242 must be set. Doesn't work if empty
|
||||
-opentsdbTrimTimestamp duration
|
||||
Trim timestamps for OpenTSDB 'telnet put' data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
|
||||
Trim timestamps for OpenTSDB 'telnet put' data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
|
||||
-opentsdbhttp.maxInsertRequestSize size
|
||||
The maximum size of OpenTSDB HTTP put request
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
The maximum size of OpenTSDB HTTP put request
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
-opentsdbhttpTrimTimestamp duration
|
||||
Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-promscrape.cluster.memberNum int
|
||||
The number of number in the cluster of scrapers. It must be an unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster
|
||||
The number of number in the cluster of scrapers. It must be an unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster
|
||||
-promscrape.cluster.membersCount int
|
||||
The number of members in a cluster of scrapers. Each member must have an unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets
|
||||
The number of members in a cluster of scrapers. Each member must have an unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets
|
||||
-promscrape.cluster.replicationFactor int
|
||||
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 2, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
|
||||
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 2, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
|
||||
-promscrape.config string
|
||||
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. The path can point to local file and to http url. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
|
||||
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. The path can point to local file and to http url. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
|
||||
-promscrape.config.dryRun
|
||||
Checks -promscrape.config file for errors and unsupported fields and then exits. Returns non-zero exit code on parsing errors and emits these errors to stderr. See also -promscrape.config.strictParse command-line flag. Pass -loggerLevel=ERROR if you don't need to see info messages in the output.
|
||||
Checks -promscrape.config file for errors and unsupported fields and then exits. Returns non-zero exit code on parsing errors and emits these errors to stderr. See also -promscrape.config.strictParse command-line flag. Pass -loggerLevel=ERROR if you don't need to see info messages in the output.
|
||||
-promscrape.config.strictParse
|
||||
Whether to allow only supported fields in -promscrape.config . By default unsupported fields are silently skipped
|
||||
Whether to deny unsupported fields in -promscrape.config . Set to false in order to silently skip unsupported fields (default true)
|
||||
-promscrape.configCheckInterval duration
|
||||
Interval for checking for changes in '-promscrape.config' file. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
|
||||
Interval for checking for changes in '-promscrape.config' file. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
|
||||
-promscrape.consul.waitTime duration
|
||||
Wait time used by Consul service discovery. Default value is used if not set
|
||||
Wait time used by Consul service discovery. Default value is used if not set
|
||||
-promscrape.consulSDCheckInterval duration
|
||||
Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s)
|
||||
Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s)
|
||||
-promscrape.digitaloceanSDCheckInterval duration
|
||||
Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s)
|
||||
Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s)
|
||||
-promscrape.disableCompression
|
||||
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
|
||||
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
|
||||
-promscrape.disableKeepAlive
|
||||
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
|
||||
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
|
||||
-promscrape.discovery.concurrency int
|
||||
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
|
||||
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
|
||||
-promscrape.discovery.concurrentWaitTime duration
|
||||
The maximum duration for waiting to perform API requests if more than -promscrape.discovery.concurrency requests are simultaneously performed (default 1m0s)
|
||||
The maximum duration for waiting to perform API requests if more than -promscrape.discovery.concurrency requests are simultaneously performed (default 1m0s)
|
||||
-promscrape.dnsSDCheckInterval duration
|
||||
Interval for checking for changes in dns. This works only if dns_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config for details (default 30s)
|
||||
Interval for checking for changes in dns. This works only if dns_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config for details (default 30s)
|
||||
-promscrape.dockerSDCheckInterval duration
|
||||
Interval for checking for changes in docker. This works only if docker_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#docker_sd_config for details (default 30s)
|
||||
Interval for checking for changes in docker. This works only if docker_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#docker_sd_config for details (default 30s)
|
||||
-promscrape.dockerswarmSDCheckInterval duration
|
||||
Interval for checking for changes in dockerswarm. This works only if dockerswarm_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config for details (default 30s)
|
||||
Interval for checking for changes in dockerswarm. This works only if dockerswarm_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config for details (default 30s)
|
||||
-promscrape.dropOriginalLabels
|
||||
Whether to drop original labels for scrape targets at /targets and /api/v1/targets pages. This may be needed for reducing memory usage when original labels for big number of scrape targets occupy big amounts of memory. Note that this reduces debuggability for improper per-target relabeling configs
|
||||
Whether to drop original labels for scrape targets at /targets and /api/v1/targets pages. This may be needed for reducing memory usage when original labels for big number of scrape targets occupy big amounts of memory. Note that this reduces debuggability for improper per-target relabeling configs
|
||||
-promscrape.ec2SDCheckInterval duration
|
||||
Interval for checking for changes in ec2. This works only if ec2_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config for details (default 1m0s)
|
||||
Interval for checking for changes in ec2. This works only if ec2_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config for details (default 1m0s)
|
||||
-promscrape.eurekaSDCheckInterval duration
|
||||
Interval for checking for changes in eureka. This works only if eureka_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config for details (default 30s)
|
||||
Interval for checking for changes in eureka. This works only if eureka_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config for details (default 30s)
|
||||
-promscrape.fileSDCheckInterval duration
|
||||
Interval for checking for changes in 'file_sd_config'. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config for details (default 30s)
|
||||
Interval for checking for changes in 'file_sd_config'. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config for details (default 5m0s)
|
||||
-promscrape.gceSDCheckInterval duration
|
||||
Interval for checking for changes in gce. This works only if gce_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config for details (default 1m0s)
|
||||
Interval for checking for changes in gce. This works only if gce_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config for details (default 1m0s)
|
||||
-promscrape.httpSDCheckInterval duration
|
||||
Interval for checking for changes in http endpoint service discovery. This works only if http_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config for details (default 1m0s)
|
||||
Interval for checking for changes in http endpoint service discovery. This works only if http_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config for details (default 1m0s)
|
||||
-promscrape.kubernetes.apiServerTimeout duration
|
||||
How frequently to reload the full state from Kuberntes API server (default 30m0s)
|
||||
How frequently to reload the full state from Kuberntes API server (default 30m0s)
|
||||
-promscrape.kubernetesSDCheckInterval duration
|
||||
Interval for checking for changes in Kubernetes API server. This works only if kubernetes_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config for details (default 30s)
|
||||
Interval for checking for changes in Kubernetes API server. This works only if kubernetes_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config for details (default 30s)
|
||||
-promscrape.maxDroppedTargets int
|
||||
The maximum number of droppedTargets to show at /api/v1/targets page. Increase this value if your setup drops more scrape targets during relabeling and you need investigating labels for all the dropped targets. Note that the increased number of tracked dropped targets may result in increased memory usage (default 1000)
|
||||
The maximum number of droppedTargets to show at /api/v1/targets page. Increase this value if your setup drops more scrape targets during relabeling and you need investigating labels for all the dropped targets. Note that the increased number of tracked dropped targets may result in increased memory usage (default 1000)
|
||||
-promscrape.maxResponseHeadersSize size
|
||||
The maximum size of http response headers from Prometheus scrape targets
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 4096)
|
||||
The maximum size of http response headers from Prometheus scrape targets
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 4096)
|
||||
-promscrape.maxScrapeSize size
|
||||
The maximum size of scrape response in bytes to process from Prometheus targets. Bigger responses are rejected
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16777216)
|
||||
The maximum size of scrape response in bytes to process from Prometheus targets. Bigger responses are rejected
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16777216)
|
||||
-promscrape.minResponseSizeForStreamParse size
|
||||
The minimum target response size for automatic switching to stream parsing mode, which can reduce memory usage. See https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 1000000)
|
||||
The minimum target response size for automatic switching to stream parsing mode, which can reduce memory usage. See https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 1000000)
|
||||
-promscrape.noStaleMarkers
|
||||
Whether to disable sending Prometheus stale markers for metrics when scrape target disappears. This option may reduce memory usage if stale markers aren't needed for your setup. This option also disables populating the scrape_series_added metric. See https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series
|
||||
Whether to disable sending Prometheus stale markers for metrics when scrape target disappears. This option may reduce memory usage if stale markers aren't needed for your setup. This option also disables populating the scrape_series_added metric. See https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series
|
||||
-promscrape.openstackSDCheckInterval duration
|
||||
Interval for checking for changes in openstack API server. This works only if openstack_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config for details (default 30s)
|
||||
Interval for checking for changes in openstack API server. This works only if openstack_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config for details (default 30s)
|
||||
-promscrape.seriesLimitPerTarget int
|
||||
Optional limit on the number of unique time series a single scrape target can expose. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter for more info
|
||||
Optional limit on the number of unique time series a single scrape target can expose. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter for more info
|
||||
-promscrape.streamParse
|
||||
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is posible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
|
||||
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is posible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
|
||||
-promscrape.suppressDuplicateScrapeTargetErrors
|
||||
Whether to suppress 'duplicate scrape target' errors; see https://docs.victoriametrics.com/vmagent.html#troubleshooting for details
|
||||
Whether to suppress 'duplicate scrape target' errors; see https://docs.victoriametrics.com/vmagent.html#troubleshooting for details
|
||||
-promscrape.suppressScrapeErrors
|
||||
Whether to suppress scrape errors logging. The last error for each target is always available at '/targets' page even if scrape errors logging is suppressed
|
||||
Whether to suppress scrape errors logging. The last error for each target is always available at '/targets' page even if scrape errors logging is suppressed
|
||||
-remoteWrite.basicAuth.password array
|
||||
Optional basic auth password to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional basic auth password to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.basicAuth.passwordFile array
|
||||
Optional path to basic auth password to use for -remoteWrite.url. The file is re-read every second. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to basic auth password to use for -remoteWrite.url. The file is re-read every second. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.basicAuth.username array
|
||||
Optional basic auth username to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional basic auth username to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.bearerToken array
|
||||
Optional bearer auth token to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional bearer auth token to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.bearerTokenFile array
|
||||
Optional path to bearer token file to use for -remoteWrite.url. The token is re-read from the file every second. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to bearer token file to use for -remoteWrite.url. The token is re-read from the file every second. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.flushInterval duration
|
||||
Interval for flushing the data to remote storage. This option takes effect only when less than 10K data points per second are pushed to -remoteWrite.url (default 1s)
|
||||
Interval for flushing the data to remote storage. This option takes effect only when less than 10K data points per second are pushed to -remoteWrite.url (default 1s)
|
||||
-remoteWrite.label array
|
||||
Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. Pass multiple -remoteWrite.label flags in order to add multiple labels to metrics before sending them to remote storage
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. Pass multiple -remoteWrite.label flags in order to add multiple labels to metrics before sending them to remote storage
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.maxBlockSize size
|
||||
The maximum block size to send to remote storage. Bigger blocks may improve performance at the cost of the increased memory usage. See also -remoteWrite.maxRowsPerBlock
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 8388608)
|
||||
The maximum block size to send to remote storage. Bigger blocks may improve performance at the cost of the increased memory usage. See also -remoteWrite.maxRowsPerBlock
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 8388608)
|
||||
-remoteWrite.maxDailySeries int
|
||||
The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
|
||||
The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
|
||||
-remoteWrite.maxDiskUsagePerURL size
|
||||
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500000000. Disk usage is unlimited if the value is set to 0
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500000000. Disk usage is unlimited if the value is set to 0
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-remoteWrite.maxHourlySeries int
|
||||
The maximum number of unique series vmagent can send to remote storage systems during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
|
||||
The maximum number of unique series vmagent can send to remote storage systems during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
|
||||
-remoteWrite.maxRowsPerBlock int
|
||||
The maximum number of samples to send in each block to remote storage. Higher number may improve performance at the cost of the increased memory usage. See also -remoteWrite.maxBlockSize (default 10000)
|
||||
The maximum number of samples to send in each block to remote storage. Higher number may improve performance at the cost of the increased memory usage. See also -remoteWrite.maxBlockSize (default 10000)
|
||||
-remoteWrite.multitenantURL array
|
||||
Base path for multitenant remote storage URL to write data to. See https://docs.victoriametrics.com/vmagent.html#multitenancy for details. Example url: http://<vminsert>:8480 . Pass multiple -remoteWrite.multitenantURL flags in order to replicate data to multiple remote storage systems. See also -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Base path for multitenant remote storage URL to write data to. See https://docs.victoriametrics.com/vmagent.html#multitenancy for details. Example url: http://<vminsert>:8480 . Pass multiple -remoteWrite.multitenantURL flags in order to replicate data to multiple remote storage systems. See also -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.clientID array
|
||||
Optional OAuth2 clientID to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional OAuth2 clientID to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.clientSecret array
|
||||
Optional OAuth2 clientSecret to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional OAuth2 clientSecret to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.clientSecretFile array
|
||||
Optional OAuth2 clientSecretFile to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional OAuth2 clientSecretFile to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.scopes array
|
||||
Optional OAuth2 scopes to use for -remoteWrite.url. Scopes must be delimited by ';'. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional OAuth2 scopes to use for -remoteWrite.url. Scopes must be delimited by ';'. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.tokenUrl array
|
||||
Optional OAuth2 tokenURL to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional OAuth2 tokenURL to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.proxyURL array
|
||||
Optional proxy URL for writing data to -remoteWrite.url. Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional proxy URL for writing data to -remoteWrite.url. Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.queues int
|
||||
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs (default 8)
|
||||
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs (default 8)
|
||||
-remoteWrite.rateLimit array
|
||||
Optional rate limit in bytes per second for data sent to -remoteWrite.url. By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Optional rate limit in bytes per second for data sent to -remoteWrite.url. By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.relabelConfig string
|
||||
Optional path to file with relabel_config entries. The path can point either to local file or to http url. These entries are applied to all the metrics before sending them to -remoteWrite.url. See https://docs.victoriametrics.com/vmagent.html#relabeling for details
|
||||
Optional path to file with relabel_config entries. The path can point either to local file or to http url. These entries are applied to all the metrics before sending them to -remoteWrite.url. See https://docs.victoriametrics.com/vmagent.html#relabeling for details
|
||||
-remoteWrite.relabelDebug
|
||||
Whether to log metrics before and after relabeling with -remoteWrite.relabelConfig. If the -remoteWrite.relabelDebug is enabled, then the metrics aren't sent to remote storage. This is useful for debugging the relabeling configs
|
||||
Whether to log metrics before and after relabeling with -remoteWrite.relabelConfig. If the -remoteWrite.relabelDebug is enabled, then the metrics aren't sent to remote storage. This is useful for debugging the relabeling configs
|
||||
-remoteWrite.roundDigits array
|
||||
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.sendTimeout array
|
||||
Timeout for sending a single block of data to -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Timeout for sending a single block of data to -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.showURL
|
||||
Whether to show -remoteWrite.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
|
||||
Whether to show -remoteWrite.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
|
||||
-remoteWrite.significantFigures array
|
||||
The number of significant figures to leave in metric values before writing them to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
The number of significant figures to leave in metric values before writing them to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsCAFile array
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsCertFile array
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsInsecureSkipVerify array
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsKeyFile array
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsServerName array
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tmpDataPath string
|
||||
Path to directory where temporary data for remote write component is stored. See also -remoteWrite.maxDiskUsagePerURL (default "vmagent-remotewrite-data")
|
||||
Path to directory where temporary data for remote write component is stored. See also -remoteWrite.maxDiskUsagePerURL (default "vmagent-remotewrite-data")
|
||||
-remoteWrite.url array
|
||||
Remote storage URL to write data to. It must support Prometheus remote_write API. It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . Pass multiple -remoteWrite.url flags in order to replicate data to multiple remote storage systems. See also -remoteWrite.multitenantURL
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Remote storage URL to write data to. It must support Prometheus remote_write API. It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . Pass multiple -remoteWrite.url flags in order to replicate data to multiple remote storage systems. See also -remoteWrite.multitenantURL
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.urlRelabelConfig array
|
||||
Optional path to relabel config for the corresponding -remoteWrite.url. The path can point either to local file or to http url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to relabel config for the corresponding -remoteWrite.url. The path can point either to local file or to http url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.urlRelabelDebug array
|
||||
Whether to log metrics before and after relabeling with -remoteWrite.urlRelabelConfig. If the -remoteWrite.urlRelabelDebug is enabled, then the metrics aren't sent to the corresponding -remoteWrite.url. This is useful for debugging the relabeling configs
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Whether to log metrics before and after relabeling with -remoteWrite.urlRelabelConfig. If the -remoteWrite.urlRelabelDebug is enabled, then the metrics aren't sent to the corresponding -remoteWrite.url. This is useful for debugging the relabeling configs
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-sortLabels
|
||||
Whether to sort labels for incoming samples before writing them to all the configured remote storage systems. This may be needed for reducing memory usage at remote storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}Enabled sorting for labels can slow down ingestion performance a bit
|
||||
Whether to sort labels for incoming samples before writing them to all the configured remote storage systems. This may be needed for reducing memory usage at remote storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}Enabled sorting for labels can slow down ingestion performance a bit
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower. The provided certificate file is automatically re-read every second, so it can be dynamically updated
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
Path to file with TLS key. Used only if -tls is set. The provided key file is automatically re-read every second, so it can be dynamically updated
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
@@ -24,6 +24,7 @@ var (
|
||||
measurementFieldSeparator = flag.String("influxMeasurementFieldSeparator", "_", "Separator for '{measurement}{separator}{field_name}' metric name when inserted via InfluxDB line protocol")
|
||||
skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if InfluxDB line contains only a single field")
|
||||
skipMeasurement = flag.Bool("influxSkipMeasurement", false, "Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'")
|
||||
dbLabel = flag.String("influxDBLabel", "db", "Default label for the DB name sent over '?db={db_name}' query parameter")
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -80,7 +81,7 @@ func insertRows(at *auth.Token, db string, rows []parser.Row, extraLabels []prom
|
||||
hasDBKey := false
|
||||
for j := range r.Tags {
|
||||
tag := &r.Tags[j]
|
||||
if tag.Key == "db" {
|
||||
if tag.Key == *dbLabel {
|
||||
hasDBKey = true
|
||||
}
|
||||
commonLabels = append(commonLabels, prompbmarshal.Label{
|
||||
@@ -90,7 +91,7 @@ func insertRows(at *auth.Token, db string, rows []parser.Row, extraLabels []prom
|
||||
}
|
||||
if len(db) > 0 && !hasDBKey {
|
||||
commonLabels = append(commonLabels, prompbmarshal.Label{
|
||||
Name: "db",
|
||||
Name: *dbLabel,
|
||||
Value: db,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -53,7 +53,7 @@ var (
|
||||
configAuthKey = flag.String("configAuthKey", "", "Authorization key for accessing /config page. It must be passed via authKey query arg")
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmagent. The following files are checked: "+
|
||||
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . "+
|
||||
"Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse")
|
||||
"Unknown config entries aren't allowed in -promscrape.config by default. This can be changed by passing -promscrape.config.strictParse=false command-line flag")
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -154,6 +154,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
if r.Method != "GET" {
|
||||
return false
|
||||
}
|
||||
w.Header().Add("Content-Type", "text/html; charset=utf-8")
|
||||
fmt.Fprintf(w, "<h2>vmagent</h2>")
|
||||
fmt.Fprintf(w, "See docs at <a href='https://docs.victoriametrics.com/vmagent.html'>https://docs.victoriametrics.com/vmagent.html</a></br>")
|
||||
fmt.Fprintf(w, "Useful endpoints:</br>")
|
||||
@@ -262,6 +263,14 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
promscrapeTargetsRequests.Inc()
|
||||
promscrape.WriteHumanReadableTargetsStatus(w, r)
|
||||
return true
|
||||
case "/target_response":
|
||||
promscrapeTargetResponseRequests.Inc()
|
||||
if err := promscrape.WriteTargetResponse(w, r); err != nil {
|
||||
promscrapeTargetResponseErrors.Inc()
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
return true
|
||||
case "/config":
|
||||
if *configAuthKey != "" && r.FormValue("authKey") != *configAuthKey {
|
||||
err := &httpserver.ErrorWithStatusCode{
|
||||
@@ -443,6 +452,9 @@ var (
|
||||
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
|
||||
promscrapeAPIV1TargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/targets"}`)
|
||||
|
||||
promscrapeTargetResponseRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/target_response"}`)
|
||||
promscrapeTargetResponseErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/target_response"}`)
|
||||
|
||||
promscrapeConfigRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/config"}`)
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
|
||||
|
||||
@@ -30,8 +30,9 @@ func InsertHandler(at *auth.Token, req *http.Request) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
isGzip := req.Header.Get("Content-Encoding") == "gzip"
|
||||
return writeconcurrencylimiter.Do(func() error {
|
||||
return parser.ParseStream(req, func(block *parser.Block) error {
|
||||
return parser.ParseStream(req.Body, isGzip, func(block *parser.Block) error {
|
||||
return insertRows(at, block, extraLabels)
|
||||
})
|
||||
})
|
||||
|
||||
@@ -92,9 +92,9 @@ func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persiste
|
||||
}
|
||||
tlsCfg := authCfg.NewTLSConfig()
|
||||
tr := &http.Transport{
|
||||
Dial: statDial,
|
||||
DialContext: statDial,
|
||||
TLSClientConfig: tlsCfg,
|
||||
TLSHandshakeTimeout: 5 * time.Second,
|
||||
TLSHandshakeTimeout: 10 * time.Second,
|
||||
MaxConnsPerHost: 2 * concurrency,
|
||||
MaxIdleConnsPerHost: 2 * concurrency,
|
||||
IdleConnTimeout: time.Minute,
|
||||
@@ -295,6 +295,17 @@ again:
|
||||
}
|
||||
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.sanitizedURL, statusCode)).Inc()
|
||||
if statusCode == 409 || statusCode == 400 {
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
_ = resp.Body.Close()
|
||||
l := logger.WithThrottler("remoteWriteRejected", 5*time.Second)
|
||||
if err != nil {
|
||||
l.Errorf("sending a block with size %d bytes to %q was rejected (skipping the block): status code %d; "+
|
||||
"failed to read response body: %s",
|
||||
len(block), c.sanitizedURL, statusCode, err)
|
||||
} else {
|
||||
l.Errorf("sending a block with size %d bytes to %q was rejected (skipping the block): status code %d; response body: %s",
|
||||
len(block), c.sanitizedURL, statusCode, string(body))
|
||||
}
|
||||
// Just drop block on 409 and 400 status codes like Prometheus does.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/873
|
||||
// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
@@ -210,7 +211,22 @@ func pushWriteRequest(wr *prompbmarshal.WriteRequest, pushBlock func(block []byt
|
||||
writeRequestBufPool.Put(bb)
|
||||
}
|
||||
|
||||
// Too big block. Recursively split it into smaller parts.
|
||||
// Too big block. Recursively split it into smaller parts if possible.
|
||||
if len(wr.Timeseries) == 1 {
|
||||
// A single time series left. Recursively split its samples into smaller parts if possible.
|
||||
samples := wr.Timeseries[0].Samples
|
||||
if len(samples) == 1 {
|
||||
logger.Warnf("dropping a sample for metric with too long labels exceeding -remoteWrite.maxBlockSize=%d bytes", maxUnpackedBlockSize.N)
|
||||
return
|
||||
}
|
||||
n := len(samples) / 2
|
||||
wr.Timeseries[0].Samples = samples[:n]
|
||||
pushWriteRequest(wr, pushBlock)
|
||||
wr.Timeseries[0].Samples = samples[n:]
|
||||
pushWriteRequest(wr, pushBlock)
|
||||
wr.Timeseries[0].Samples = samples
|
||||
return
|
||||
}
|
||||
timeseries := wr.Timeseries
|
||||
n := len(timeseries) / 2
|
||||
wr.Timeseries = timeseries[:n]
|
||||
|
||||
@@ -390,6 +390,8 @@ var labelsHashBufPool bytesutil.ByteBufferPool
|
||||
func logSkippedSeries(labels []prompbmarshal.Label, flagName string, flagValue int) {
|
||||
select {
|
||||
case <-logSkippedSeriesTicker.C:
|
||||
// Do not use logger.WithThrottler() here, since this will increase CPU usage
|
||||
// because every call to logSkippedSeries will result to a call to labelsToString.
|
||||
logger.Warnf("skip series %s because %s=%d reached", labelsToString(labels), flagName, flagValue)
|
||||
default:
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
@@ -9,9 +11,26 @@ import (
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
func statDial(networkUnused, addr string) (conn net.Conn, err error) {
|
||||
func getStdDialer() *net.Dialer {
|
||||
stdDialerOnce.Do(func() {
|
||||
stdDialer = &net.Dialer{
|
||||
Timeout: 30 * time.Second,
|
||||
KeepAlive: 30 * time.Second,
|
||||
DualStack: netutil.TCP6Enabled(),
|
||||
}
|
||||
})
|
||||
return stdDialer
|
||||
}
|
||||
|
||||
var (
|
||||
stdDialer *net.Dialer
|
||||
stdDialerOnce sync.Once
|
||||
)
|
||||
|
||||
func statDial(ctx context.Context, networkUnused, addr string) (conn net.Conn, err error) {
|
||||
network := netutil.GetTCPNetwork()
|
||||
conn, err = net.DialTimeout(network, addr, 5*time.Second)
|
||||
d := getStdDialer()
|
||||
conn, err = d.DialContext(ctx, network, addr)
|
||||
dialsTotal.Inc()
|
||||
if err != nil {
|
||||
dialErrors.Inc()
|
||||
|
||||
@@ -27,6 +27,15 @@ vmalert-ppc64le-prod:
|
||||
vmalert-386-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-386
|
||||
|
||||
vmalert-darwin-amd64-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
vmalert-darwin-arm64-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
vmalert-windows-amd64-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
package-vmalert:
|
||||
APP_NAME=vmalert $(MAKE) package-via-docker
|
||||
|
||||
@@ -70,6 +79,13 @@ run-vmalert: vmalert
|
||||
-evaluationInterval=3s \
|
||||
-rule.configCheckInterval=10s
|
||||
|
||||
run-vmalert-sd: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-remoteWrite.url=http://localhost:8428 \
|
||||
-notifier.config=app/vmalert/notifier/testdata/consul.good.yaml \
|
||||
-configCheckInterval=10s
|
||||
|
||||
replay-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules-replay-good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
@@ -102,6 +118,3 @@ vmalert-pure:
|
||||
|
||||
vmalert-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmalert $(MAKE) app-local-windows-with-goarch
|
||||
|
||||
vmalert-windows-amd64-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
@@ -3,27 +3,29 @@
|
||||
`vmalert` executes a list of the given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
rules against configured `-datasource.url`. For sending alerting notifications
|
||||
vmalert relies on [Alertmanager]((https://github.com/prometheus/alertmanager)) configured via `-notifier.url` flag.
|
||||
vmalert relies on [Alertmanager](https://github.com/prometheus/alertmanager) configured via `-notifier.url` flag.
|
||||
Recording rules results are persisted via [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
|
||||
protocol and require `-remoteWrite.url` to be configured.
|
||||
Vmalert is heavily inspired by [Prometheus](https://prometheus.io/docs/alerting/latest/overview/)
|
||||
implementation and aims to be compatible with its syntax.
|
||||
|
||||
## Features
|
||||
|
||||
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
|
||||
* VictoriaMetrics [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html)
|
||||
support and expressions validation;
|
||||
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
|
||||
support;
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager) starting from [Alertmanager v0.16.0-aplha](https://github.com/prometheus/alertmanager/releases/tag/v0.16.0-alpha.0);
|
||||
* Keeps the alerts [state on restarts](#alerts-state-on-restarts);
|
||||
* Graphite datasource can be used for alerting and recording rules. See [these docs](#graphite);
|
||||
* Recording and Alerting rules backfilling (aka `replay`). See [these docs](#rules-backfilling);
|
||||
* Lightweight without extra dependencies.
|
||||
|
||||
## Limitations
|
||||
|
||||
* `vmalert` execute queries against remote datasource which has reliability risks because of the network.
|
||||
It is recommended to configure alerts thresholds and rules expressions with the understanding that network
|
||||
It is recommended to configure alerts thresholds and rules expressions with the understanding that network
|
||||
requests may fail;
|
||||
* by default, rules execution is sequential within one group, but persistence of execution results to remote
|
||||
storage is asynchronous. Hence, user shouldn't rely on chaining of recording rules when result of previous
|
||||
@@ -32,24 +34,29 @@ recording rule is reused in the next one;
|
||||
## QuickStart
|
||||
|
||||
To build `vmalert` from sources:
|
||||
```
|
||||
|
||||
```bash
|
||||
git clone https://github.com/VictoriaMetrics/VictoriaMetrics
|
||||
cd VictoriaMetrics
|
||||
make vmalert
|
||||
```
|
||||
|
||||
The build binary will be placed in `VictoriaMetrics/bin` folder.
|
||||
|
||||
To start using `vmalert` you will need the following things:
|
||||
|
||||
* list of rules - PromQL/MetricsQL expressions to execute;
|
||||
* datasource address - reachable MetricsQL endpoint to run queries against;
|
||||
* notifier address [optional] - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
aggregating alerts, and sending notifications.
|
||||
aggregating alerts, and sending notifications. Please note, notifier address also supports Consul Service Discovery via
|
||||
[config file](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/notifier/config.go).
|
||||
* remote write address [optional] - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
|
||||
compatible storage to persist rules and alerts state info;
|
||||
* remote read address [optional] - MetricsQL compatible datasource to restore alerts state from.
|
||||
|
||||
Then configure `vmalert` accordingly:
|
||||
```
|
||||
|
||||
```bash
|
||||
./bin/vmalert -rule=alert.rules \ # Path to the file with rules configuration. Supports wildcard
|
||||
-datasource.url=http://localhost:8428 \ # PromQL compatible datasource
|
||||
-notifier.url=http://localhost:9093 \ # AlertManager URL (required if alerting rules are used)
|
||||
@@ -76,6 +83,7 @@ and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerti
|
||||
similar to Prometheus rules and configured using YAML. Configuration examples may be found
|
||||
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
|
||||
Every `rule` belongs to a `group` and every configuration file may contain arbitrary number of groups:
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
[ - <rule_group> ]
|
||||
@@ -84,6 +92,7 @@ groups:
|
||||
### Groups
|
||||
|
||||
Each group has the following attributes:
|
||||
|
||||
```yaml
|
||||
# The name of the group. Must be unique within a file.
|
||||
name: <string>
|
||||
@@ -135,6 +144,7 @@ or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expression. Vmal
|
||||
expression and then act according to the Rule type.
|
||||
|
||||
There are two types of Rules:
|
||||
|
||||
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
|
||||
Alerting rules allow defining alert conditions via `expr` field and to send notifications to
|
||||
[Alertmanager](https://github.com/prometheus/alertmanager) if execution result is not empty.
|
||||
@@ -149,6 +159,7 @@ within one group.
|
||||
#### Alerting rules
|
||||
|
||||
The syntax for alerting rule is the following:
|
||||
|
||||
```yaml
|
||||
# The name of the alert. Must be a valid metric name.
|
||||
alert: <string>
|
||||
@@ -181,6 +192,7 @@ listed [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app
|
||||
#### Recording rules
|
||||
|
||||
The syntax for recording rules is following:
|
||||
|
||||
```yaml
|
||||
# The name of the time series to output to. Must be a valid metric name.
|
||||
record: <string>
|
||||
@@ -197,11 +209,11 @@ labels:
|
||||
|
||||
For recording rules to work `-remoteWrite.url` must be specified.
|
||||
|
||||
|
||||
### Alerts state on restarts
|
||||
|
||||
`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after restart of `vmalert`
|
||||
the process alerts state will be lost. To avoid this situation, `vmalert` should be configured via the following flags:
|
||||
|
||||
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or vminsert (Cluster). `vmalert` will persist alerts state
|
||||
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
|
||||
These are regular time series and maybe queried from VM just as any other time series.
|
||||
@@ -213,7 +225,6 @@ Both flags are required for proper state restoration. Restore process may fail i
|
||||
in configured `-remoteRead.url`, weren't updated in the last `1h` (controlled by `-remoteRead.lookback`)
|
||||
or received state doesn't match current `vmalert` rules configuration.
|
||||
|
||||
|
||||
### Multitenancy
|
||||
|
||||
There are the following approaches exist for alerting and recording rules across
|
||||
@@ -229,7 +240,7 @@ There are the following approaches exist for alerting and recording rules across
|
||||
rules to `AccountID=123`.
|
||||
|
||||
* To specify `tenant` parameter per each alerting and recording group if
|
||||
[enterprise version of vmalert](https://victoriametrics.com/enterprise.html) is used
|
||||
[enterprise version of vmalert](https://victoriametrics.com/products/enterprise/) is used
|
||||
with `-clusterMode` command-line flag. For example:
|
||||
|
||||
```yaml
|
||||
@@ -258,10 +269,11 @@ tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).
|
||||
|
||||
### Topology examples
|
||||
|
||||
The following sections are showing how `vmalert` may be used and configured
|
||||
for different scenarios.
|
||||
The following sections are showing how `vmalert` may be used and configured
|
||||
for different scenarios.
|
||||
|
||||
Please note, not all flags in examples are required:
|
||||
|
||||
Please note, not all flags in examples are required:
|
||||
* `-remoteWrite.url` and `-remoteRead.url` are optional and are needed only if
|
||||
you have recording rules or want to store [alerts state](#alerts-state-on-restarts) on `vmalert` restarts;
|
||||
* `-notifier.url` is optional and is needed only if you have alerting rules.
|
||||
@@ -272,6 +284,7 @@ The simplest configuration where one single-node VM server is used for
|
||||
rules execution, storing recording rules results and alerts state.
|
||||
|
||||
`vmalert` configuration flags:
|
||||
|
||||
```
|
||||
./bin/vmalert -rule=rules.yml \ # Path to the file with rules configuration. Supports wildcard
|
||||
-datasource.url=http://victoriametrics:8428 \ # VM-single addr for executing rules expressions
|
||||
@@ -282,20 +295,20 @@ rules execution, storing recording rules results and alerts state.
|
||||
|
||||
<img alt="vmalert single" width="500" src="vmalert_single.png">
|
||||
|
||||
|
||||
#### Cluster VictoriaMetrics
|
||||
|
||||
In [cluster mode](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html)
|
||||
VictoriaMetrics has separate components for writing and reading path:
|
||||
`vminsert` and `vmselect` components respectively. `vmselect` is used for executing rules expressions
|
||||
and `vminsert` is used to persist recording rules results and alerts state.
|
||||
Cluster mode could have multiple `vminsert` and `vmselect` components.
|
||||
Cluster mode could have multiple `vminsert` and `vmselect` components.
|
||||
|
||||
`vmalert` configuration flags:
|
||||
|
||||
```
|
||||
./bin/vmalert -rule=rules.yml \ # Path to the file with rules configuration. Supports wildcard
|
||||
-datasource.url=http://vmselect:8481/select/0/prometheus # vmselect addr for executing rules expressions
|
||||
-remoteWrite.url=http://vminsert:8480/insert/0/prometheuss # vminsert addr to persist alerts state and recording rules results
|
||||
-remoteWrite.url=http://vminsert:8480/insert/0/prometheus # vminsert addr to persist alerts state and recording rules results
|
||||
-remoteRead.url=http://vmselect:8481/select/0/prometheus # vmselect addr for restoring alerts state after restart
|
||||
-notifier.url=http://alertmanager:9093 # AlertManager addr to send alerts when they trigger
|
||||
```
|
||||
@@ -314,6 +327,7 @@ the same destinations, and send alert notifications to multiple configured
|
||||
Alertmanagers.
|
||||
|
||||
`vmalert` configuration flags:
|
||||
|
||||
```
|
||||
./bin/vmalert -rule=rules.yml \ # Path to the file with rules configuration. Supports wildcard
|
||||
-datasource.url=http://victoriametrics:8428 \ # VM-single addr for executing rules expressions
|
||||
@@ -334,13 +348,12 @@ all `vmalert`s are having the same config.
|
||||
Don't forget to configure [cluster mode](https://prometheus.io/docs/alerting/latest/alertmanager/)
|
||||
for Alertmanagers for better reliability.
|
||||
|
||||
This example uses single-node VM server for the sake of simplicity.
|
||||
Check how to replace it with [cluster VictoriaMetrics](#cluster-victoriametrics) if needed.
|
||||
|
||||
This example uses single-node VM server for the sake of simplicity.
|
||||
Check how to replace it with [cluster VictoriaMetrics](#cluster-victoriametrics) if needed.
|
||||
|
||||
#### Downsampling and aggregation via vmalert
|
||||
|
||||
Example shows how to build a topology where `vmalert` will process data from one cluster
|
||||
The following example shows how to build a topology where `vmalert` will process data from one cluster
|
||||
and write results into another. Such clusters may be called as "hot" (low retention,
|
||||
high-speed disks, used for operative monitoring) and "cold" (long term retention,
|
||||
slower/cheaper disks, low resolution data). With help of `vmalert`, user can setup
|
||||
@@ -348,10 +361,11 @@ recording rules to process raw data from "hot" cluster (by applying additional t
|
||||
or reducing resolution) and push results to "cold" cluster.
|
||||
|
||||
`vmalert` configuration flags:
|
||||
|
||||
```
|
||||
./bin/vmalert -rule=downsampling-rules.yml \ # Path to the file with rules configuration. Supports wildcard
|
||||
-datasource.url=http://raw-cluster-vmselect:8481/select/0/prometheus # vmselect addr for executing recordi ng rules expressions
|
||||
-remoteWrite.url=http://aggregated-cluster-vminsert:8480/insert/0/prometheuss # vminsert addr to persist recording rules results
|
||||
-remoteWrite.url=http://aggregated-cluster-vminsert:8480/insert/0/prometheus # vminsert addr to persist recording rules results
|
||||
```
|
||||
|
||||
<img alt="vmalert multi cluster" src="vmalert_multicluster.png">
|
||||
@@ -360,19 +374,20 @@ Please note, [replay](#rules-backfilling) feature may be used for transforming h
|
||||
|
||||
Flags `-remoteRead.url` and `-notifier.url` are omitted since we assume only recording rules are used.
|
||||
|
||||
See also [downsampling docs](https://docs.victoriametrics.com/#downsampling).
|
||||
|
||||
### Web
|
||||
|
||||
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
|
||||
* `http://<vmalert-addr>` - UI;
|
||||
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/rules` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
|
||||
* `http://<vmalert-addr>/api/v1/<groupID>/<alertID>/status" ` - get alert status by ID.
|
||||
* `http://<vmalert-addr>/api/v1/<groupID>/<alertID>/status"` - get alert status by ID.
|
||||
Used as alert source in AlertManager.
|
||||
* `http://<vmalert-addr>/metrics` - application metrics.
|
||||
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
|
||||
|
||||
|
||||
## Graphite
|
||||
|
||||
vmalert sends requests to `<-datasource.url>/render?format=json` during evaluation of alerting and recording rules
|
||||
@@ -392,6 +407,7 @@ data source for backfilling.
|
||||
|
||||
In `replay` mode vmalert works as a cli-tool and exits immediately after work is done.
|
||||
To run vmalert in `replay` mode:
|
||||
|
||||
```
|
||||
./bin/vmalert -rule=path/to/your.rules \ # path to files with rules you usually use with vmalert
|
||||
-datasource.url=http://localhost:8428 \ # PromQL/MetricsQL compatible datasource
|
||||
@@ -401,6 +417,7 @@ To run vmalert in `replay` mode:
|
||||
```
|
||||
|
||||
The output of the command will look like the following:
|
||||
|
||||
```
|
||||
Replay mode:
|
||||
from: 2021-05-11 07:21:43 +0000 UTC # set by -replay.timeFrom
|
||||
@@ -444,9 +461,11 @@ The result of recording rules `replay` should match with results of normal rules
|
||||
|
||||
The result of alerting rules `replay` is time series reflecting [alert's state](#alerts-state-on-restarts).
|
||||
To see if `replayed` alert has fired in the past use the following PromQL/MetricsQL expression:
|
||||
|
||||
```
|
||||
ALERTS{alertname="your_alertname", alertstate="firing"}
|
||||
```
|
||||
|
||||
Execute the query against storage which was used for `-remoteWrite.url` during the `replay`.
|
||||
|
||||
### Additional configuration
|
||||
@@ -470,7 +489,6 @@ See full description for these flags in `./vmalert --help`.
|
||||
* Graphite engine isn't supported yet;
|
||||
* `query` template function is disabled for performance reasons (might be changed in future);
|
||||
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
|
||||
@@ -481,7 +499,6 @@ Use the official [Grafana dashboard](https://grafana.com/grafana/dashboards/1495
|
||||
If you have suggestions for improvements or have found a bug - please open an issue on github or add
|
||||
a review to the dashboard.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
### Flags
|
||||
@@ -490,228 +507,313 @@ Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
command-line flags with their descriptions.
|
||||
|
||||
The shortlist of configuration flags is the following:
|
||||
|
||||
```
|
||||
-clusterMode
|
||||
If clusterMode is enabled, then vmalert automatically adds the tenant specified in config groups to -datasource.url, -remoteWrite.url and -remoteRead.url. See https://docs.victoriametrics.com/vmalert.html#multitenancy
|
||||
-configCheckInterval duration
|
||||
Interval for checking for changes in '-rule' or '-notifier.config' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes.
|
||||
-datasource.appendTypePrefix
|
||||
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.
|
||||
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.
|
||||
-datasource.basicAuth.password string
|
||||
Optional basic auth password for -datasource.url
|
||||
Optional basic auth password for -datasource.url
|
||||
-datasource.basicAuth.passwordFile string
|
||||
Optional path to basic auth password to use for -datasource.url
|
||||
Optional path to basic auth password to use for -datasource.url
|
||||
-datasource.basicAuth.username string
|
||||
Optional basic auth username for -datasource.url
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.bearerToken string
|
||||
Optional bearer auth token to use for -datasource.url.
|
||||
Optional bearer auth token to use for -datasource.url.
|
||||
-datasource.bearerTokenFile string
|
||||
Optional path to bearer token file to use for -datasource.url.
|
||||
Optional path to bearer token file to use for -datasource.url.
|
||||
-datasource.disableKeepAlive
|
||||
Whether to disable long-lived connections to the datasource. If true, disables HTTP keep-alives and will only use the connection to the server for a single HTTP request.
|
||||
-datasource.lookback duration
|
||||
Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
|
||||
Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
|
||||
-datasource.maxIdleConnections int
|
||||
Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
|
||||
Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
|
||||
-datasource.oauth2.clientID string
|
||||
Optional OAuth2 clientID to use for -datasource.url.
|
||||
-datasource.oauth2.clientSecret string
|
||||
Optional OAuth2 clientSecret to use for -datasource.url.
|
||||
-datasource.oauth2.clientSecretFile string
|
||||
Optional OAuth2 clientSecretFile to use for -datasource.url.
|
||||
-datasource.oauth2.scopes string
|
||||
Optional OAuth2 scopes to use for -datasource.url. Scopes must be delimited by ';'
|
||||
-datasource.oauth2.tokenUrl string
|
||||
Optional OAuth2 tokenURL to use for -datasource.url.
|
||||
-datasource.queryStep duration
|
||||
queryStep defines how far a value can fallback to when evaluating queries. For example, if datasource.queryStep=15s then param "step" with value "15s" will be added to every query.If queryStep isn't specified, rule's evaluationInterval will be used instead.
|
||||
queryStep defines how far a value can fallback to when evaluating queries. For example, if datasource.queryStep=15s then param "step" with value "15s" will be added to every query.If queryStep isn't specified, rule's evaluationInterval will be used instead.
|
||||
-datasource.queryTimeAlignment
|
||||
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite of number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
|
||||
-datasource.roundDigits int
|
||||
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
|
||||
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
|
||||
-datasource.tlsCAFile string
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
|
||||
-datasource.tlsCertFile string
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
|
||||
-datasource.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
-datasource.tlsKeyFile string
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
|
||||
-datasource.tlsServerName string
|
||||
Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used
|
||||
Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used
|
||||
-datasource.url string
|
||||
VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-defaultTenant.graphite string
|
||||
Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy
|
||||
-defaultTenant.prometheus string
|
||||
Default tenant for Prometheus alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy
|
||||
-disableAlertgroupLabel
|
||||
Whether to disable adding group's name as label to generated alerts and time series.
|
||||
Whether to disable adding group's Name as label to generated alerts and time series.
|
||||
-dryRun -rule
|
||||
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
|
||||
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.alert.source string
|
||||
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used
|
||||
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used
|
||||
-external.label array
|
||||
Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional label in the form 'Name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
Address to listen for http connections (default ":8880")
|
||||
Address to listen for http connections (default ":8880")
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-notifier.basicAuth.password array
|
||||
Optional basic auth password for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional basic auth password for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.basicAuth.passwordFile array
|
||||
Optional path to basic auth password file for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.basicAuth.username array
|
||||
Optional basic auth username for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional basic auth username for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.bearerToken array
|
||||
Optional bearer token for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.bearerTokenFile array
|
||||
Optional path to bearer token file for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.config string
|
||||
Path to configuration file for notifiers
|
||||
-notifier.oauth2.clientID array
|
||||
Optional OAuth2 clientID to use for -notifier.url. If multiple args are set, then they are applied independently for the corresponding -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.oauth2.clientSecret array
|
||||
Optional OAuth2 clientSecret to use for -notifier.url. If multiple args are set, then they are applied independently for the corresponding -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.oauth2.clientSecretFile array
|
||||
Optional OAuth2 clientSecretFile to use for -notifier.url. If multiple args are set, then they are applied independently for the corresponding -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.oauth2.scopes array
|
||||
Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'. If multiple args are set, then they are applied independently for the corresponding -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.oauth2.tokenUrl array
|
||||
Optional OAuth2 tokenURL to use for -notifier.url. If multiple args are set, then they are applied independently for the corresponding -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.suppressDuplicateTargetErrors
|
||||
Whether to suppress 'duplicate target' errors during discovery
|
||||
-notifier.tlsCAFile array
|
||||
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsCertFile array
|
||||
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsInsecureSkipVerify array
|
||||
Whether to skip tls verification when connecting to -notifier.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Whether to skip tls verification when connecting to -notifier.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsKeyFile array
|
||||
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsServerName array
|
||||
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.url array
|
||||
Prometheus alertmanager URL, e.g. http://127.0.0.1:9093
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Prometheus alertmanager URL, e.g. http://127.0.0.1:9093
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-promscrape.consul.waitTime duration
|
||||
Wait time used by Consul service discovery. Default value is used if not set
|
||||
-promscrape.consulSDCheckInterval duration
|
||||
Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s)
|
||||
-promscrape.discovery.concurrency int
|
||||
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
|
||||
-promscrape.discovery.concurrentWaitTime duration
|
||||
The maximum duration for waiting to perform API requests if more than -promscrape.discovery.concurrency requests are simultaneously performed (default 1m0s)
|
||||
-remoteRead.basicAuth.password string
|
||||
Optional basic auth password for -remoteRead.url
|
||||
Optional basic auth password for -remoteRead.url
|
||||
-remoteRead.basicAuth.passwordFile string
|
||||
Optional path to basic auth password to use for -remoteRead.url
|
||||
Optional path to basic auth password to use for -remoteRead.url
|
||||
-remoteRead.basicAuth.username string
|
||||
Optional basic auth username for -remoteRead.url
|
||||
Optional basic auth username for -remoteRead.url
|
||||
-remoteRead.bearerToken string
|
||||
Optional bearer auth token to use for -remoteRead.url.
|
||||
Optional bearer auth token to use for -remoteRead.url.
|
||||
-remoteRead.bearerTokenFile string
|
||||
Optional path to bearer token file to use for -remoteRead.url.
|
||||
Optional path to bearer token file to use for -remoteRead.url.
|
||||
-remoteRead.disablePathAppend
|
||||
Whether to disable automatic appending of '/api/v1/query' path to the configured -remoteRead.url.
|
||||
Whether to disable automatic appending of '/api/v1/query' path to the configured -remoteRead.url.
|
||||
-remoteRead.ignoreRestoreErrors
|
||||
Whether to ignore errors from remote storage when restoring alerts state on startup. (default true)
|
||||
Whether to ignore errors from remote storage when restoring alerts state on startup. (default true)
|
||||
-remoteRead.lookback duration
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
-remoteRead.oauth2.clientID string
|
||||
Optional OAuth2 clientID to use for -remoteRead.url.
|
||||
-remoteRead.oauth2.clientSecret string
|
||||
Optional OAuth2 clientSecret to use for -remoteRead.url.
|
||||
-remoteRead.oauth2.clientSecretFile string
|
||||
Optional OAuth2 clientSecretFile to use for -remoteRead.url.
|
||||
-remoteRead.oauth2.scopes string
|
||||
Optional OAuth2 scopes to use for -remoteRead.url. Scopes must be delimited by ';'.
|
||||
-remoteRead.oauth2.tokenUrl string
|
||||
Optional OAuth2 tokenURL to use for -remoteRead.url.
|
||||
-remoteRead.tlsCAFile string
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
|
||||
-remoteRead.tlsCertFile string
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
|
||||
-remoteRead.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteRead.url
|
||||
Whether to skip tls verification when connecting to -remoteRead.url
|
||||
-remoteRead.tlsKeyFile string
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
|
||||
-remoteRead.tlsServerName string
|
||||
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
|
||||
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
|
||||
-remoteRead.url vmalert
|
||||
Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428. See also -remoteRead.disablePathAppend
|
||||
Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428. See also -remoteRead.disablePathAppend
|
||||
-remoteWrite.basicAuth.password string
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.passwordFile string
|
||||
Optional path to basic auth password to use for -remoteWrite.url
|
||||
Optional path to basic auth password to use for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.username string
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
-remoteWrite.bearerToken string
|
||||
Optional bearer auth token to use for -remoteWrite.url.
|
||||
Optional bearer auth token to use for -remoteWrite.url.
|
||||
-remoteWrite.bearerTokenFile string
|
||||
Optional path to bearer token file to use for -remoteWrite.url.
|
||||
Optional path to bearer token file to use for -remoteWrite.url.
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of writers for concurrent writing into remote querier (default 1)
|
||||
Defines number of writers for concurrent writing into remote querier (default 1)
|
||||
-remoteWrite.disablePathAppend
|
||||
Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.
|
||||
Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.
|
||||
-remoteWrite.flushInterval duration
|
||||
Defines interval of flushes to remote write endpoint (default 5s)
|
||||
Defines interval of flushes to remote write endpoint (default 5s)
|
||||
-remoteWrite.maxBatchSize int
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
-remoteWrite.maxQueueSize int
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
-remoteWrite.oauth2.clientID string
|
||||
Optional OAuth2 clientID to use for -remoteWrite.url.
|
||||
-remoteWrite.oauth2.clientSecret string
|
||||
Optional OAuth2 clientSecret to use for -remoteWrite.url.
|
||||
-remoteWrite.oauth2.clientSecretFile string
|
||||
Optional OAuth2 clientSecretFile to use for -remoteWrite.url.
|
||||
-remoteWrite.oauth2.scopes string
|
||||
Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'.
|
||||
-remoteWrite.oauth2.tokenUrl string
|
||||
Optional OAuth2 tokenURL to use for -notifier.url.
|
||||
-remoteWrite.tlsCAFile string
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
|
||||
-remoteWrite.tlsCertFile string
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsKeyFile string
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsServerName string
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
|
||||
-remoteWrite.url string
|
||||
Optional URL to VictoriaMetrics or vminsert where to persist alerts state and recording rules results in form of timeseries. For example, if -remoteWrite.url=http://127.0.0.1:8428 is specified, then the alerts state will be written to http://127.0.0.1:8428/api/v1/write . See also -remoteWrite.disablePathAppend
|
||||
Optional URL to VictoriaMetrics or vminsert where to persist alerts state and recording rules results in form of timeseries. For example, if -remoteWrite.url=http://127.0.0.1:8428 is specified, then the alerts state will be written to http://127.0.0.1:8428/api/v1/write . See also -remoteWrite.disablePathAppend
|
||||
-replay.maxDatapointsPerQuery int
|
||||
Max number of data points expected in one request. The higher the value, the less requests will be made during replay. (default 1000)
|
||||
Max number of data points expected in one request. The higher the value, the less requests will be made during replay. (default 1000)
|
||||
-replay.ruleRetryAttempts int
|
||||
Defines how many retries to make before giving up on rule if request for it returns an error. (default 5)
|
||||
Defines how many retries to make before giving up on rule if request for it returns an error. (default 5)
|
||||
-replay.rulesDelay duration
|
||||
Delay between rules evaluation within the group. Could be important if there are chained rules inside of the groupand processing need to wait for previous rule results to be persisted by remote storage before evaluating the next rule.Keep it equal or bigger than -remoteWrite.flushInterval. (default 1s)
|
||||
Delay between rules evaluation within the group. Could be important if there are chained rules inside of the groupand processing need to wait for previous rule results to be persisted by remote storage before evaluating the next rule.Keep it equal or bigger than -remoteWrite.flushInterval. (default 1s)
|
||||
-replay.timeFrom string
|
||||
The time filter in RFC3339 format to select time series with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'
|
||||
The time filter in RFC3339 format to select time series with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'
|
||||
-replay.timeTo string
|
||||
The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'
|
||||
The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'
|
||||
-rule array
|
||||
Path to the file with alert rules.
|
||||
Supports patterns. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule="/path/to/file". Path to a single file with alerting rules
|
||||
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
Path to the file with alert rules.
|
||||
Supports patterns. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule="/path/to/file". Path to a single file with alerting rules
|
||||
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-rule.configCheckInterval duration
|
||||
Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
|
||||
Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead
|
||||
-rule.maxResolveDuration duration
|
||||
Limits the maximum duration for automatic alert expiration, which is by default equal to 3 evaluation intervals of the parent group.
|
||||
Limits the maximum duration for automatic alert expiration, which is by default equal to 3 evaluation intervals of the parent group.
|
||||
-rule.resendDelay duration
|
||||
Minimum amount of time to wait before resending an alert to notifier
|
||||
-rule.validateExpressions
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
-rule.validateTemplates
|
||||
Whether to validate annotation and label templates (default true)
|
||||
Whether to validate annotation and label templates (default true)
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower. The provided certificate file is automatically re-read every second, so it can be dynamically updated
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
Path to file with TLS key. Used only if -tls is set. The provided key file is automatically re-read every second, so it can be dynamically updated
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
### Hot config reload
|
||||
|
||||
`vmalert` supports "hot" config reload via the following methods:
|
||||
|
||||
* send SIGHUP signal to `vmalert` process;
|
||||
* send GET request to `/-/reload` endpoint;
|
||||
* configure `-rule.configCheckInterval` flag for periodic reload
|
||||
* configure `-configCheckInterval` flag for periodic reload
|
||||
on config change.
|
||||
|
||||
### URL params
|
||||
@@ -721,6 +823,7 @@ just add them in address: `-datasource.url=http://localhost:8428?nocache=1`.
|
||||
|
||||
To set additional URL params for specific [group of rules](#Groups) modify
|
||||
the `params` group:
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: TestGroup
|
||||
@@ -728,10 +831,95 @@ groups:
|
||||
denyPartialResponse: ["true"]
|
||||
extra_label: ["env=dev"]
|
||||
```
|
||||
|
||||
Please note, `params` are used only for executing rules expressions (requests to `datasource.url`).
|
||||
If there would be a conflict between URL params set in `datasource.url` flag and params in group definition
|
||||
the latter will have higher priority.
|
||||
|
||||
### Notifier configuration file
|
||||
|
||||
Notifier also supports configuration via file specified with flag `notifier.config`:
|
||||
|
||||
```
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules.good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-notifier.config=app/vmalert/notifier/testdata/consul.good.yaml
|
||||
```
|
||||
|
||||
The configuration file allows to configure static notifiers or discover notifiers via
|
||||
[Consul](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config).
|
||||
For example:
|
||||
|
||||
```
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:9093
|
||||
- localhost:9095
|
||||
|
||||
consul_sd_configs:
|
||||
- server: localhost:8500
|
||||
services:
|
||||
- alertmanager
|
||||
```
|
||||
|
||||
The list of configured or discovered Notifiers can be explored via [UI](#Web).
|
||||
|
||||
The configuration file [specification](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/notifier/config.go)
|
||||
is the following:
|
||||
|
||||
```
|
||||
# Per-target Notifier timeout when pushing alerts.
|
||||
[ timeout: <duration> | default = 10s ]
|
||||
|
||||
# Prefix for the HTTP path alerts are pushed to.
|
||||
[ path_prefix: <path> | default = / ]
|
||||
|
||||
# Configures the protocol scheme used for requests.
|
||||
[ scheme: <scheme> | default = http ]
|
||||
|
||||
# Sets the `Authorization` header on every request with the
|
||||
# configured username and password.
|
||||
# password and password_file are mutually exclusive.
|
||||
basic_auth:
|
||||
[ username: <string> ]
|
||||
[ password: <secret> ]
|
||||
[ password_file: <string> ]
|
||||
|
||||
# Optional `Authorization` header configuration.
|
||||
authorization:
|
||||
# Sets the authentication type.
|
||||
[ type: <string> | default: Bearer ]
|
||||
# Sets the credentials. It is mutually exclusive with
|
||||
# `credentials_file`.
|
||||
[ credentials: <secret> ]
|
||||
# Sets the credentials to the credentials read from the configured file.
|
||||
# It is mutually exclusive with `credentials`.
|
||||
[ credentials_file: <filename> ]
|
||||
|
||||
# Configures the scrape request's TLS settings.
|
||||
# see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tls_config
|
||||
tls_config:
|
||||
[ <tls_config> ]
|
||||
|
||||
# List of labeled statically configured Notifiers.
|
||||
static_configs:
|
||||
targets:
|
||||
[ - '<host>' ]
|
||||
|
||||
# List of Consul service discovery configurations.
|
||||
# See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config
|
||||
consul_sd_configs:
|
||||
[ - <consul_sd_config> ... ]
|
||||
|
||||
# List of relabel configurations.
|
||||
# Supports the same relabeling features as the rest of VictoriaMetrics components.
|
||||
# See https://docs.victoriametrics.com/vmagent.html#relabeling
|
||||
relabel_configs:
|
||||
[ - <relabel_config> ... ]
|
||||
|
||||
```
|
||||
|
||||
The configuration file can be [hot-reloaded](#hot-config-reload).
|
||||
|
||||
## Contributing
|
||||
|
||||
@@ -743,8 +931,8 @@ software. Please keep simplicity as the main priority.
|
||||
|
||||
It is recommended using
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
- `vmalert` is located in `vmutils-*` archives there.
|
||||
|
||||
* `vmalert` is located in `vmutils-*` archives there.
|
||||
|
||||
### Development build
|
||||
|
||||
@@ -758,7 +946,6 @@ It is recommended using
|
||||
2. Run `make vmalert-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmalert-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
|
||||
### ARM build
|
||||
|
||||
ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://blog.cloudflare.com/arm-takes-wing/).
|
||||
|
||||
@@ -12,9 +12,9 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// AlertingRule is basic alert entity
|
||||
@@ -38,6 +38,8 @@ type AlertingRule struct {
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores the duration of the last Exec call
|
||||
lastExecDuration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
@@ -50,10 +52,10 @@ type AlertingRule struct {
|
||||
}
|
||||
|
||||
type alertingRuleMetrics struct {
|
||||
errors *gauge
|
||||
pending *gauge
|
||||
active *gauge
|
||||
samples *gauge
|
||||
errors *utils.Gauge
|
||||
pending *utils.Gauge
|
||||
active *utils.Gauge
|
||||
samples *utils.Gauge
|
||||
}
|
||||
|
||||
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
||||
@@ -78,7 +80,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
||||
ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
||||
ar.metrics.pending = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
@@ -90,7 +92,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||
}
|
||||
return float64(num)
|
||||
})
|
||||
ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
|
||||
ar.metrics.active = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
@@ -102,7 +104,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||
}
|
||||
return float64(num)
|
||||
})
|
||||
ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
||||
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
@@ -111,7 +113,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||
}
|
||||
return 1
|
||||
})
|
||||
ar.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
||||
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
@@ -122,10 +124,10 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||
|
||||
// Close unregisters rule metrics
|
||||
func (ar *AlertingRule) Close() {
|
||||
metrics.UnregisterMetric(ar.metrics.active.name)
|
||||
metrics.UnregisterMetric(ar.metrics.pending.name)
|
||||
metrics.UnregisterMetric(ar.metrics.errors.name)
|
||||
metrics.UnregisterMetric(ar.metrics.samples.name)
|
||||
ar.metrics.active.Unregister()
|
||||
ar.metrics.pending.Unregister()
|
||||
ar.metrics.errors.Unregister()
|
||||
ar.metrics.samples.Unregister()
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
@@ -139,6 +141,53 @@ func (ar *AlertingRule) ID() uint64 {
|
||||
return ar.RuleID
|
||||
}
|
||||
|
||||
type labelSet struct {
|
||||
// origin labels from series
|
||||
// used for templating
|
||||
origin map[string]string
|
||||
// processed labels with additional data
|
||||
// used as Alert labels
|
||||
processed map[string]string
|
||||
}
|
||||
|
||||
// toLabels converts labels from given Metric
|
||||
// to labelSet which contains original and processed labels.
|
||||
func (ar *AlertingRule) toLabels(m datasource.Metric, qFn notifier.QueryFn) (*labelSet, error) {
|
||||
ls := &labelSet{
|
||||
origin: make(map[string]string, len(m.Labels)),
|
||||
processed: make(map[string]string),
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
ls.origin[l.Name] = l.Value
|
||||
ls.processed[l.Name] = l.Value
|
||||
}
|
||||
|
||||
extraLabels, err := notifier.ExecTemplate(qFn, ar.Labels, notifier.AlertTplData{
|
||||
Labels: ls.origin,
|
||||
Value: m.Values[0],
|
||||
Expr: ar.Expr,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
}
|
||||
for k, v := range extraLabels {
|
||||
ls.processed[k] = v
|
||||
}
|
||||
|
||||
// set additional labels to identify group and rule name
|
||||
if ar.Name != "" {
|
||||
ls.processed[alertNameLabel] = ar.Name
|
||||
}
|
||||
if !*disableAlertGroupLabel && ar.GroupName != "" {
|
||||
ls.processed[alertGroupNameLabel] = ar.GroupName
|
||||
}
|
||||
return ls, nil
|
||||
}
|
||||
|
||||
// ExecRange executes alerting rule on the given time range similarly to Exec.
|
||||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
@@ -153,24 +202,7 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
|
||||
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
|
||||
}
|
||||
for _, s := range series {
|
||||
// set additional labels to identify group and rule name
|
||||
if ar.Name != "" {
|
||||
s.SetLabel(alertNameLabel, ar.Name)
|
||||
}
|
||||
if !*disableAlertGroupLabel && ar.GroupName != "" {
|
||||
s.SetLabel(alertGroupNameLabel, ar.GroupName)
|
||||
}
|
||||
// extra labels could contain templates, so we expand them first
|
||||
labels, err := expandLabels(s, qFn, ar)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
}
|
||||
for k, v := range labels {
|
||||
// apply extra labels to datasource
|
||||
// so the hash key will be consistent on restore
|
||||
s.SetLabel(k, v)
|
||||
}
|
||||
a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert
|
||||
a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
@@ -189,9 +221,10 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
|
||||
if at.Sub(prevT) > ar.EvalInterval {
|
||||
// reset to Pending if there are gaps > EvalInterval between DPs
|
||||
a.State = notifier.StatePending
|
||||
a.Start = at
|
||||
} else if at.Sub(a.Start) >= ar.For {
|
||||
a.ActiveAt = at
|
||||
} else if at.Sub(a.ActiveAt) >= ar.For {
|
||||
a.State = notifier.StateFiring
|
||||
a.Start = at
|
||||
}
|
||||
prevT = at
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
@@ -200,15 +233,21 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// resolvedRetention is the duration for which a resolved alert instance
|
||||
// is kept in memory state and consequently repeatedly sent to the AlertManager.
|
||||
const resolvedRetention = 15 * time.Minute
|
||||
|
||||
// Exec executes AlertingRule expression via the given Querier.
|
||||
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
||||
func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := ar.q.Query(ctx, ar.Expr)
|
||||
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
qMetrics, err := ar.q.Query(ctx, ar.Expr, ts)
|
||||
ar.mu.Lock()
|
||||
defer ar.mu.Unlock()
|
||||
|
||||
ar.lastExecTime = start
|
||||
ar.lastExecDuration = time.Since(start)
|
||||
ar.lastExecError = err
|
||||
ar.lastExecTime = time.Now()
|
||||
ar.lastExecSamples = len(qMetrics)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||
@@ -216,59 +255,55 @@ func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, e
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
if a.State == notifier.StateInactive {
|
||||
if a.State == notifier.StateInactive && ts.Sub(a.ResolvedAt) > resolvedRetention {
|
||||
delete(ar.alerts, h)
|
||||
}
|
||||
}
|
||||
|
||||
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query) }
|
||||
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query, ts) }
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
// set additional labels to identify group and rule name
|
||||
if ar.Name != "" {
|
||||
m.SetLabel(alertNameLabel, ar.Name)
|
||||
}
|
||||
if !*disableAlertGroupLabel && ar.GroupName != "" {
|
||||
m.SetLabel(alertGroupNameLabel, ar.GroupName)
|
||||
}
|
||||
// extra labels could contain templates, so we expand them first
|
||||
labels, err := expandLabels(m, qFn, ar)
|
||||
ls, err := ar.toLabels(m, qFn)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
}
|
||||
for k, v := range labels {
|
||||
// apply extra labels to datasource
|
||||
// so the hash key will be consistent on restore
|
||||
m.SetLabel(k, v)
|
||||
}
|
||||
h := hash(m)
|
||||
h := hash(ls.processed)
|
||||
if _, ok := updated[h]; ok {
|
||||
// duplicate may be caused by extra labels
|
||||
// conflicting with the metric labels
|
||||
return nil, fmt.Errorf("labels %v: %w", m.Labels, errDuplicate)
|
||||
ar.lastExecError = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, ar.lastExecError
|
||||
}
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
if a.State == notifier.StateInactive {
|
||||
// alert could be in inactive state for resolvedRetention
|
||||
// so when we again receive metrics for it - we switch it
|
||||
// back to notifier.StatePending
|
||||
a.State = notifier.StatePending
|
||||
a.ActiveAt = ts
|
||||
}
|
||||
if a.Value != m.Values[0] {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Values[0]
|
||||
// and re-exec template since Value can be used
|
||||
// in annotations
|
||||
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
|
||||
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := ar.newAlert(m, ar.lastExecTime, qFn)
|
||||
a, err := ar.newAlert(m, ls, ar.lastExecTime, qFn)
|
||||
if err != nil {
|
||||
ar.lastExecError = err
|
||||
return nil, fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
a.ActiveAt = ts
|
||||
ar.alerts[h] = a
|
||||
}
|
||||
|
||||
@@ -282,28 +317,19 @@ func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, e
|
||||
delete(ar.alerts, h)
|
||||
continue
|
||||
}
|
||||
a.State = notifier.StateInactive
|
||||
if a.State == notifier.StateFiring {
|
||||
a.State = notifier.StateInactive
|
||||
a.ResolvedAt = ts
|
||||
}
|
||||
continue
|
||||
}
|
||||
if a.State == notifier.StatePending && time.Since(a.Start) >= ar.For {
|
||||
if a.State == notifier.StatePending && time.Since(a.ActiveAt) >= ar.For {
|
||||
a.State = notifier.StateFiring
|
||||
a.Start = ts
|
||||
alertsFired.Inc()
|
||||
}
|
||||
}
|
||||
return ar.toTimeSeries(ar.lastExecTime.Unix()), nil
|
||||
}
|
||||
|
||||
func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) {
|
||||
metricLabels := make(map[string]string)
|
||||
for _, l := range m.Labels {
|
||||
metricLabels[l.Name] = l.Value
|
||||
}
|
||||
tpl := notifier.AlertTplData{
|
||||
Labels: metricLabels,
|
||||
Value: m.Values[0],
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
return notifier.ExecTemplate(q, ar.Labels, tpl)
|
||||
return ar.toTimeSeries(ts.Unix()), nil
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
|
||||
@@ -336,42 +362,43 @@ func (ar *AlertingRule) UpdateWith(r Rule) error {
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(m datasource.Metric) uint64 {
|
||||
func hash(labels map[string]string) uint64 {
|
||||
hash := fnv.New64a()
|
||||
labels := m.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, k := range keys {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
if k == "__name__" {
|
||||
continue
|
||||
}
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
name, value := k, labels[k]
|
||||
hash.Write([]byte(name))
|
||||
hash.Write([]byte(value))
|
||||
hash.Write([]byte("\xff"))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
|
||||
a := ¬ifier.Alert{
|
||||
GroupID: ar.GroupID,
|
||||
Name: ar.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Values[0],
|
||||
Start: start,
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
a.Labels[l.Name] = l.Value
|
||||
}
|
||||
func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
|
||||
var err error
|
||||
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
|
||||
if ls == nil {
|
||||
ls, err = ar.toLabels(m, qFn)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
}
|
||||
}
|
||||
a := ¬ifier.Alert{
|
||||
GroupID: ar.GroupID,
|
||||
Name: ar.Name,
|
||||
Labels: ls.processed,
|
||||
Value: m.Values[0],
|
||||
ActiveAt: start,
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
|
||||
return a, err
|
||||
}
|
||||
|
||||
@@ -386,34 +413,54 @@ func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIAlertingRule
|
||||
func (ar *AlertingRule) RuleAPI() APIAlertingRule {
|
||||
var lastErr string
|
||||
// ToAPI returns Rule representation in form
|
||||
// of APIRule
|
||||
func (ar *AlertingRule) ToAPI() APIRule {
|
||||
r := APIRule{
|
||||
Type: "alerting",
|
||||
DatasourceType: ar.Type.String(),
|
||||
Name: ar.Name,
|
||||
Query: ar.Expr,
|
||||
Duration: ar.For.Seconds(),
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
LastEvaluation: ar.lastExecTime,
|
||||
EvaluationTime: ar.lastExecDuration.Seconds(),
|
||||
Health: "ok",
|
||||
State: "inactive",
|
||||
Alerts: ar.AlertsToAPI(),
|
||||
LastSamples: ar.lastExecSamples,
|
||||
|
||||
// encode as strings to avoid rounding in JSON
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
}
|
||||
if ar.lastExecError != nil {
|
||||
lastErr = ar.lastExecError.Error()
|
||||
r.LastError = ar.lastExecError.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
return APIAlertingRule{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
Type: ar.Type.String(),
|
||||
Name: ar.Name,
|
||||
Expression: ar.Expr,
|
||||
For: ar.For.String(),
|
||||
LastError: lastErr,
|
||||
LastSamples: ar.lastExecSamples,
|
||||
LastExec: ar.lastExecTime,
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
// satisfy APIRule.State logic
|
||||
if len(r.Alerts) > 0 {
|
||||
r.State = notifier.StatePending.String()
|
||||
stateFiring := notifier.StateFiring.String()
|
||||
for _, a := range r.Alerts {
|
||||
if a.State == stateFiring {
|
||||
r.State = stateFiring
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// AlertsAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsAPI() []*APIAlert {
|
||||
// AlertsToAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.mu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.mu.RUnlock()
|
||||
@@ -432,7 +479,7 @@ func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.Start,
|
||||
ActiveAt: a.ActiveAt,
|
||||
Restored: a.Restored,
|
||||
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
|
||||
}
|
||||
@@ -458,7 +505,7 @@ const (
|
||||
alertGroupNameLabel = "alertgroup"
|
||||
)
|
||||
|
||||
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
// alertToTimeSeries converts the given alert with the given timestamp to time series
|
||||
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(a, timestamp))
|
||||
@@ -486,11 +533,11 @@ func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.Time
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
return newTimeSeries([]float64{float64(a.Start.Unix())}, []int64{timestamp}, labels)
|
||||
return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels)
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written time series.
|
||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||
// Restore restores only ActiveAt field. Field State will be always Pending and supposed
|
||||
// to be updated on next Exec, as well as Value field.
|
||||
// Only rules with For > 0 will be restored.
|
||||
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
|
||||
@@ -498,7 +545,8 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
|
||||
return fmt.Errorf("querier is nil")
|
||||
}
|
||||
|
||||
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query) }
|
||||
ts := time.Now()
|
||||
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query, ts) }
|
||||
|
||||
// account for external labels in filter
|
||||
var labelsFilter string
|
||||
@@ -511,21 +559,61 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
|
||||
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
qMetrics, err := q.Query(ctx, expr, ts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn)
|
||||
ls := &labelSet{
|
||||
origin: make(map[string]string, len(m.Labels)),
|
||||
processed: make(map[string]string, len(m.Labels)),
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
if l.Name == "__name__" {
|
||||
continue
|
||||
}
|
||||
ls.origin[l.Name] = l.Value
|
||||
ls.processed[l.Name] = l.Value
|
||||
}
|
||||
a, err := ar.newAlert(m, ls, time.Unix(int64(m.Values[0]), 0), qFn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
a.ID = hash(m)
|
||||
a.ID = hash(ls.processed)
|
||||
a.State = notifier.StatePending
|
||||
a.Restored = true
|
||||
ar.alerts[a.ID] = a
|
||||
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start)
|
||||
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// alertsToSend walks through the current alerts of AlertingRule
|
||||
// and returns only those which should be sent to notifier.
|
||||
// Isn't concurrent safe.
|
||||
func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert {
|
||||
needsSending := func(a *notifier.Alert) bool {
|
||||
if a.State == notifier.StatePending {
|
||||
return false
|
||||
}
|
||||
if a.ResolvedAt.After(a.LastSent) {
|
||||
return true
|
||||
}
|
||||
return a.LastSent.Add(resendDelay).Before(ts)
|
||||
}
|
||||
|
||||
var alerts []notifier.Alert
|
||||
for _, a := range ar.alerts {
|
||||
if !needsSending(a) {
|
||||
continue
|
||||
}
|
||||
a.End = ts.Add(resolveDuration)
|
||||
if a.State == notifier.StateInactive {
|
||||
a.End = a.ResolvedAt
|
||||
}
|
||||
a.LastSent = ts
|
||||
alerts = append(alerts, *a)
|
||||
}
|
||||
return alerts
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -60,7 +61,7 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for", time.Second),
|
||||
¬ifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
|
||||
¬ifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
@@ -75,7 +76,7 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for pending", 10*time.Second),
|
||||
¬ifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
|
||||
¬ifier.Alert{State: notifier.StatePending, ActiveAt: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
@@ -168,7 +169,7 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>inactive", 0),
|
||||
[][]datasource.Metric{
|
||||
{metricWithLabels(t, "name", "foo")},
|
||||
{},
|
||||
@@ -176,7 +177,9 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
{},
|
||||
{},
|
||||
},
|
||||
nil,
|
||||
[]testAlert{
|
||||
{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
|
||||
@@ -216,8 +219,9 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
},
|
||||
// 1: fire first alert
|
||||
// 2: fire second alert, set first inactive
|
||||
// 3: fire third alert, set second inactive, delete first one
|
||||
// 3: fire third alert, set second inactive
|
||||
[]testAlert{
|
||||
{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}},
|
||||
{labels: []string{"name", "foo1"}, alert: ¬ifier.Alert{State: notifier.StateInactive}},
|
||||
{labels: []string{"name", "foo2"}, alert: ¬ifier.Alert{State: notifier.StateFiring}},
|
||||
},
|
||||
@@ -300,7 +304,7 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
for _, step := range tc.steps {
|
||||
fq.reset()
|
||||
fq.add(step...)
|
||||
if _, err := tc.rule.Exec(context.TODO()); err != nil {
|
||||
if _, err := tc.rule.Exec(context.TODO(), time.Now()); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
// artificial delay between applying steps
|
||||
@@ -311,10 +315,13 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
}
|
||||
expAlerts := make(map[uint64]*notifier.Alert)
|
||||
for _, ta := range tc.expAlerts {
|
||||
labels := ta.labels
|
||||
labels = append(labels, alertNameLabel)
|
||||
labels = append(labels, tc.rule.Name)
|
||||
h := hash(metricWithLabels(t, labels...))
|
||||
labels := make(map[string]string)
|
||||
for i := 0; i < len(ta.labels); i += 2 {
|
||||
k, v := ta.labels[i], ta.labels[i+1]
|
||||
labels[k] = v
|
||||
}
|
||||
labels[alertNameLabel] = tc.rule.Name
|
||||
h := hash(labels)
|
||||
expAlerts[h] = ta.alert
|
||||
}
|
||||
for key, exp := range expAlerts {
|
||||
@@ -379,9 +386,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(3, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(5, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(3, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -390,9 +397,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -401,11 +408,11 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
{Values: []float64{1, 1, 1, 1, 1}, Timestamps: []int64{1, 2, 5, 6, 20}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(5, 0)},
|
||||
{State: notifier.StateFiring, Start: time.Unix(5, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(20, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
|
||||
{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(20, 0)},
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -417,15 +424,15 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
|
||||
//
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0),
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
}},
|
||||
{State: notifier.StatePending, Start: time.Unix(5, 0),
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0),
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
}},
|
||||
@@ -478,7 +485,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
a.Labels = make(map[string]string)
|
||||
}
|
||||
a.Labels[alertNameLabel] = tc.rule.Name
|
||||
expTS = append(expTS, tc.rule.alertToTimeSeries(tc.expAlerts[j], timestamp)...)
|
||||
expTS = append(expTS, tc.rule.alertToTimeSeries(a, timestamp)...)
|
||||
j++
|
||||
}
|
||||
}
|
||||
@@ -509,8 +516,8 @@ func TestAlertingRule_Restore(t *testing.T) {
|
||||
),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(datasource.Metric{}): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(time.Hour)},
|
||||
hash(nil): {State: notifier.StatePending,
|
||||
ActiveAt: time.Now().Truncate(time.Hour)},
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -525,13 +532,13 @@ func TestAlertingRule_Restore(t *testing.T) {
|
||||
),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t,
|
||||
alertNameLabel, "metric labels",
|
||||
alertGroupNameLabel, "groupID",
|
||||
"foo", "bar",
|
||||
"namespace", "baz",
|
||||
)): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(time.Hour)},
|
||||
hash(map[string]string{
|
||||
alertNameLabel: "metric labels",
|
||||
alertGroupNameLabel: "groupID",
|
||||
"foo": "bar",
|
||||
"namespace": "baz",
|
||||
}): {State: notifier.StatePending,
|
||||
ActiveAt: time.Now().Truncate(time.Hour)},
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -546,12 +553,12 @@ func TestAlertingRule_Restore(t *testing.T) {
|
||||
),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t,
|
||||
"foo", "bar",
|
||||
"namespace", "baz",
|
||||
"source", "vm",
|
||||
)): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(time.Hour)},
|
||||
hash(map[string]string{
|
||||
"foo": "bar",
|
||||
"namespace": "baz",
|
||||
"source": "vm",
|
||||
}): {State: notifier.StatePending,
|
||||
ActiveAt: time.Now().Truncate(time.Hour)},
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -571,12 +578,12 @@ func TestAlertingRule_Restore(t *testing.T) {
|
||||
),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(time.Hour)},
|
||||
hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(2 * time.Hour)},
|
||||
hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending,
|
||||
Start: time.Now().Truncate(3 * time.Hour)},
|
||||
hash(map[string]string{"host": "localhost-1"}): {State: notifier.StatePending,
|
||||
ActiveAt: time.Now().Truncate(time.Hour)},
|
||||
hash(map[string]string{"host": "localhost-2"}): {State: notifier.StatePending,
|
||||
ActiveAt: time.Now().Truncate(2 * time.Hour)},
|
||||
hash(map[string]string{"host": "localhost-3"}): {State: notifier.StatePending,
|
||||
ActiveAt: time.Now().Truncate(3 * time.Hour)},
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -601,8 +608,8 @@ func TestAlertingRule_Restore(t *testing.T) {
|
||||
if got.State != exp.State {
|
||||
t.Fatalf("expected state %d; got %d", exp.State, got.State)
|
||||
}
|
||||
if got.Start != exp.Start {
|
||||
t.Fatalf("expected Start %v; got %v", exp.Start, got.Start)
|
||||
if got.ActiveAt != exp.ActiveAt {
|
||||
t.Fatalf("expected ActiveAt %v; got %v", exp.ActiveAt, got.ActiveAt)
|
||||
}
|
||||
}
|
||||
})
|
||||
@@ -617,14 +624,14 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
|
||||
|
||||
// successful attempt
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
_, err := ar.Exec(context.TODO())
|
||||
_, err := ar.Exec(context.TODO(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// label `job` will collide with rule extra label and will make both time series equal
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
|
||||
_, err = ar.Exec(context.TODO())
|
||||
_, err = ar.Exec(context.TODO(), time.Now())
|
||||
if !errors.Is(err, errDuplicate) {
|
||||
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
|
||||
}
|
||||
@@ -633,7 +640,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
|
||||
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
_, err = ar.Exec(context.TODO())
|
||||
_, err = ar.Exec(context.TODO(), time.Now())
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
@@ -655,7 +662,7 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
metricWithValueAndLabels(t, 1, "instance", "bar"),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "foo")): {
|
||||
hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "foo"}): {
|
||||
Annotations: map[string]string{},
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "common",
|
||||
@@ -663,7 +670,7 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
"instance": "foo",
|
||||
},
|
||||
},
|
||||
hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "bar")): {
|
||||
hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "bar"}): {
|
||||
Annotations: map[string]string{},
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "common",
|
||||
@@ -678,77 +685,70 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
Name: "override label",
|
||||
Labels: map[string]string{
|
||||
"instance": "{{ $labels.instance }}",
|
||||
"region": "east",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `Too high connection number for "{{ $labels.instance }}" for region {{ $labels.region }}`,
|
||||
"description": `It is {{ $value }} connections for "{{ $labels.instance }}"`,
|
||||
"summary": `Too high connection number for "{{ $labels.instance }}"`,
|
||||
"description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`,
|
||||
},
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "instance", "foo"),
|
||||
metricWithValueAndLabels(t, 10, "instance", "bar"),
|
||||
metricWithValueAndLabels(t, 2, "instance", "foo", alertNameLabel, "override"),
|
||||
metricWithValueAndLabels(t, 10, "instance", "bar", alertNameLabel, "override"),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "foo")): {
|
||||
hash(map[string]string{alertNameLabel: "override label", "instance": "foo"}): {
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "override label",
|
||||
"instance": "foo",
|
||||
"region": "east",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `Too high connection number for "foo" for region east`,
|
||||
"description": `It is 2 connections for "foo"`,
|
||||
"summary": `Too high connection number for "foo"`,
|
||||
"description": `override: It is 2 connections for "foo"`,
|
||||
},
|
||||
},
|
||||
hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "bar")): {
|
||||
hash(map[string]string{alertNameLabel: "override label", "instance": "bar"}): {
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "override label",
|
||||
"instance": "bar",
|
||||
"region": "east",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `Too high connection number for "bar" for region east`,
|
||||
"description": `It is 10 connections for "bar"`,
|
||||
"summary": `Too high connection number for "bar"`,
|
||||
"description": `override: It is 10 connections for "bar"`,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
&AlertingRule{
|
||||
Name: "ExtraTemplating",
|
||||
Name: "OriginLabels",
|
||||
GroupName: "Testing",
|
||||
Labels: map[string]string{
|
||||
"name": "alert_{{ $labels.alertname }}",
|
||||
"group": "group_{{ $labels.alertgroup }}",
|
||||
"instance": "{{ $labels.instance }}",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
|
||||
"description": `Alert "{{ $labels.name }}({{ $labels.group }})" for instance {{ $labels.instance }}`,
|
||||
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
|
||||
},
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1, "instance", "foo"),
|
||||
metricWithValueAndLabels(t, 1,
|
||||
alertNameLabel, "originAlertname",
|
||||
alertGroupNameLabel, "originGroupname",
|
||||
"instance", "foo"),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, alertNameLabel, "ExtraTemplating",
|
||||
"name", "alert_ExtraTemplating",
|
||||
alertGroupNameLabel, "Testing",
|
||||
"group", "group_Testing",
|
||||
"instance", "foo")): {
|
||||
hash(map[string]string{
|
||||
alertNameLabel: "OriginLabels",
|
||||
alertGroupNameLabel: "Testing",
|
||||
"instance": "foo"}): {
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "ExtraTemplating",
|
||||
"name": "alert_ExtraTemplating",
|
||||
alertNameLabel: "OriginLabels",
|
||||
alertGroupNameLabel: "Testing",
|
||||
"group": "group_Testing",
|
||||
"instance": "foo",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `Alert "ExtraTemplating(Testing)" for instance foo`,
|
||||
"description": `Alert "alert_ExtraTemplating(group_Testing)" for instance foo`,
|
||||
"summary": `Alert "originAlertname(originGroupname)" for instance foo`,
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -761,7 +761,7 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
tc.rule.q = fq
|
||||
fq.add(tc.metrics...)
|
||||
if _, err := tc.rule.Exec(context.TODO()); err != nil {
|
||||
if _, err := tc.rule.Exec(context.TODO(), time.Now()); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
for hash, expAlert := range tc.expAlerts {
|
||||
@@ -781,6 +781,86 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlertsToSend(t *testing.T) {
|
||||
ts := time.Now()
|
||||
f := func(alerts, expAlerts []*notifier.Alert, resolveDuration, resendDelay time.Duration) {
|
||||
t.Helper()
|
||||
ar := &AlertingRule{alerts: make(map[uint64]*notifier.Alert)}
|
||||
for i, a := range alerts {
|
||||
ar.alerts[uint64(i)] = a
|
||||
}
|
||||
gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay)
|
||||
if gotAlerts == nil && expAlerts == nil {
|
||||
return
|
||||
}
|
||||
if len(gotAlerts) != len(expAlerts) {
|
||||
t.Fatalf("expected to get %d alerts; got %d instead",
|
||||
len(expAlerts), len(gotAlerts))
|
||||
}
|
||||
sort.Slice(expAlerts, func(i, j int) bool {
|
||||
return expAlerts[i].Name < expAlerts[j].Name
|
||||
})
|
||||
sort.Slice(gotAlerts, func(i, j int) bool {
|
||||
return gotAlerts[i].Name < gotAlerts[j].Name
|
||||
})
|
||||
for i, exp := range expAlerts {
|
||||
got := gotAlerts[i]
|
||||
if got.LastSent != exp.LastSent {
|
||||
t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent)
|
||||
}
|
||||
if got.End != exp.End {
|
||||
t.Fatalf("expected End to be %v; got %v", exp.End, got.End)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
f( // send firing alert with custom resolve time
|
||||
[]*notifier.Alert{{State: notifier.StateFiring}},
|
||||
[]*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}},
|
||||
5*time.Minute, time.Minute,
|
||||
)
|
||||
f( // resolve inactive alert at the current timestamp
|
||||
[]*notifier.Alert{{State: notifier.StateInactive, ResolvedAt: ts}},
|
||||
[]*notifier.Alert{{LastSent: ts, End: ts}},
|
||||
time.Minute, time.Minute,
|
||||
)
|
||||
f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting
|
||||
[]*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
|
||||
[]*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}},
|
||||
5*time.Minute, time.Minute,
|
||||
)
|
||||
f( // mixed case of pending and resolved alerts. Names are added for deterministic sorting
|
||||
[]*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
|
||||
[]*notifier.Alert{{Name: "b", LastSent: ts, End: ts}},
|
||||
5*time.Minute, time.Minute,
|
||||
)
|
||||
f( // attempt to send alert that was already sent in the resendDelay interval
|
||||
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
|
||||
nil,
|
||||
time.Minute, time.Minute,
|
||||
)
|
||||
f( // attempt to send alert that was sent out of the resendDelay interval
|
||||
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}},
|
||||
[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
|
||||
time.Minute, time.Minute,
|
||||
)
|
||||
f( // alert must be sent even if resendDelay interval is 0
|
||||
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
|
||||
[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
|
||||
time.Minute, 0,
|
||||
)
|
||||
f( // inactive alert which has been sent already
|
||||
[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts.Add(-2 * time.Second)}},
|
||||
nil,
|
||||
time.Minute, time.Minute,
|
||||
)
|
||||
f( // inactive alert which has been resolved after last send
|
||||
[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts}},
|
||||
[]*notifier.Alert{{LastSent: ts, End: ts}},
|
||||
time.Minute, time.Minute,
|
||||
)
|
||||
}
|
||||
|
||||
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
|
||||
r := newTestAlertingRule(name, 0)
|
||||
r.Labels = make(map[string]string)
|
||||
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
// Group contains list of Rules grouped into
|
||||
@@ -25,7 +26,7 @@ type Group struct {
|
||||
Type datasource.Type `yaml:"type,omitempty"`
|
||||
File string
|
||||
Name string `yaml:"name"`
|
||||
Interval utils.PromDuration `yaml:"interval"`
|
||||
Interval promutils.Duration `yaml:"interval"`
|
||||
Rules []Rule `yaml:"rules"`
|
||||
Concurrency int `yaml:"concurrency"`
|
||||
// ExtraFilterLabels is a list label filters applied to every rule
|
||||
@@ -129,7 +130,7 @@ type Rule struct {
|
||||
Record string `yaml:"record,omitempty"`
|
||||
Alert string `yaml:"alert,omitempty"`
|
||||
Expr string `yaml:"expr"`
|
||||
For utils.PromDuration `yaml:"for"`
|
||||
For promutils.Duration `yaml:"for"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
@@ -260,7 +260,7 @@ func TestGroup_Validate(t *testing.T) {
|
||||
Rules: []Rule{
|
||||
{
|
||||
Expr: "sumSeries(time('foo.bar',10))",
|
||||
For: utils.NewPromDuration(10 * time.Millisecond),
|
||||
For: promutils.NewDuration(10 * time.Millisecond),
|
||||
},
|
||||
{
|
||||
Expr: "sum(up == 0 ) by (host)",
|
||||
@@ -275,7 +275,7 @@ func TestGroup_Validate(t *testing.T) {
|
||||
Rules: []Rule{
|
||||
{
|
||||
Expr: "sum(up == 0 ) by (host)",
|
||||
For: utils.NewPromDuration(10 * time.Millisecond),
|
||||
For: promutils.NewDuration(10 * time.Millisecond),
|
||||
},
|
||||
{
|
||||
Expr: "sumSeries(time('foo.bar',10))",
|
||||
@@ -342,7 +342,7 @@ func TestHashRule(t *testing.T) {
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", For: utils.NewPromDuration(time.Minute)},
|
||||
Rule{Alert: "alert", Expr: "up == 1", For: promutils.NewDuration(time.Minute)},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
|
||||
@@ -25,6 +25,7 @@ groups:
|
||||
dynamic: '{{ $x := query "up" | first | value }}{{ if eq 1.0 $x }}one{{ else }}unknown{{ end }}'
|
||||
annotations:
|
||||
description: Job {{ $labels.job }} is up!
|
||||
external: cluster-{{ $externalLabels.cluster }}; replica-{{ $externalLabels.replica }}
|
||||
summary: All instances up {{ range query "up" }}
|
||||
{{ . | label "instance" }}
|
||||
{{ end }}
|
||||
|
||||
12
app/vmalert/config/testdata/rules_interval_good.rules
vendored
Normal file
12
app/vmalert/config/testdata/rules_interval_good.rules
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
groups:
|
||||
- name: groupTest
|
||||
interval: 1s
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 2s
|
||||
expr: sum(rate(vm_http_request_errors_total[2s])) > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
|
||||
// Querier interface wraps Query and QueryRange methods
|
||||
type Querier interface {
|
||||
Query(ctx context.Context, query string) ([]Metric, error)
|
||||
Query(ctx context.Context, query string, ts time.Time) ([]Metric, error)
|
||||
QueryRange(ctx context.Context, query string, from, to time.Time) ([]Metric, error)
|
||||
}
|
||||
|
||||
|
||||
@@ -13,12 +13,14 @@ import (
|
||||
var (
|
||||
addr = flag.String("datasource.url", "", "VictoriaMetrics or vmselect url. Required parameter. "+
|
||||
"E.g. http://127.0.0.1:8428")
|
||||
appendTypePrefix = flag.Bool("datasource.appendTypePrefix", false, "Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.")
|
||||
appendTypePrefix = flag.Bool("datasource.appendTypePrefix", false, "Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.")
|
||||
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
|
||||
basicAuthPasswordFile = flag.String("datasource.basicAuth.passwordFile", "", "Optional path to basic auth password to use for -datasource.url")
|
||||
bearerToken = flag.String("datasource.bearerToken", "", "Optional bearer auth token to use for -datasource.url.")
|
||||
bearerTokenFile = flag.String("datasource.bearerTokenFile", "", "Optional path to bearer token file to use for -datasource.url.")
|
||||
|
||||
bearerToken = flag.String("datasource.bearerToken", "", "Optional bearer auth token to use for -datasource.url.")
|
||||
bearerTokenFile = flag.String("datasource.bearerTokenFile", "", "Optional path to bearer token file to use for -datasource.url.")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
|
||||
tlsCertFile = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
|
||||
@@ -26,12 +28,22 @@ var (
|
||||
tlsCAFile = flag.String("datasource.tlsCAFile", "", `Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used`)
|
||||
tlsServerName = flag.String("datasource.tlsServerName", "", `Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used`)
|
||||
|
||||
oauth2ClientID = flag.String("datasource.oauth2.clientID", "", "Optional OAuth2 clientID to use for -datasource.url. ")
|
||||
oauth2ClientSecret = flag.String("datasource.oauth2.clientSecret", "", "Optional OAuth2 clientSecret to use for -datasource.url.")
|
||||
oauth2ClientSecretFile = flag.String("datasource.oauth2.clientSecretFile", "", "Optional OAuth2 clientSecretFile to use for -datasource.url. ")
|
||||
oauth2TokenURL = flag.String("datasource.oauth2.tokenUrl", "", "Optional OAuth2 tokenURL to use for -datasource.url.")
|
||||
oauth2Scopes = flag.String("datasource.oauth2.scopes", "", "Optional OAuth2 scopes to use for -datasource.url. Scopes must be delimited by ';'")
|
||||
|
||||
lookBack = flag.Duration("datasource.lookback", 0, `Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.`)
|
||||
queryStep = flag.Duration("datasource.queryStep", 0, "queryStep defines how far a value can fallback to when evaluating queries. "+
|
||||
"For example, if datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query."+
|
||||
"If queryStep isn't specified, rule's evaluationInterval will be used instead.")
|
||||
queryTimeAlignment = flag.Bool("datasource.queryTimeAlignment", true, `Whether to align "time" parameter with evaluation interval.`+
|
||||
"Alignment supposed to produce deterministic results despite of number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
|
||||
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
|
||||
roundDigits = flag.Int("datasource.roundDigits", 0, `Adds "round_digits" GET param to datasource requests. `+
|
||||
disableKeepAlive = flag.Bool("datasource.disableKeepAlive", false, `Whether to disable long-lived connections to the datasource. `+
|
||||
`If true, disables HTTP keep-alives and will only use the connection to the server for a single HTTP request.`)
|
||||
roundDigits = flag.Int("datasource.roundDigits", 0, `Adds "round_digits" GET param to datasource requests. `+
|
||||
`In VM "round_digits" limits the number of digits after the decimal point in response values.`)
|
||||
)
|
||||
|
||||
@@ -52,6 +64,7 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
tr.DisableKeepAlives = *disableKeepAlive
|
||||
tr.MaxIdleConnsPerHost = *maxIdleConnections
|
||||
if tr.MaxIdleConns != 0 && tr.MaxIdleConns < tr.MaxIdleConnsPerHost {
|
||||
tr.MaxIdleConns = tr.MaxIdleConnsPerHost
|
||||
@@ -64,7 +77,10 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
|
||||
extraParams.Set("round_digits", fmt.Sprintf("%d", *roundDigits))
|
||||
}
|
||||
|
||||
authCfg, err := utils.AuthConfig(*basicAuthUsername, *basicAuthPassword, *basicAuthPasswordFile, *bearerToken, *bearerTokenFile)
|
||||
authCfg, err := utils.AuthConfig(
|
||||
utils.WithBasicAuth(*basicAuthUsername, *basicAuthPassword, *basicAuthPasswordFile),
|
||||
utils.WithBearer(*bearerToken, *bearerTokenFile),
|
||||
utils.WithOAuth(*oauth2ClientID, *oauth2ClientSecret, *oauth2ClientSecretFile, *oauth2TokenURL, *oauth2Scopes))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to configure auth: %w", err)
|
||||
}
|
||||
|
||||
@@ -71,13 +71,12 @@ func NewVMStorage(baseURL string, authCfg *promauth.Config, lookBack time.Durati
|
||||
}
|
||||
|
||||
// Query executes the given query and returns parsed response
|
||||
func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
|
||||
func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) ([]Metric, error) {
|
||||
req, err := s.newRequestPOST()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ts := time.Now()
|
||||
switch s.dataSourceType.String() {
|
||||
case "prometheus":
|
||||
s.setPrometheusInstantReqParams(req, query, ts)
|
||||
|
||||
@@ -125,7 +125,7 @@ func (s *VMStorage) setPrometheusInstantReqParams(r *http.Request, query string,
|
||||
if s.lookBack > 0 {
|
||||
timestamp = timestamp.Add(-s.lookBack)
|
||||
}
|
||||
if s.evaluationInterval > 0 {
|
||||
if *queryTimeAlignment && s.evaluationInterval > 0 {
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1232
|
||||
timestamp = timestamp.Truncate(s.evaluationInterval)
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
)
|
||||
|
||||
@@ -88,26 +89,27 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
|
||||
p := NewPrometheusType()
|
||||
pq := s.BuildWithParams(QuerierParams{DataSourceType: &p, EvaluationInterval: 15 * time.Second})
|
||||
ts := time.Now()
|
||||
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
if _, err := pq.Query(ctx, query, ts); err == nil {
|
||||
t.Fatalf("expected connection error got nil")
|
||||
}
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
if _, err := pq.Query(ctx, query, ts); err == nil {
|
||||
t.Fatalf("expected invalid response status error got nil")
|
||||
}
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
if _, err := pq.Query(ctx, query, ts); err == nil {
|
||||
t.Fatalf("expected response body error got nil")
|
||||
}
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
if _, err := pq.Query(ctx, query, ts); err == nil {
|
||||
t.Fatalf("expected error status got nil")
|
||||
}
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
if _, err := pq.Query(ctx, query, ts); err == nil {
|
||||
t.Fatalf("expected unknown status got nil")
|
||||
}
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
if _, err := pq.Query(ctx, query, ts); err == nil {
|
||||
t.Fatalf("expected non-vector resultType error got nil")
|
||||
}
|
||||
m, err := pq.Query(ctx, query)
|
||||
m, err := pq.Query(ctx, query, ts)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -133,7 +135,7 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
g := NewGraphiteType()
|
||||
gq := s.BuildWithParams(QuerierParams{DataSourceType: &g})
|
||||
|
||||
m, err = gq.Query(ctx, queryRender)
|
||||
m, err = gq.Query(ctx, queryRender, ts)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -517,6 +519,59 @@ func TestRequestParams(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthConfig(t *testing.T) {
|
||||
var testCases = []struct {
|
||||
name string
|
||||
vmFn func() *VMStorage
|
||||
checkFn func(t *testing.T, r *http.Request)
|
||||
}{
|
||||
{
|
||||
name: "basic auth",
|
||||
vmFn: func() *VMStorage {
|
||||
cfg, err := utils.AuthConfig(utils.WithBasicAuth("foo", "bar", ""))
|
||||
if err != nil {
|
||||
t.Errorf("Error get auth config: %s", err)
|
||||
}
|
||||
return &VMStorage{authCfg: cfg}
|
||||
},
|
||||
checkFn: func(t *testing.T, r *http.Request) {
|
||||
u, p, _ := r.BasicAuth()
|
||||
checkEqualString(t, "foo", u)
|
||||
checkEqualString(t, "bar", p)
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "bearer auth",
|
||||
vmFn: func() *VMStorage {
|
||||
cfg, err := utils.AuthConfig(utils.WithBearer("foo", ""))
|
||||
if err != nil {
|
||||
t.Errorf("Error get auth config: %s", err)
|
||||
}
|
||||
return &VMStorage{authCfg: cfg}
|
||||
},
|
||||
checkFn: func(t *testing.T, r *http.Request) {
|
||||
reqToken := r.Header.Get("Authorization")
|
||||
splitToken := strings.Split(reqToken, "Bearer ")
|
||||
if len(splitToken) != 2 {
|
||||
t.Errorf("expected two items got %d", len(splitToken))
|
||||
}
|
||||
token := splitToken[1]
|
||||
checkEqualString(t, "foo", token)
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tt := range testCases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
vm := tt.vmFn()
|
||||
req, err := vm.newRequestPOST()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
tt.checkFn(t, req)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func checkEqualString(t *testing.T, exp, got string) {
|
||||
t.Helper()
|
||||
if got != exp {
|
||||
|
||||
@@ -5,6 +5,8 @@ import (
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -13,20 +15,23 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
type Group struct {
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Type datasource.Type
|
||||
Interval time.Duration
|
||||
Concurrency int
|
||||
Checksum string
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Type datasource.Type
|
||||
Interval time.Duration
|
||||
Concurrency int
|
||||
Checksum string
|
||||
LastEvaluation time.Time
|
||||
|
||||
Labels map[string]string
|
||||
Params url.Values
|
||||
@@ -41,15 +46,17 @@ type Group struct {
|
||||
}
|
||||
|
||||
type groupMetrics struct {
|
||||
iterationTotal *counter
|
||||
iterationDuration *summary
|
||||
iterationTotal *utils.Counter
|
||||
iterationDuration *utils.Summary
|
||||
iterationMissed *utils.Counter
|
||||
}
|
||||
|
||||
func newGroupMetrics(name, file string) *groupMetrics {
|
||||
m := &groupMetrics{}
|
||||
labels := fmt.Sprintf(`group=%q, file=%q`, name, file)
|
||||
m.iterationTotal = getOrCreateCounter(fmt.Sprintf(`vmalert_iteration_total{%s}`, labels))
|
||||
m.iterationDuration = getOrCreateSummary(fmt.Sprintf(`vmalert_iteration_duration_seconds{%s}`, labels))
|
||||
m.iterationTotal = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_iteration_total{%s}`, labels))
|
||||
m.iterationDuration = utils.GetOrCreateSummary(fmt.Sprintf(`vmalert_iteration_duration_seconds{%s}`, labels))
|
||||
m.iterationMissed = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_iteration_missed_total{%s}`, labels))
|
||||
return m
|
||||
}
|
||||
|
||||
@@ -122,7 +129,7 @@ func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
|
||||
}
|
||||
|
||||
// ID return unique group ID that consists of
|
||||
// rules file and group name
|
||||
// rules file and group Name
|
||||
func (g *Group) ID() uint64 {
|
||||
g.mu.RLock()
|
||||
defer g.mu.RUnlock()
|
||||
@@ -213,8 +220,8 @@ func (g *Group) close() {
|
||||
close(g.doneCh)
|
||||
<-g.finishedCh
|
||||
|
||||
metrics.UnregisterMetric(g.metrics.iterationDuration.name)
|
||||
metrics.UnregisterMetric(g.metrics.iterationTotal.name)
|
||||
g.metrics.iterationDuration.Unregister()
|
||||
g.metrics.iterationTotal.Unregister()
|
||||
for _, rule := range g.Rules {
|
||||
rule.Close()
|
||||
}
|
||||
@@ -222,9 +229,16 @@ func (g *Group) close() {
|
||||
|
||||
var skipRandSleepOnGroupStart bool
|
||||
|
||||
func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewrite.Client) {
|
||||
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
|
||||
e := &executor{
|
||||
rw: rw,
|
||||
notifiers: nts,
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label)}
|
||||
|
||||
evalTS := time.Now()
|
||||
|
||||
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
||||
if !skipRandSleepOnGroupStart {
|
||||
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
|
||||
@@ -246,16 +260,31 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
|
||||
}
|
||||
|
||||
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
e := &executor{rw: rw}
|
||||
for _, nt := range nts {
|
||||
ent := eNotifier{
|
||||
Notifier: nt,
|
||||
alertsSent: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", nt.Addr())),
|
||||
alertsSendErrors: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", nt.Addr())),
|
||||
|
||||
eval := func(ts time.Time) {
|
||||
g.metrics.iterationTotal.Inc()
|
||||
|
||||
start := time.Now()
|
||||
|
||||
if len(g.Rules) < 1 {
|
||||
g.metrics.iterationDuration.UpdateDuration(start)
|
||||
g.LastEvaluation = start
|
||||
return
|
||||
}
|
||||
e.notifiers = append(e.notifiers, ent)
|
||||
|
||||
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
|
||||
errs := e.execConcurrently(ctx, g.Rules, ts, g.Concurrency, resolveDuration)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: %s", g.Name, err)
|
||||
}
|
||||
}
|
||||
g.metrics.iterationDuration.UpdateDuration(start)
|
||||
g.LastEvaluation = start
|
||||
}
|
||||
|
||||
eval(evalTS)
|
||||
|
||||
t := time.NewTicker(g.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
@@ -282,50 +311,48 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
|
||||
g.mu.Unlock()
|
||||
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
case <-t.C:
|
||||
g.metrics.iterationTotal.Inc()
|
||||
iterationStart := time.Now()
|
||||
if len(g.Rules) > 0 {
|
||||
resolveDuration := getResolveDuration(g.Interval)
|
||||
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, resolveDuration)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: %s", g.Name, err)
|
||||
}
|
||||
}
|
||||
missed := (time.Since(evalTS) / g.Interval) - 1
|
||||
if missed > 0 {
|
||||
g.metrics.iterationMissed.Inc()
|
||||
}
|
||||
g.metrics.iterationDuration.UpdateDuration(iterationStart)
|
||||
evalTS = evalTS.Add((missed + 1) * g.Interval)
|
||||
|
||||
eval(evalTS)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// resolveDuration for alerts is equal to 3 interval evaluations
|
||||
// so in case if vmalert stops sending updates for some reason,
|
||||
// notifier could automatically resolve the alert.
|
||||
func getResolveDuration(groupInterval time.Duration) time.Duration {
|
||||
resolveInterval := groupInterval * 3
|
||||
if *maxResolveDuration > 0 && (resolveInterval > *maxResolveDuration) {
|
||||
return *maxResolveDuration
|
||||
// getResolveDuration returns the duration after which firing alert
|
||||
// can be considered as resolved.
|
||||
func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Duration {
|
||||
if groupInterval > delta {
|
||||
delta = groupInterval
|
||||
}
|
||||
return resolveInterval
|
||||
resolveDuration := delta * 4
|
||||
if maxDuration > 0 && resolveDuration > maxDuration {
|
||||
resolveDuration = maxDuration
|
||||
}
|
||||
return resolveDuration
|
||||
}
|
||||
|
||||
type executor struct {
|
||||
notifiers []eNotifier
|
||||
notifiers func() []notifier.Notifier
|
||||
rw *remotewrite.Client
|
||||
|
||||
previouslySentSeriesToRWMu sync.Mutex
|
||||
// previouslySentSeriesToRW stores series sent to RW on previous iteration
|
||||
// map[ruleID]map[ruleLabels][]prompb.Label
|
||||
// where `ruleID` is ID of the Rule within a Group
|
||||
// and `ruleLabels` is []prompb.Label marshalled to a string
|
||||
previouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
|
||||
}
|
||||
|
||||
type eNotifier struct {
|
||||
notifier.Notifier
|
||||
alertsSent *counter
|
||||
alertsSendErrors *counter
|
||||
}
|
||||
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, resolveDuration time.Duration) chan error {
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.Time, concurrency int, resolveDuration time.Duration) chan error {
|
||||
res := make(chan error, len(rules))
|
||||
if concurrency == 1 {
|
||||
// fast path
|
||||
for _, rule := range rules {
|
||||
res <- e.exec(ctx, rule, resolveDuration)
|
||||
res <- e.exec(ctx, rule, ts, resolveDuration)
|
||||
}
|
||||
close(res)
|
||||
return res
|
||||
@@ -338,7 +365,7 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
|
||||
sem <- struct{}{}
|
||||
wg.Add(1)
|
||||
go func(r Rule) {
|
||||
res <- e.exec(ctx, r, resolveDuration)
|
||||
res <- e.exec(ctx, r, ts, resolveDuration)
|
||||
<-sem
|
||||
wg.Done()
|
||||
}(rule)
|
||||
@@ -356,54 +383,95 @@ var (
|
||||
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
||||
|
||||
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||
remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
|
||||
)
|
||||
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, resolveDuration time.Duration) error {
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDuration time.Duration) error {
|
||||
execTotal.Inc()
|
||||
|
||||
tss, err := rule.Exec(ctx)
|
||||
tss, err := rule.Exec(ctx, ts)
|
||||
if err != nil {
|
||||
execErrors.Inc()
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
|
||||
}
|
||||
|
||||
if len(tss) > 0 && e.rw != nil {
|
||||
for _, ts := range tss {
|
||||
if err := e.rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
return fmt.Errorf("rule %q: remote write failure: %w", rule, err)
|
||||
errGr := new(utils.ErrGroup)
|
||||
if e.rw != nil {
|
||||
pushToRW := func(tss []prompbmarshal.TimeSeries) {
|
||||
for _, ts := range tss {
|
||||
remoteWriteTotal.Inc()
|
||||
if err := e.rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
errGr.Add(fmt.Errorf("rule %q: remote write failure: %w", rule, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
pushToRW(tss)
|
||||
staleSeries := e.getStaleSeries(rule, tss, ts)
|
||||
pushToRW(staleSeries)
|
||||
}
|
||||
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
var alerts []notifier.Alert
|
||||
for _, a := range ar.alerts {
|
||||
switch a.State {
|
||||
case notifier.StateFiring:
|
||||
a.End = time.Now().Add(resolveDuration)
|
||||
alerts = append(alerts, *a)
|
||||
case notifier.StateInactive:
|
||||
// set End to execStart to notify
|
||||
// that it was just resolved
|
||||
a.End = time.Now()
|
||||
alerts = append(alerts, *a)
|
||||
}
|
||||
}
|
||||
|
||||
alerts := ar.alertsToSend(ts, resolveDuration, *resendDelay)
|
||||
if len(alerts) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
errGr := new(utils.ErrGroup)
|
||||
for _, nt := range e.notifiers {
|
||||
nt.alertsSent.Add(len(alerts))
|
||||
for _, nt := range e.notifiers() {
|
||||
if err := nt.Send(ctx, alerts); err != nil {
|
||||
nt.alertsSendErrors.Inc()
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", rule, nt.Addr(), err))
|
||||
}
|
||||
}
|
||||
return errGr.Err()
|
||||
}
|
||||
|
||||
// getStaledSeries checks whether there are stale series from previously sent ones.
|
||||
func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
ruleLabels := make(map[string][]prompbmarshal.Label, len(tss))
|
||||
for _, ts := range tss {
|
||||
// convert labels to strings so we can compare with previously sent series
|
||||
key := labelsToString(ts.Labels)
|
||||
ruleLabels[key] = ts.Labels
|
||||
}
|
||||
|
||||
rID := rule.ID()
|
||||
var staleS []prompbmarshal.TimeSeries
|
||||
// check whether there are series which disappeared and need to be marked as stale
|
||||
e.previouslySentSeriesToRWMu.Lock()
|
||||
for key, labels := range e.previouslySentSeriesToRW[rID] {
|
||||
if _, ok := ruleLabels[key]; ok {
|
||||
continue
|
||||
}
|
||||
// previously sent series are missing in current series, so we mark them as stale
|
||||
ss := newTimeSeriesPB([]float64{decimal.StaleNaN}, []int64{timestamp.Unix()}, labels)
|
||||
staleS = append(staleS, ss)
|
||||
}
|
||||
// set previous series to current
|
||||
e.previouslySentSeriesToRW[rID] = ruleLabels
|
||||
e.previouslySentSeriesToRWMu.Unlock()
|
||||
|
||||
return staleS
|
||||
}
|
||||
|
||||
func labelsToString(labels []prompbmarshal.Label) string {
|
||||
var b strings.Builder
|
||||
b.WriteRune('{')
|
||||
for i, label := range labels {
|
||||
if len(label.Name) == 0 {
|
||||
b.WriteString("__name__")
|
||||
} else {
|
||||
b.WriteString(label.Name)
|
||||
}
|
||||
b.WriteRune('=')
|
||||
b.WriteString(strconv.Quote(label.Value))
|
||||
if i < len(labels)-1 {
|
||||
b.WriteRune(',')
|
||||
}
|
||||
}
|
||||
b.WriteRune('}')
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -3,13 +3,16 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
func init() {
|
||||
@@ -34,7 +37,7 @@ func TestUpdateWith(t *testing.T) {
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 0",
|
||||
For: utils.NewPromDuration(time.Second),
|
||||
For: promutils.NewDuration(time.Second),
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
@@ -46,7 +49,7 @@ func TestUpdateWith(t *testing.T) {
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 10",
|
||||
For: utils.NewPromDuration(time.Second),
|
||||
For: promutils.NewDuration(time.Second),
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
@@ -158,10 +161,11 @@ func TestGroupStart(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
const evalInterval = time.Millisecond
|
||||
|
||||
fs := &fakeQuerier{}
|
||||
fn := &fakeNotifier{}
|
||||
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
|
||||
g.Concurrency = 2
|
||||
|
||||
@@ -170,7 +174,7 @@ func TestGroupStart(t *testing.T) {
|
||||
m2 := metricWithLabels(t, "instance", inst2, "job", job)
|
||||
|
||||
r := g.Rules[0].(*AlertingRule)
|
||||
alert1, err := r.newAlert(m1, time.Now(), nil)
|
||||
alert1, err := r.newAlert(m1, nil, time.Now(), nil)
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
@@ -183,13 +187,9 @@ func TestGroupStart(t *testing.T) {
|
||||
// add service labels
|
||||
alert1.Labels[alertNameLabel] = alert1.Name
|
||||
alert1.Labels[alertGroupNameLabel] = g.Name
|
||||
var labels1 []string
|
||||
for k, v := range alert1.Labels {
|
||||
labels1 = append(labels1, k, v)
|
||||
}
|
||||
alert1.ID = hash(metricWithLabels(t, labels1...))
|
||||
alert1.ID = hash(alert1.Labels)
|
||||
|
||||
alert2, err := r.newAlert(m2, time.Now(), nil)
|
||||
alert2, err := r.newAlert(m2, nil, time.Now(), nil)
|
||||
if err != nil {
|
||||
t.Fatalf("faield to create alert: %s", err)
|
||||
}
|
||||
@@ -202,17 +202,13 @@ func TestGroupStart(t *testing.T) {
|
||||
// add service labels
|
||||
alert2.Labels[alertNameLabel] = alert2.Name
|
||||
alert2.Labels[alertGroupNameLabel] = g.Name
|
||||
var labels2 []string
|
||||
for k, v := range alert2.Labels {
|
||||
labels2 = append(labels2, k, v)
|
||||
}
|
||||
alert2.ID = hash(metricWithLabels(t, labels2...))
|
||||
alert2.ID = hash(alert2.Labels)
|
||||
|
||||
finished := make(chan struct{})
|
||||
fs.add(m1)
|
||||
fs.add(m2)
|
||||
go func() {
|
||||
g.start(context.Background(), []notifier.Notifier{fn}, nil)
|
||||
g.start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
@@ -223,6 +219,12 @@ func TestGroupStart(t *testing.T) {
|
||||
expectedAlerts := []notifier.Alert{*alert1, *alert2}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
gotAlertsNum := fn.getCounter()
|
||||
if gotAlertsNum < len(expectedAlerts)*2 {
|
||||
t.Fatalf("expected to receive at least %d alerts; got %d instead",
|
||||
len(expectedAlerts)*2, gotAlertsNum)
|
||||
}
|
||||
|
||||
// reset previous data
|
||||
fs.reset()
|
||||
// and set only one datapoint for response
|
||||
@@ -232,7 +234,8 @@ func TestGroupStart(t *testing.T) {
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts = fn.getAlerts()
|
||||
expectedAlerts = []notifier.Alert{*alert1}
|
||||
alert2.State = notifier.StateInactive
|
||||
expectedAlerts = []notifier.Alert{*alert1, *alert2}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
g.close()
|
||||
@@ -243,22 +246,112 @@ func TestResolveDuration(t *testing.T) {
|
||||
testCases := []struct {
|
||||
groupInterval time.Duration
|
||||
maxDuration time.Duration
|
||||
resendDelay time.Duration
|
||||
expected time.Duration
|
||||
}{
|
||||
{time.Minute, 0, 3 * time.Minute},
|
||||
{3 * time.Minute, 0, 9 * time.Minute},
|
||||
{time.Minute, 2 * time.Minute, 2 * time.Minute},
|
||||
{0, 0, 0},
|
||||
{time.Minute, 0, 0, 4 * time.Minute},
|
||||
{time.Minute, 0, 2 * time.Minute, 8 * time.Minute},
|
||||
{time.Minute, 4 * time.Minute, 4 * time.Minute, 4 * time.Minute},
|
||||
{2 * time.Minute, time.Minute, 2 * time.Minute, time.Minute},
|
||||
{time.Minute, 2 * time.Minute, 1 * time.Minute, 2 * time.Minute},
|
||||
{2 * time.Minute, 0, 1 * time.Minute, 8 * time.Minute},
|
||||
{0, 0, 0, 0},
|
||||
}
|
||||
defaultResolveDuration := *maxResolveDuration
|
||||
defer func() { *maxResolveDuration = defaultResolveDuration }()
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(fmt.Sprintf("%v-%v-%v", tc.groupInterval, tc.expected, tc.maxDuration), func(t *testing.T) {
|
||||
*maxResolveDuration = tc.maxDuration
|
||||
got := getResolveDuration(tc.groupInterval)
|
||||
got := getResolveDuration(tc.groupInterval, tc.resendDelay, tc.maxDuration)
|
||||
if got != tc.expected {
|
||||
t.Errorf("expected to have %v; got %v", tc.expected, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetStaleSeries(t *testing.T) {
|
||||
ts := time.Now()
|
||||
e := &executor{
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
f := func(rule Rule, labels, expLabels [][]prompbmarshal.Label) {
|
||||
t.Helper()
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, l := range labels {
|
||||
tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l))
|
||||
}
|
||||
staleS := e.getStaleSeries(rule, tss, ts)
|
||||
if staleS == nil && expLabels == nil {
|
||||
return
|
||||
}
|
||||
if len(staleS) != len(expLabels) {
|
||||
t.Fatalf("expected to get %d stale series, got %d",
|
||||
len(expLabels), len(staleS))
|
||||
}
|
||||
for i, exp := range expLabels {
|
||||
got := staleS[i]
|
||||
if !reflect.DeepEqual(exp, got.Labels) {
|
||||
t.Fatalf("expected to get labels: \n%v;\ngot instead: \n%v",
|
||||
exp, got.Labels)
|
||||
}
|
||||
if len(got.Samples) != 1 {
|
||||
t.Fatalf("expected to have 1 sample; got %d", len(got.Samples))
|
||||
}
|
||||
if !decimal.IsStaleNaN(got.Samples[0].Value) {
|
||||
t.Fatalf("expected sample value to be %v; got %v", decimal.StaleNaN, got.Samples[0].Value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// warn: keep in mind, that executor holds the state, so sequence of f calls matters
|
||||
|
||||
// single series
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "foo")},
|
||||
nil)
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "foo")},
|
||||
nil)
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
nil,
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "foo")})
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
nil,
|
||||
nil)
|
||||
|
||||
// multiple series
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
[][]prompbmarshal.Label{
|
||||
toPromLabels(t, "__name__", "job:foo", "job", "foo"),
|
||||
toPromLabels(t, "__name__", "job:foo", "job", "bar"),
|
||||
},
|
||||
nil)
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "bar")},
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "foo")})
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "bar")},
|
||||
nil)
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
nil,
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "bar")})
|
||||
|
||||
// multiple rules and series
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
[][]prompbmarshal.Label{
|
||||
toPromLabels(t, "__name__", "job:foo", "job", "foo"),
|
||||
toPromLabels(t, "__name__", "job:foo", "job", "bar"),
|
||||
},
|
||||
nil)
|
||||
f(&AlertingRule{RuleID: 2},
|
||||
[][]prompbmarshal.Label{
|
||||
toPromLabels(t, "__name__", "job:foo", "job", "foo"),
|
||||
toPromLabels(t, "__name__", "job:foo", "job", "bar"),
|
||||
},
|
||||
nil)
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "bar")},
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "foo")})
|
||||
f(&AlertingRule{RuleID: 1},
|
||||
[][]prompbmarshal.Label{toPromLabels(t, "__name__", "job:foo", "job", "bar")},
|
||||
nil)
|
||||
}
|
||||
|
||||
@@ -44,10 +44,10 @@ func (fq *fakeQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Qu
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) ([]datasource.Metric, error) {
|
||||
return fq.Query(ctx, q)
|
||||
return fq.Query(ctx, q, time.Now())
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ time.Time) ([]datasource.Metric, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
@@ -61,16 +61,26 @@ func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric,
|
||||
type fakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []notifier.Alert
|
||||
// records number of received alerts in total
|
||||
counter int
|
||||
}
|
||||
|
||||
func (*fakeNotifier) Close() {}
|
||||
func (*fakeNotifier) Addr() string { return "" }
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.counter += len(alerts)
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getCounter() int {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.counter
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
@@ -106,6 +116,21 @@ func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
return m
|
||||
}
|
||||
|
||||
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
var ls []prompbmarshal.Label
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
ls = append(ls, prompbmarshal.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return ls
|
||||
}
|
||||
|
||||
func compareGroups(t *testing.T, a, b *Group) {
|
||||
t.Helper()
|
||||
if a.Name != b.Name {
|
||||
|
||||
@@ -35,7 +35,10 @@ absolute path to all .yaml files in root.
|
||||
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.`)
|
||||
|
||||
rulesCheckInterval = flag.Duration("rule.configCheckInterval", 0, "Interval for checking for changes in '-rule' files. "+
|
||||
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes")
|
||||
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead")
|
||||
|
||||
configCheckInterval = flag.Duration("configCheckInterval", 0, "Interval for checking for changes in '-rule' or '-notifier.config' files. "+
|
||||
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes.")
|
||||
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
|
||||
@@ -44,17 +47,19 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor
|
||||
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
||||
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
|
||||
"which is by default equal to 3 evaluation intervals of the parent group.")
|
||||
resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
|
||||
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
|
||||
externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'name=value' to add to all generated recording rules and alerts. "+
|
||||
externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+
|
||||
"Pass multiple -label flags in order to add multiple label sets.")
|
||||
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup.")
|
||||
|
||||
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's name as label to generated alerts and time series.")
|
||||
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
|
||||
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The `-rule` flag must be specified.")
|
||||
)
|
||||
@@ -97,6 +102,9 @@ func main() {
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to init remoteWrite: %s", err)
|
||||
}
|
||||
if rw == nil {
|
||||
logger.Fatalf("remoteWrite.url can't be empty in replay mode")
|
||||
}
|
||||
notifier.InitTemplateFunc(eu)
|
||||
groupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions)
|
||||
if err != nil {
|
||||
@@ -161,7 +169,20 @@ func newManager(ctx context.Context) (*manager, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init datasource: %w", err)
|
||||
}
|
||||
nts, err := notifier.Init(alertURLGeneratorFn)
|
||||
|
||||
labels := make(map[string]string, 0)
|
||||
for _, s := range *externalLabels {
|
||||
if len(s) == 0 {
|
||||
continue
|
||||
}
|
||||
n := strings.IndexByte(s, '=')
|
||||
if n < 0 {
|
||||
return nil, fmt.Errorf("missing '=' in `-label`. It must contain label in the form `Name=value`; got %q", s)
|
||||
}
|
||||
labels[s[:n]] = s[n+1:]
|
||||
}
|
||||
|
||||
nts, err := notifier.Init(alertURLGeneratorFn, labels, *externalURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init notifier: %w", err)
|
||||
}
|
||||
@@ -169,7 +190,7 @@ func newManager(ctx context.Context) (*manager, error) {
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: q,
|
||||
notifiers: nts,
|
||||
labels: map[string]string{},
|
||||
labels: labels,
|
||||
}
|
||||
rw, err := remotewrite.Init(ctx)
|
||||
if err != nil {
|
||||
@@ -183,16 +204,6 @@ func newManager(ctx context.Context) (*manager, error) {
|
||||
}
|
||||
manager.rr = rr
|
||||
|
||||
for _, s := range *externalLabels {
|
||||
if len(s) == 0 {
|
||||
continue
|
||||
}
|
||||
n := strings.IndexByte(s, '=')
|
||||
if n < 0 {
|
||||
return nil, fmt.Errorf("missing '=' in `-label`. It must contain label in the form `name=value`; got %q", s)
|
||||
}
|
||||
manager.labels[s[:n]] = s[n+1:]
|
||||
}
|
||||
return manager, nil
|
||||
}
|
||||
|
||||
@@ -232,7 +243,7 @@ func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, vali
|
||||
"tpl": externalAlertSource,
|
||||
}
|
||||
return func(alert notifier.Alert) string {
|
||||
templated, err := alert.ExecTemplate(nil, m)
|
||||
templated, err := alert.ExecTemplate(nil, nil, m)
|
||||
if err != nil {
|
||||
logger.Errorf("can not exec source template %s", err)
|
||||
}
|
||||
@@ -251,8 +262,13 @@ See the docs at https://docs.victoriametrics.com/vmalert.html .
|
||||
|
||||
func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sighupCh <-chan os.Signal) {
|
||||
var configCheckCh <-chan time.Time
|
||||
if *rulesCheckInterval > 0 {
|
||||
ticker := time.NewTicker(*rulesCheckInterval)
|
||||
checkInterval := *configCheckInterval
|
||||
if checkInterval == 0 && *rulesCheckInterval > 0 {
|
||||
logger.Warnf("flag `rule.configCheckInterval` is deprecated - use `configCheckInterval` instead")
|
||||
checkInterval = *rulesCheckInterval
|
||||
}
|
||||
if checkInterval > 0 {
|
||||
ticker := time.NewTicker(checkInterval)
|
||||
configCheckCh = ticker.C
|
||||
defer ticker.Stop()
|
||||
}
|
||||
@@ -269,6 +285,12 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
||||
configReloads.Inc()
|
||||
case <-configCheckCh:
|
||||
}
|
||||
if err := notifier.Reload(); err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("failed to reload notifier config: %s", err)
|
||||
continue
|
||||
}
|
||||
newGroupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions)
|
||||
if err != nil {
|
||||
configReloadErrors.Inc()
|
||||
|
||||
@@ -100,7 +100,7 @@ groups:
|
||||
querierBuilder: &fakeQuerier{},
|
||||
groups: make(map[uint64]*Group),
|
||||
labels: map[string]string{},
|
||||
notifiers: []notifier.Notifier{&fakeNotifier{}},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
rw: &remotewrite.Client{},
|
||||
}
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ import (
|
||||
// manager controls group states
|
||||
type manager struct {
|
||||
querierBuilder datasource.QuerierBuilder
|
||||
notifiers []notifier.Notifier
|
||||
notifiers func() []notifier.Notifier
|
||||
|
||||
rw *remotewrite.Client
|
||||
// remote read builder.
|
||||
@@ -37,7 +37,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
|
||||
g, ok := m.groups[gID]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("can't find group with id %q", gID)
|
||||
return nil, fmt.Errorf("can't find group with id %d", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
@@ -48,7 +48,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
return apiAlert, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
|
||||
return nil, fmt.Errorf("can't find alert with id %d in group %q", aID, g.Name)
|
||||
}
|
||||
|
||||
func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {
|
||||
@@ -109,7 +109,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
||||
return fmt.Errorf("config contains recording rules but `-remoteWrite.url` isn't set")
|
||||
}
|
||||
if arPresent && m.notifiers == nil {
|
||||
return fmt.Errorf("config contains alerting rules but `-notifier.url` isn't set")
|
||||
return fmt.Errorf("config contains alerting rules but neither `-notifier.url` nor `-notifier.config` aren't set")
|
||||
}
|
||||
|
||||
type updateItem struct {
|
||||
@@ -163,21 +163,17 @@ func (g *Group) toAPI() APIGroup {
|
||||
// encode as string to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
|
||||
Name: g.Name,
|
||||
Type: g.Type.String(),
|
||||
File: g.File,
|
||||
Interval: g.Interval.String(),
|
||||
Concurrency: g.Concurrency,
|
||||
Params: urlValuesToStrings(g.Params),
|
||||
Labels: g.Labels,
|
||||
Name: g.Name,
|
||||
Type: g.Type.String(),
|
||||
File: g.File,
|
||||
Interval: g.Interval.Seconds(),
|
||||
LastEvaluation: g.LastEvaluation,
|
||||
Concurrency: g.Concurrency,
|
||||
Params: urlValuesToStrings(g.Params),
|
||||
Labels: g.Labels,
|
||||
}
|
||||
for _, r := range g.Rules {
|
||||
switch v := r.(type) {
|
||||
case *AlertingRule:
|
||||
ag.AlertingRules = append(ag.AlertingRules, v.RuleAPI())
|
||||
case *RecordingRule:
|
||||
ag.RecordingRules = append(ag.RecordingRules, v.RuleAPI())
|
||||
}
|
||||
ag.Rules = append(ag.Rules, r.ToAPI())
|
||||
}
|
||||
return ag
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
notifiers: []notifier.Notifier{&fakeNotifier{}},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
}
|
||||
paths := []string{
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
@@ -223,7 +223,7 @@ func TestManagerUpdate(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
notifiers: []notifier.Notifier{&fakeNotifier{}},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
}
|
||||
|
||||
cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
|
||||
@@ -311,9 +311,11 @@ func TestManagerUpdateNegative(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
notifiers: tc.notifiers,
|
||||
rw: tc.rw,
|
||||
}
|
||||
if tc.notifiers != nil {
|
||||
m.notifiers = func() []notifier.Notifier { return tc.notifiers }
|
||||
}
|
||||
err := m.update(context.Background(), []config.Group{tc.cfg}, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get error; got nil")
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
package main
|
||||
|
||||
import "github.com/VictoriaMetrics/metrics"
|
||||
|
||||
type gauge struct {
|
||||
name string
|
||||
*metrics.Gauge
|
||||
}
|
||||
|
||||
func getOrCreateGauge(name string, f func() float64) *gauge {
|
||||
return &gauge{
|
||||
name: name,
|
||||
Gauge: metrics.GetOrCreateGauge(name, f),
|
||||
}
|
||||
}
|
||||
|
||||
type counter struct {
|
||||
name string
|
||||
*metrics.Counter
|
||||
}
|
||||
|
||||
func getOrCreateCounter(name string) *counter {
|
||||
return &counter{
|
||||
name: name,
|
||||
Counter: metrics.GetOrCreateCounter(name),
|
||||
}
|
||||
}
|
||||
|
||||
type summary struct {
|
||||
name string
|
||||
*metrics.Summary
|
||||
}
|
||||
|
||||
func getOrCreateSummary(name string) *summary {
|
||||
return &summary{
|
||||
name: name,
|
||||
Summary: metrics.GetOrCreateSummary(name),
|
||||
}
|
||||
}
|
||||
@@ -26,10 +26,16 @@ type Alert struct {
|
||||
State AlertState
|
||||
// Expr contains expression that was executed to generate the Alert
|
||||
Expr string
|
||||
// Start defines the moment of time when Alert has triggered
|
||||
// ActiveAt defines the moment of time when Alert has become active
|
||||
ActiveAt time.Time
|
||||
// Start defines the moment of time when Alert has become firing
|
||||
Start time.Time
|
||||
// End defines the moment of time when Alert supposed to expire
|
||||
End time.Time
|
||||
// ResolvedAt defines the moment when Alert was switched from Firing to Inactive
|
||||
ResolvedAt time.Time
|
||||
// LastSent defines the moment when Alert was sent last time
|
||||
LastSent time.Time
|
||||
// Value stores the value returned from evaluating expression from Expr field
|
||||
Value float64
|
||||
// ID is the unique identifer for the Alert
|
||||
@@ -70,14 +76,20 @@ type AlertTplData struct {
|
||||
Expr string
|
||||
}
|
||||
|
||||
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}{{ $expr := .Expr }}`
|
||||
var tplHeaders = []string{
|
||||
"{{ $value := .Value }}",
|
||||
"{{ $labels := .Labels }}",
|
||||
"{{ $expr := .Expr }}",
|
||||
"{{ $externalLabels := .ExternalLabels }}",
|
||||
"{{ $externalURL := .ExternalURL }}",
|
||||
}
|
||||
|
||||
// ExecTemplate executes the Alert template for given
|
||||
// map of annotations.
|
||||
// Every alert could have a different datasource, so function
|
||||
// requires a queryFunction as an argument.
|
||||
func (a *Alert) ExecTemplate(q QueryFn, annotations map[string]string) (map[string]string, error) {
|
||||
tplData := AlertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
|
||||
func (a *Alert) ExecTemplate(q QueryFn, labels, annotations map[string]string) (map[string]string, error) {
|
||||
tplData := AlertTplData{Value: a.Value, Labels: labels, Expr: a.Expr}
|
||||
return templateAnnotations(annotations, tplData, funcsWithQuery(q))
|
||||
}
|
||||
|
||||
@@ -100,13 +112,15 @@ func templateAnnotations(annotations map[string]string, data AlertTplData, funcs
|
||||
var buf bytes.Buffer
|
||||
eg := new(utils.ErrGroup)
|
||||
r := make(map[string]string, len(annotations))
|
||||
tData := tplData{data, externalLabels, externalURL}
|
||||
header := strings.Join(tplHeaders, "")
|
||||
for key, text := range annotations {
|
||||
buf.Reset()
|
||||
builder.Reset()
|
||||
builder.Grow(len(tplHeader) + len(text))
|
||||
builder.WriteString(tplHeader)
|
||||
builder.Grow(len(header) + len(text))
|
||||
builder.WriteString(header)
|
||||
builder.WriteString(text)
|
||||
if err := templateAnnotation(&buf, builder.String(), data, funcs); err != nil {
|
||||
if err := templateAnnotation(&buf, builder.String(), tData, funcs); err != nil {
|
||||
r[key] = text
|
||||
eg.Add(fmt.Errorf("key %q, template %q: %w", key, text, err))
|
||||
continue
|
||||
@@ -116,7 +130,13 @@ func templateAnnotations(annotations map[string]string, data AlertTplData, funcs
|
||||
return r, eg.Err()
|
||||
}
|
||||
|
||||
func templateAnnotation(dst io.Writer, text string, data AlertTplData, funcs template.FuncMap) error {
|
||||
type tplData struct {
|
||||
AlertTplData
|
||||
ExternalLabels map[string]string
|
||||
ExternalURL string
|
||||
}
|
||||
|
||||
func templateAnnotation(dst io.Writer, text string, data tplData, funcs template.FuncMap) error {
|
||||
t := template.New("").Funcs(funcs).Option("missingkey=zero")
|
||||
tpl, err := t.Parse(text)
|
||||
if err != nil {
|
||||
|
||||
@@ -1,12 +1,24 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
)
|
||||
|
||||
func TestAlert_ExecTemplate(t *testing.T) {
|
||||
extLabels := make(map[string]string, 0)
|
||||
const (
|
||||
extCluster = "prod"
|
||||
extDC = "east"
|
||||
extURL = "https://foo.bar"
|
||||
)
|
||||
extLabels["cluster"] = extCluster
|
||||
extLabels["dc"] = extDC
|
||||
_, err := Init(nil, extLabels, extURL)
|
||||
checkErr(t, err)
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
alert *Alert
|
||||
@@ -74,6 +86,26 @@ func TestAlert_ExecTemplate(t *testing.T) {
|
||||
"desc": "bar 1;garply 2;",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "external",
|
||||
alert: &Alert{
|
||||
Value: 1e4,
|
||||
Labels: map[string]string{
|
||||
"job": "staging",
|
||||
"instance": "localhost",
|
||||
},
|
||||
},
|
||||
annotations: map[string]string{
|
||||
"url": "{{ $externalURL }}",
|
||||
"summary": "Issues with {{$labels.instance}} (dc-{{$externalLabels.dc}}) for job {{$labels.job}}",
|
||||
"description": "It is {{ $value }} connections for {{$labels.instance}} (cluster-{{$externalLabels.cluster}})",
|
||||
},
|
||||
expTpl: map[string]string{
|
||||
"url": extURL,
|
||||
"summary": fmt.Sprintf("Issues with localhost (dc-%s) for job staging", extDC),
|
||||
"description": fmt.Sprintf("It is 10000 connections for localhost (cluster-%s)", extCluster),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
qFn := func(q string) ([]datasource.Metric, error) {
|
||||
@@ -98,7 +130,7 @@ func TestAlert_ExecTemplate(t *testing.T) {
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
tpl, err := tc.alert.ExecTemplate(qFn, tc.annotations)
|
||||
tpl, err := tc.alert.ExecTemplate(qFn, tc.alert.Labels, tc.annotations)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -7,17 +7,41 @@ import (
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
)
|
||||
|
||||
// AlertManager represents integration provider with Prometheus alert manager
|
||||
// https://github.com/prometheus/alertmanager
|
||||
type AlertManager struct {
|
||||
addr string
|
||||
alertURL string
|
||||
basicAuthUser string
|
||||
basicAuthPass string
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
addr string
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
timeout time.Duration
|
||||
|
||||
authCfg *promauth.Config
|
||||
|
||||
metrics *metrics
|
||||
}
|
||||
|
||||
type metrics struct {
|
||||
alertsSent *utils.Counter
|
||||
alertsSendErrors *utils.Counter
|
||||
}
|
||||
|
||||
func newMetrics(addr string) *metrics {
|
||||
return &metrics{
|
||||
alertsSent: utils.GetOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", addr)),
|
||||
alertsSendErrors: utils.GetOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", addr)),
|
||||
}
|
||||
}
|
||||
|
||||
// Close is a destructor method for AlertManager
|
||||
func (am *AlertManager) Close() {
|
||||
am.metrics.alertsSent.Unregister()
|
||||
am.metrics.alertsSendErrors.Unregister()
|
||||
}
|
||||
|
||||
// Addr returns address where alerts are sent.
|
||||
@@ -25,17 +49,36 @@ func (am AlertManager) Addr() string { return am.addr }
|
||||
|
||||
// Send an alert or resolve message
|
||||
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
|
||||
am.metrics.alertsSent.Add(len(alerts))
|
||||
err := am.send(ctx, alerts)
|
||||
if err != nil {
|
||||
am.metrics.alertsSendErrors.Add(len(alerts))
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (am *AlertManager) send(ctx context.Context, alerts []Alert) error {
|
||||
b := &bytes.Buffer{}
|
||||
writeamRequest(b, alerts, am.argFunc)
|
||||
|
||||
req, err := http.NewRequest("POST", am.alertURL, b)
|
||||
req, err := http.NewRequest("POST", am.addr, b)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
if am.timeout > 0 {
|
||||
var cancel context.CancelFunc
|
||||
ctx, cancel = context.WithTimeout(ctx, am.timeout)
|
||||
defer cancel()
|
||||
}
|
||||
|
||||
req = req.WithContext(ctx)
|
||||
if am.basicAuthPass != "" {
|
||||
req.SetBasicAuth(am.basicAuthUser, am.basicAuthPass)
|
||||
|
||||
if am.authCfg != nil {
|
||||
if auth := am.authCfg.GetAuthHeader(); auth != "" {
|
||||
req.Header.Set("Authorization", auth)
|
||||
}
|
||||
}
|
||||
resp, err := am.client.Do(req)
|
||||
if err != nil {
|
||||
@@ -47,9 +90,9 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read response from %q: %w", am.alertURL, err)
|
||||
return fmt.Errorf("failed to read response from %q: %w", am.addr, err)
|
||||
}
|
||||
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.alertURL, string(body))
|
||||
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.addr, string(body))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -60,14 +103,39 @@ type AlertURLGenerator func(Alert) string
|
||||
const alertManagerPath = "/api/v2/alerts"
|
||||
|
||||
// NewAlertManager is a constructor for AlertManager
|
||||
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
|
||||
url := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
|
||||
return &AlertManager{
|
||||
addr: alertManagerURL,
|
||||
alertURL: url,
|
||||
argFunc: fn,
|
||||
client: c,
|
||||
basicAuthUser: user,
|
||||
basicAuthPass: pass,
|
||||
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, authCfg promauth.HTTPClientConfig, timeout time.Duration) (*AlertManager, error) {
|
||||
tls := &promauth.TLSConfig{}
|
||||
if authCfg.TLSConfig != nil {
|
||||
tls = authCfg.TLSConfig
|
||||
}
|
||||
tr, err := utils.Transport(alertManagerURL, tls.CertFile, tls.KeyFile, tls.CAFile, tls.ServerName, tls.InsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
|
||||
ba := new(promauth.BasicAuthConfig)
|
||||
oauth := new(promauth.OAuth2Config)
|
||||
if authCfg.BasicAuth != nil {
|
||||
ba = authCfg.BasicAuth
|
||||
}
|
||||
if authCfg.OAuth2 != nil {
|
||||
oauth = authCfg.OAuth2
|
||||
}
|
||||
|
||||
aCfg, err := utils.AuthConfig(
|
||||
utils.WithBasicAuth(ba.Username, ba.Password.String(), ba.PasswordFile),
|
||||
utils.WithBearer(authCfg.BearerToken.String(), authCfg.BearerTokenFile),
|
||||
utils.WithOAuth(oauth.ClientID, oauth.ClientSecretFile, oauth.ClientSecretFile, oauth.TokenURL, strings.Join(oauth.Scopes, ";")))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to configure auth: %w", err)
|
||||
}
|
||||
|
||||
return &AlertManager{
|
||||
addr: alertManagerURL,
|
||||
argFunc: fn,
|
||||
authCfg: aCfg,
|
||||
client: &http.Client{Transport: tr},
|
||||
timeout: timeout,
|
||||
metrics: newMetrics(alertManagerURL),
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -8,11 +8,16 @@ import (
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
)
|
||||
|
||||
func TestAlertManager_Addr(t *testing.T) {
|
||||
const addr = "http://localhost"
|
||||
am := NewAlertManager(addr, "", "", nil, nil)
|
||||
am, err := NewAlertManager(addr, nil, promauth.HTTPClientConfig{}, 0)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %s", err)
|
||||
}
|
||||
if am.Addr() != addr {
|
||||
t.Errorf("expected to have %q; got %q", addr, am.Addr())
|
||||
}
|
||||
@@ -75,9 +80,19 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
})
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
am := NewAlertManager(srv.URL, baUser, baPass, func(alert Alert) string {
|
||||
|
||||
aCfg := promauth.HTTPClientConfig{
|
||||
BasicAuth: &promauth.BasicAuthConfig{
|
||||
Username: baUser,
|
||||
Password: promauth.NewSecret(baPass),
|
||||
},
|
||||
}
|
||||
am, err := NewAlertManager(srv.URL+alertManagerPath, func(alert Alert) string {
|
||||
return strconv.FormatUint(alert.GroupID, 10) + "/" + strconv.FormatUint(alert.ID, 10)
|
||||
}, srv.Client())
|
||||
}, aCfg, 0)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %s", err)
|
||||
}
|
||||
if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {
|
||||
t.Error("expected connection error got nil")
|
||||
}
|
||||
|
||||
182
app/vmalert/notifier/config.go
Normal file
182
app/vmalert/notifier/config.go
Normal file
@@ -0,0 +1,182 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"crypto/md5"
|
||||
"fmt"
|
||||
"gopkg.in/yaml.v2"
|
||||
"io/ioutil"
|
||||
"net/url"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/consul"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
// Config contains list of supported configuration settings
|
||||
// for Notifier
|
||||
type Config struct {
|
||||
// Scheme defines the HTTP scheme for Notifier address
|
||||
Scheme string `yaml:"scheme,omitempty"`
|
||||
// PathPrefix is added to URL path before adding alertManagerPath value
|
||||
PathPrefix string `yaml:"path_prefix,omitempty"`
|
||||
|
||||
// ConsulSDConfigs contains list of settings for service discovery via Consul
|
||||
// see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config
|
||||
ConsulSDConfigs []consul.SDConfig `yaml:"consul_sd_configs,omitempty"`
|
||||
// StaticConfigs contains list of static targets
|
||||
StaticConfigs []StaticConfig `yaml:"static_configs,omitempty"`
|
||||
|
||||
// HTTPClientConfig contains HTTP configuration for Notifier clients
|
||||
HTTPClientConfig promauth.HTTPClientConfig `yaml:",inline"`
|
||||
// RelabelConfigs contains list of relabeling rules
|
||||
RelabelConfigs []promrelabel.RelabelConfig `yaml:"relabel_configs,omitempty"`
|
||||
|
||||
// The timeout used when sending alerts.
|
||||
Timeout promutils.Duration `yaml:"timeout,omitempty"`
|
||||
|
||||
// Checksum stores the hash of yaml definition for the config.
|
||||
// May be used to detect any changes to the config file.
|
||||
Checksum string
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
|
||||
// This is set to the directory from where the config has been loaded.
|
||||
baseDir string
|
||||
|
||||
// stores already parsed RelabelConfigs object
|
||||
parsedRelabelConfigs *promrelabel.ParsedConfigs
|
||||
}
|
||||
|
||||
// StaticConfig contains list of static targets in the following form:
|
||||
// targets:
|
||||
// [ - '<host>' ]
|
||||
type StaticConfig struct {
|
||||
Targets []string `yaml:"targets"`
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
func (cfg *Config) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
type config Config
|
||||
if err := unmarshal((*config)(cfg)); err != nil {
|
||||
return err
|
||||
}
|
||||
if cfg.Scheme == "" {
|
||||
cfg.Scheme = "http"
|
||||
}
|
||||
if cfg.Timeout.Duration() == 0 {
|
||||
cfg.Timeout = promutils.NewDuration(time.Second * 10)
|
||||
}
|
||||
rCfg, err := promrelabel.ParseRelabelConfigs(cfg.RelabelConfigs, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse relabeling config: %w", err)
|
||||
}
|
||||
cfg.parsedRelabelConfigs = rCfg
|
||||
|
||||
b, err := yaml.Marshal(cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal configuration for checksum: %w", err)
|
||||
}
|
||||
h := md5.New()
|
||||
h.Write(b)
|
||||
cfg.Checksum = fmt.Sprintf("%x", h.Sum(nil))
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseConfig(path string) (*Config, error) {
|
||||
data, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading config file: %w", err)
|
||||
}
|
||||
var cfg *Config
|
||||
err = yaml.Unmarshal(data, &cfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(cfg.XXX) > 0 {
|
||||
var keys []string
|
||||
for k := range cfg.XXX {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
return nil, fmt.Errorf("unknown fields in %s", strings.Join(keys, ", "))
|
||||
}
|
||||
absPath, err := filepath.Abs(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot obtain abs path for %q: %w", path, err)
|
||||
}
|
||||
cfg.baseDir = filepath.Dir(absPath)
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func parseLabels(target string, metaLabels map[string]string, cfg *Config) (string, []prompbmarshal.Label, error) {
|
||||
labels := mergeLabels(target, metaLabels, cfg)
|
||||
labels = cfg.parsedRelabelConfigs.Apply(labels, 0, false)
|
||||
labels = promrelabel.RemoveMetaLabels(labels[:0], labels)
|
||||
// Remove references to already deleted labels, so GC could clean strings for label name and label value past len(labels).
|
||||
// This should reduce memory usage when relabeling creates big number of temporary labels with long names and/or values.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825 for details.
|
||||
labels = append([]prompbmarshal.Label{}, labels...)
|
||||
|
||||
if len(labels) == 0 {
|
||||
return "", nil, nil
|
||||
}
|
||||
schemeRelabeled := promrelabel.GetLabelValueByName(labels, "__scheme__")
|
||||
if len(schemeRelabeled) == 0 {
|
||||
schemeRelabeled = "http"
|
||||
}
|
||||
addressRelabeled := promrelabel.GetLabelValueByName(labels, "__address__")
|
||||
if len(addressRelabeled) == 0 {
|
||||
return "", nil, nil
|
||||
}
|
||||
if strings.Contains(addressRelabeled, "/") {
|
||||
return "", nil, nil
|
||||
}
|
||||
addressRelabeled = addMissingPort(schemeRelabeled, addressRelabeled)
|
||||
alertsPathRelabeled := promrelabel.GetLabelValueByName(labels, "__alerts_path__")
|
||||
if !strings.HasPrefix(alertsPathRelabeled, "/") {
|
||||
alertsPathRelabeled = "/" + alertsPathRelabeled
|
||||
}
|
||||
u := fmt.Sprintf("%s://%s%s", schemeRelabeled, addressRelabeled, alertsPathRelabeled)
|
||||
if _, err := url.Parse(u); err != nil {
|
||||
return "", nil, fmt.Errorf("invalid url %q for scheme=%q (%q), target=%q, metrics_path=%q (%q): %w",
|
||||
u, cfg.Scheme, schemeRelabeled, target, addressRelabeled, alertsPathRelabeled, err)
|
||||
}
|
||||
return u, labels, nil
|
||||
}
|
||||
|
||||
func addMissingPort(scheme, target string) string {
|
||||
if strings.Contains(target, ":") {
|
||||
return target
|
||||
}
|
||||
if scheme == "https" {
|
||||
target += ":443"
|
||||
} else {
|
||||
target += ":80"
|
||||
}
|
||||
return target
|
||||
}
|
||||
|
||||
func mergeLabels(target string, metaLabels map[string]string, cfg *Config) []prompbmarshal.Label {
|
||||
// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
|
||||
m := make(map[string]string)
|
||||
m["__address__"] = target
|
||||
m["__scheme__"] = cfg.Scheme
|
||||
m["__alerts_path__"] = path.Join("/", cfg.PathPrefix, alertManagerPath)
|
||||
for k, v := range metaLabels {
|
||||
m[k] = v
|
||||
}
|
||||
result := make([]prompbmarshal.Label, 0, len(m))
|
||||
for k, v := range m {
|
||||
result = append(result, prompbmarshal.Label{
|
||||
Name: k,
|
||||
Value: v,
|
||||
})
|
||||
}
|
||||
return result
|
||||
}
|
||||
31
app/vmalert/notifier/config_test.go
Normal file
31
app/vmalert/notifier/config_test.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestConfigParseGood(t *testing.T) {
|
||||
f := func(path string) {
|
||||
_, err := parseConfig(path)
|
||||
checkErr(t, err)
|
||||
}
|
||||
f("testdata/mixed.good.yaml")
|
||||
f("testdata/consul.good.yaml")
|
||||
f("testdata/static.good.yaml")
|
||||
}
|
||||
|
||||
func TestConfigParseBad(t *testing.T) {
|
||||
f := func(path, expErr string) {
|
||||
_, err := parseConfig(path)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get non-nil err for config %q", path)
|
||||
}
|
||||
if !strings.Contains(err.Error(), expErr) {
|
||||
t.Errorf("expected err to contain %q; got %q instead", expErr, err)
|
||||
}
|
||||
}
|
||||
|
||||
f("testdata/unknownFields.bad.yaml", "unknown field")
|
||||
f("non-existing-file", "error reading")
|
||||
}
|
||||
235
app/vmalert/notifier/config_watcher.go
Normal file
235
app/vmalert/notifier/config_watcher.go
Normal file
@@ -0,0 +1,235 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/consul"
|
||||
)
|
||||
|
||||
// configWatcher supports dynamic reload of Notifier objects
|
||||
// from static configuration and service discovery.
|
||||
// Use newWatcher to create a new object.
|
||||
type configWatcher struct {
|
||||
cfg *Config
|
||||
genFn AlertURLGenerator
|
||||
wg sync.WaitGroup
|
||||
|
||||
reloadCh chan struct{}
|
||||
syncCh chan struct{}
|
||||
|
||||
targetsMu sync.RWMutex
|
||||
targets map[TargetType][]Target
|
||||
}
|
||||
|
||||
func newWatcher(path string, gen AlertURLGenerator) (*configWatcher, error) {
|
||||
cfg, err := parseConfig(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cw := &configWatcher{
|
||||
cfg: cfg,
|
||||
wg: sync.WaitGroup{},
|
||||
reloadCh: make(chan struct{}, 1),
|
||||
syncCh: make(chan struct{}),
|
||||
genFn: gen,
|
||||
targetsMu: sync.RWMutex{},
|
||||
targets: make(map[TargetType][]Target),
|
||||
}
|
||||
return cw, cw.start()
|
||||
}
|
||||
|
||||
func (cw *configWatcher) notifiers() []Notifier {
|
||||
cw.targetsMu.RLock()
|
||||
defer cw.targetsMu.RUnlock()
|
||||
|
||||
var notifiers []Notifier
|
||||
for _, ns := range cw.targets {
|
||||
for _, n := range ns {
|
||||
notifiers = append(notifiers, n.Notifier)
|
||||
}
|
||||
|
||||
}
|
||||
return notifiers
|
||||
}
|
||||
|
||||
func (cw *configWatcher) reload(path string) error {
|
||||
select {
|
||||
case cw.reloadCh <- struct{}{}:
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
|
||||
defer func() { <-cw.reloadCh }()
|
||||
|
||||
cfg, err := parseConfig(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if cfg.Checksum == cw.cfg.Checksum {
|
||||
return nil
|
||||
}
|
||||
|
||||
// stop existing discovery
|
||||
cw.mustStop()
|
||||
|
||||
// re-start cw with new config
|
||||
cw.syncCh = make(chan struct{})
|
||||
cw.cfg = cfg
|
||||
return cw.start()
|
||||
}
|
||||
|
||||
func (cw *configWatcher) add(typeK TargetType, interval time.Duration, labelsFn getLabels) error {
|
||||
targets, errors := targetsFromLabels(labelsFn, cw.cfg, cw.genFn)
|
||||
for _, err := range errors {
|
||||
return fmt.Errorf("failed to init notifier for %q: %s", typeK, err)
|
||||
}
|
||||
|
||||
cw.setTargets(typeK, targets)
|
||||
|
||||
cw.wg.Add(1)
|
||||
go func() {
|
||||
defer cw.wg.Done()
|
||||
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-cw.syncCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
updateTargets, errors := targetsFromLabels(labelsFn, cw.cfg, cw.genFn)
|
||||
for _, err := range errors {
|
||||
logger.Errorf("failed to init notifier for %q: %s", typeK, err)
|
||||
}
|
||||
cw.setTargets(typeK, updateTargets)
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
func targetsFromLabels(labelsFn getLabels, cfg *Config, genFn AlertURLGenerator) ([]Target, []error) {
|
||||
metaLabels, err := labelsFn()
|
||||
if err != nil {
|
||||
return nil, []error{fmt.Errorf("failed to get labels: %s", err)}
|
||||
}
|
||||
var targets []Target
|
||||
var errors []error
|
||||
duplicates := make(map[string]struct{})
|
||||
for _, labels := range metaLabels {
|
||||
target := labels["__address__"]
|
||||
u, processedLabels, err := parseLabels(target, labels, cfg)
|
||||
if err != nil {
|
||||
errors = append(errors, err)
|
||||
continue
|
||||
}
|
||||
if len(u) == 0 {
|
||||
continue
|
||||
}
|
||||
if _, ok := duplicates[u]; ok { // check for duplicates
|
||||
if !*suppressDuplicateTargetErrors {
|
||||
logger.Errorf("skipping duplicate target with identical address %q; "+
|
||||
"make sure service discovery and relabeling is set up properly; "+
|
||||
"original labels: %s; resulting labels: %s",
|
||||
u, labels, processedLabels)
|
||||
}
|
||||
continue
|
||||
}
|
||||
duplicates[u] = struct{}{}
|
||||
|
||||
am, err := NewAlertManager(u, genFn, cfg.HTTPClientConfig, cfg.Timeout.Duration())
|
||||
if err != nil {
|
||||
errors = append(errors, err)
|
||||
continue
|
||||
}
|
||||
targets = append(targets, Target{
|
||||
Notifier: am,
|
||||
Labels: processedLabels,
|
||||
})
|
||||
}
|
||||
return targets, errors
|
||||
}
|
||||
|
||||
type getLabels func() ([]map[string]string, error)
|
||||
|
||||
func (cw *configWatcher) start() error {
|
||||
if len(cw.cfg.StaticConfigs) > 0 {
|
||||
var targets []Target
|
||||
for _, cfg := range cw.cfg.StaticConfigs {
|
||||
for _, target := range cfg.Targets {
|
||||
address, labels, err := parseLabels(target, nil, cw.cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse labels for target %q: %s", target, err)
|
||||
}
|
||||
notifier, err := NewAlertManager(address, cw.genFn, cw.cfg.HTTPClientConfig, cw.cfg.Timeout.Duration())
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to init alertmanager for addr %q: %s", address, err)
|
||||
}
|
||||
targets = append(targets, Target{
|
||||
Notifier: notifier,
|
||||
Labels: labels,
|
||||
})
|
||||
}
|
||||
}
|
||||
cw.setTargets(TargetStatic, targets)
|
||||
}
|
||||
|
||||
if len(cw.cfg.ConsulSDConfigs) > 0 {
|
||||
err := cw.add(TargetConsul, *consul.SDCheckInterval, func() ([]map[string]string, error) {
|
||||
var labels []map[string]string
|
||||
for i := range cw.cfg.ConsulSDConfigs {
|
||||
sdc := &cw.cfg.ConsulSDConfigs[i]
|
||||
targetLabels, err := sdc.GetLabels(cw.cfg.baseDir)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("got labels err: %s", err)
|
||||
}
|
||||
labels = append(labels, targetLabels...)
|
||||
}
|
||||
return labels, nil
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start consulSD discovery: %s", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cw *configWatcher) mustStop() {
|
||||
close(cw.syncCh)
|
||||
cw.wg.Wait()
|
||||
|
||||
cw.targetsMu.Lock()
|
||||
for _, targets := range cw.targets {
|
||||
for _, t := range targets {
|
||||
t.Close()
|
||||
}
|
||||
}
|
||||
cw.targets = make(map[TargetType][]Target)
|
||||
cw.targetsMu.Unlock()
|
||||
|
||||
for i := range cw.cfg.ConsulSDConfigs {
|
||||
cw.cfg.ConsulSDConfigs[i].MustStop()
|
||||
}
|
||||
cw.cfg = nil
|
||||
}
|
||||
|
||||
func (cw *configWatcher) setTargets(key TargetType, targets []Target) {
|
||||
cw.targetsMu.Lock()
|
||||
newT := make(map[string]Target)
|
||||
for _, t := range targets {
|
||||
newT[t.Addr()] = t
|
||||
}
|
||||
oldT := cw.targets[key]
|
||||
|
||||
for _, ot := range oldT {
|
||||
if _, ok := newT[ot.Addr()]; !ok {
|
||||
ot.Notifier.Close()
|
||||
}
|
||||
}
|
||||
cw.targets[key] = targets
|
||||
cw.targetsMu.Unlock()
|
||||
}
|
||||
301
app/vmalert/notifier/config_watcher_test.go
Normal file
301
app/vmalert/notifier/config_watcher_test.go
Normal file
@@ -0,0 +1,301 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"sync"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestConfigWatcherReload(t *testing.T) {
|
||||
f, err := ioutil.TempFile("", "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer func() { _ = os.Remove(f.Name()) }()
|
||||
|
||||
writeToFile(t, f.Name(), `
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:9093
|
||||
- localhost:9094
|
||||
`)
|
||||
cw, err := newWatcher(f.Name(), nil)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to start config watcher: %s", err)
|
||||
}
|
||||
defer cw.mustStop()
|
||||
ns := cw.notifiers()
|
||||
if len(ns) != 2 {
|
||||
t.Fatalf("expected to have 2 notifiers; got %d %#v", len(ns), ns)
|
||||
}
|
||||
|
||||
f2, err := ioutil.TempFile("", "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer func() { _ = os.Remove(f2.Name()) }()
|
||||
|
||||
writeToFile(t, f2.Name(), `
|
||||
static_configs:
|
||||
- targets:
|
||||
- 127.0.0.1:9093
|
||||
`)
|
||||
checkErr(t, cw.reload(f2.Name()))
|
||||
|
||||
ns = cw.notifiers()
|
||||
if len(ns) != 1 {
|
||||
t.Fatalf("expected to have 1 notifier; got %d", len(ns))
|
||||
}
|
||||
expAddr := "http://127.0.0.1:9093/api/v2/alerts"
|
||||
if ns[0].Addr() != expAddr {
|
||||
t.Fatalf("expected to get %q; got %q instead", expAddr, ns[0].Addr())
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfigWatcherStart(t *testing.T) {
|
||||
consulSDServer := newFakeConsulServer()
|
||||
defer consulSDServer.Close()
|
||||
|
||||
consulSDFile, err := ioutil.TempFile("", "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer func() { _ = os.Remove(consulSDFile.Name()) }()
|
||||
|
||||
writeToFile(t, consulSDFile.Name(), fmt.Sprintf(`
|
||||
scheme: https
|
||||
path_prefix: proxy
|
||||
consul_sd_configs:
|
||||
- server: %s
|
||||
services:
|
||||
- alertmanager
|
||||
`, consulSDServer.URL))
|
||||
|
||||
cw, err := newWatcher(consulSDFile.Name(), nil)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to start config watcher: %s", err)
|
||||
}
|
||||
defer cw.mustStop()
|
||||
|
||||
if len(cw.notifiers()) != 2 {
|
||||
t.Fatalf("expected to get 2 notifiers; got %d", len(cw.notifiers()))
|
||||
}
|
||||
|
||||
expAddr1 := fmt.Sprintf("https://%s/proxy/api/v2/alerts", fakeConsulService1)
|
||||
expAddr2 := fmt.Sprintf("https://%s/proxy/api/v2/alerts", fakeConsulService2)
|
||||
|
||||
n1, n2 := cw.notifiers()[0], cw.notifiers()[1]
|
||||
if n1.Addr() != expAddr1 {
|
||||
t.Fatalf("exp address %q; got %q", expAddr1, n1.Addr())
|
||||
}
|
||||
if n2.Addr() != expAddr2 {
|
||||
t.Fatalf("exp address %q; got %q", expAddr2, n2.Addr())
|
||||
}
|
||||
}
|
||||
|
||||
// TestConfigWatcherReloadConcurrent supposed to test concurrent
|
||||
// execution of configuration update.
|
||||
// Should be executed with -race flag
|
||||
func TestConfigWatcherReloadConcurrent(t *testing.T) {
|
||||
consulSDServer1 := newFakeConsulServer()
|
||||
defer consulSDServer1.Close()
|
||||
consulSDServer2 := newFakeConsulServer()
|
||||
defer consulSDServer2.Close()
|
||||
|
||||
consulSDFile, err := ioutil.TempFile("", "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer func() { _ = os.Remove(consulSDFile.Name()) }()
|
||||
|
||||
writeToFile(t, consulSDFile.Name(), fmt.Sprintf(`
|
||||
consul_sd_configs:
|
||||
- server: %s
|
||||
services:
|
||||
- alertmanager
|
||||
- server: %s
|
||||
services:
|
||||
- consul
|
||||
`, consulSDServer1.URL, consulSDServer2.URL))
|
||||
|
||||
staticAndConsulSDFile, err := ioutil.TempFile("", "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer func() { _ = os.Remove(staticAndConsulSDFile.Name()) }()
|
||||
|
||||
writeToFile(t, staticAndConsulSDFile.Name(), fmt.Sprintf(`
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:9093
|
||||
- localhost:9095
|
||||
consul_sd_configs:
|
||||
- server: %s
|
||||
services:
|
||||
- alertmanager
|
||||
- server: %s
|
||||
services:
|
||||
- consul
|
||||
`, consulSDServer1.URL, consulSDServer2.URL))
|
||||
|
||||
paths := []string{
|
||||
staticAndConsulSDFile.Name(),
|
||||
consulSDFile.Name(),
|
||||
"testdata/static.good.yaml",
|
||||
"unknownFields.bad.yaml",
|
||||
}
|
||||
|
||||
cw, err := newWatcher(paths[0], nil)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to start config watcher: %s", err)
|
||||
}
|
||||
defer cw.mustStop()
|
||||
|
||||
const workers = 500
|
||||
const iterations = 10
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < iterations; i++ {
|
||||
rnd := rand.Intn(len(paths))
|
||||
_ = cw.reload(paths[rnd]) // update can fail and this is expected
|
||||
_ = cw.notifiers()
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func writeToFile(t *testing.T, file, b string) {
|
||||
t.Helper()
|
||||
checkErr(t, ioutil.WriteFile(file, []byte(b), 0644))
|
||||
}
|
||||
|
||||
func checkErr(t *testing.T, err error) {
|
||||
t.Helper()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
fakeConsulService1 = "127.0.0.1:9093"
|
||||
fakeConsulService2 = "127.0.0.1:9095"
|
||||
)
|
||||
|
||||
func newFakeConsulServer() *httptest.Server {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/v1/agent/self", func(rw http.ResponseWriter, _ *http.Request) {
|
||||
rw.Write([]byte(`{"Config": {"Datacenter": "dc1"}}`))
|
||||
})
|
||||
mux.HandleFunc("/v1/catalog/services", func(rw http.ResponseWriter, _ *http.Request) {
|
||||
rw.Header().Set("X-Consul-Index", "1")
|
||||
rw.Write([]byte(`{
|
||||
"alertmanager": [
|
||||
"alertmanager",
|
||||
"__scheme__=http"
|
||||
]
|
||||
}`))
|
||||
})
|
||||
mux.HandleFunc("/v1/health/service/alertmanager", func(rw http.ResponseWriter, _ *http.Request) {
|
||||
rw.Header().Set("X-Consul-Index", "1")
|
||||
rw.Write([]byte(`
|
||||
[
|
||||
{
|
||||
"Node": {
|
||||
"ID": "e8e3629a-3f50-9d6e-aaf8-f173b5b05c72",
|
||||
"Node": "machine",
|
||||
"Address": "127.0.0.1",
|
||||
"Datacenter": "dc1",
|
||||
"TaggedAddresses": {
|
||||
"lan": "127.0.0.1",
|
||||
"lan_ipv4": "127.0.0.1",
|
||||
"wan": "127.0.0.1",
|
||||
"wan_ipv4": "127.0.0.1"
|
||||
},
|
||||
"Meta": {
|
||||
"consul-network-segment": ""
|
||||
},
|
||||
"CreateIndex": 13,
|
||||
"ModifyIndex": 14
|
||||
},
|
||||
"Service": {
|
||||
"ID": "am1",
|
||||
"Service": "alertmanager",
|
||||
"Tags": [
|
||||
"alertmanager",
|
||||
"__scheme__=http"
|
||||
],
|
||||
"Address": "",
|
||||
"Meta": null,
|
||||
"Port": 9093,
|
||||
"Weights": {
|
||||
"Passing": 1,
|
||||
"Warning": 1
|
||||
},
|
||||
"EnableTagOverride": false,
|
||||
"Proxy": {
|
||||
"Mode": "",
|
||||
"MeshGateway": {},
|
||||
"Expose": {}
|
||||
},
|
||||
"Connect": {},
|
||||
"CreateIndex": 16,
|
||||
"ModifyIndex": 16
|
||||
}
|
||||
},
|
||||
{
|
||||
"Node": {
|
||||
"ID": "e8e3629a-3f50-9d6e-aaf8-f173b5b05c72",
|
||||
"Node": "machine",
|
||||
"Address": "127.0.0.1",
|
||||
"Datacenter": "dc1",
|
||||
"TaggedAddresses": {
|
||||
"lan": "127.0.0.1",
|
||||
"lan_ipv4": "127.0.0.1",
|
||||
"wan": "127.0.0.1",
|
||||
"wan_ipv4": "127.0.0.1"
|
||||
},
|
||||
"Meta": {
|
||||
"consul-network-segment": ""
|
||||
},
|
||||
"CreateIndex": 13,
|
||||
"ModifyIndex": 14
|
||||
},
|
||||
"Service": {
|
||||
"ID": "am2",
|
||||
"Service": "alertmanager",
|
||||
"Tags": [
|
||||
"alertmanager",
|
||||
"bad-node"
|
||||
],
|
||||
"Address": "",
|
||||
"Meta": null,
|
||||
"Port": 9095,
|
||||
"Weights": {
|
||||
"Passing": 1,
|
||||
"Warning": 1
|
||||
},
|
||||
"EnableTagOverride": false,
|
||||
"Proxy": {
|
||||
"Mode": "",
|
||||
"MeshGateway": {},
|
||||
"Expose": {}
|
||||
},
|
||||
"Connect": {},
|
||||
"CreateIndex": 15,
|
||||
"ModifyIndex": 15
|
||||
}
|
||||
}
|
||||
]`))
|
||||
})
|
||||
|
||||
return httptest.NewServer(mux)
|
||||
}
|
||||
@@ -1,17 +1,28 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
var (
|
||||
addrs = flagutil.NewArray("notifier.url", "Prometheus alertmanager URL, e.g. http://127.0.0.1:9093")
|
||||
basicAuthUsername = flagutil.NewArray("notifier.basicAuth.username", "Optional basic auth username for -notifier.url")
|
||||
basicAuthPassword = flagutil.NewArray("notifier.basicAuth.password", "Optional basic auth password for -notifier.url")
|
||||
configPath = flag.String("notifier.config", "", "Path to configuration file for notifiers")
|
||||
suppressDuplicateTargetErrors = flag.Bool("notifier.suppressDuplicateTargetErrors", false, "Whether to suppress 'duplicate target' errors during discovery")
|
||||
|
||||
addrs = flagutil.NewArray("notifier.url", "Prometheus alertmanager URL, e.g. http://127.0.0.1:9093")
|
||||
|
||||
basicAuthUsername = flagutil.NewArray("notifier.basicAuth.username", "Optional basic auth username for -notifier.url")
|
||||
basicAuthPassword = flagutil.NewArray("notifier.basicAuth.password", "Optional basic auth password for -notifier.url")
|
||||
basicAuthPasswordFile = flagutil.NewArray("notifier.basicAuth.passwordFile", "Optional path to basic auth password file for -notifier.url")
|
||||
|
||||
bearerToken = flagutil.NewArray("notifier.bearerToken", "Optional bearer token for -notifier.url")
|
||||
bearerTokenFile = flagutil.NewArray("notifier.bearerTokenFile", "Optional path to bearer token file for -notifier.url")
|
||||
|
||||
tlsInsecureSkipVerify = flagutil.NewArrayBool("notifier.tlsInsecureSkipVerify", "Whether to skip tls verification when connecting to -notifier.url")
|
||||
tlsCertFile = flagutil.NewArray("notifier.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
|
||||
@@ -20,22 +31,158 @@ var (
|
||||
"By default system CA is used")
|
||||
tlsServerName = flagutil.NewArray("notifier.tlsServerName", "Optional TLS server name to use for connections to -notifier.url. "+
|
||||
"By default the server name from -notifier.url is used")
|
||||
|
||||
oauth2ClientID = flagutil.NewArray("notifier.oauth2.clientID", "Optional OAuth2 clientID to use for -notifier.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -notifier.url")
|
||||
oauth2ClientSecret = flagutil.NewArray("notifier.oauth2.clientSecret", "Optional OAuth2 clientSecret to use for -notifier.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -notifier.url")
|
||||
oauth2ClientSecretFile = flagutil.NewArray("notifier.oauth2.clientSecretFile", "Optional OAuth2 clientSecretFile to use for -notifier.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -notifier.url")
|
||||
oauth2TokenURL = flagutil.NewArray("notifier.oauth2.tokenUrl", "Optional OAuth2 tokenURL to use for -notifier.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -notifier.url")
|
||||
oauth2Scopes = flagutil.NewArray("notifier.oauth2.scopes", "Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -notifier.url")
|
||||
)
|
||||
|
||||
// Init creates a Notifier object based on provided flags.
|
||||
func Init(gen AlertURLGenerator) ([]Notifier, error) {
|
||||
var notifiers []Notifier
|
||||
for i, addr := range *addrs {
|
||||
cert, key := tlsCertFile.GetOptionalArg(i), tlsKeyFile.GetOptionalArg(i)
|
||||
ca, serverName := tlsCAFile.GetOptionalArg(i), tlsServerName.GetOptionalArg(i)
|
||||
tr, err := utils.Transport(addr, cert, key, ca, serverName, tlsInsecureSkipVerify.GetOptionalArg(i))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
user, pass := basicAuthUsername.GetOptionalArg(i), basicAuthPassword.GetOptionalArg(i)
|
||||
am := NewAlertManager(addr, user, pass, gen, &http.Client{Transport: tr})
|
||||
notifiers = append(notifiers, am)
|
||||
// cw holds a configWatcher for configPath configuration file
|
||||
// configWatcher provides a list of Notifier objects discovered
|
||||
// from static config or via service discovery.
|
||||
// cw is not nil only if configPath is provided.
|
||||
var cw *configWatcher
|
||||
|
||||
// Reload checks the changes in configPath configuration file
|
||||
// and applies changes if any.
|
||||
func Reload() error {
|
||||
if cw == nil {
|
||||
return nil
|
||||
}
|
||||
return cw.reload(*configPath)
|
||||
}
|
||||
|
||||
var staticNotifiersFn func() []Notifier
|
||||
|
||||
var (
|
||||
// externalLabels is a global variable for holding external labels configured via flags
|
||||
// It is supposed to be inited via Init function only.
|
||||
externalLabels map[string]string
|
||||
// externalURL is a global variable for holding external URL value configured via flag
|
||||
// It is supposed to be inited via Init function only.
|
||||
externalURL string
|
||||
)
|
||||
|
||||
// Init returns a function for retrieving actual list of Notifier objects.
|
||||
// Init works in two mods:
|
||||
// * configuration via flags (for backward compatibility). Is always static
|
||||
// and don't support live reloads.
|
||||
// * configuration via file. Supports live reloads and service discovery.
|
||||
// Init returns an error if both mods are used.
|
||||
func Init(gen AlertURLGenerator, extLabels map[string]string, extURL string) (func() []Notifier, error) {
|
||||
if externalLabels != nil || externalURL != "" {
|
||||
return nil, fmt.Errorf("BUG: notifier.Init was called multiple times")
|
||||
}
|
||||
|
||||
externalURL = extURL
|
||||
externalLabels = extLabels
|
||||
|
||||
if *configPath == "" && len(*addrs) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
if *configPath != "" && len(*addrs) > 0 {
|
||||
return nil, fmt.Errorf("only one of -notifier.config or -notifier.url flags must be specified")
|
||||
}
|
||||
|
||||
if len(*addrs) > 0 {
|
||||
notifiers, err := notifiersFromFlags(gen)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create notifier from flag values: %s", err)
|
||||
}
|
||||
staticNotifiersFn = func() []Notifier {
|
||||
return notifiers
|
||||
}
|
||||
return staticNotifiersFn, nil
|
||||
}
|
||||
|
||||
var err error
|
||||
cw, err = newWatcher(*configPath, gen)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init config watcher: %s", err)
|
||||
}
|
||||
return cw.notifiers, nil
|
||||
}
|
||||
|
||||
func notifiersFromFlags(gen AlertURLGenerator) ([]Notifier, error) {
|
||||
var notifiers []Notifier
|
||||
for i, addr := range *addrs {
|
||||
authCfg := promauth.HTTPClientConfig{
|
||||
TLSConfig: &promauth.TLSConfig{
|
||||
CAFile: tlsCAFile.GetOptionalArg(i),
|
||||
CertFile: tlsCertFile.GetOptionalArg(i),
|
||||
KeyFile: tlsKeyFile.GetOptionalArg(i),
|
||||
ServerName: tlsServerName.GetOptionalArg(i),
|
||||
InsecureSkipVerify: tlsInsecureSkipVerify.GetOptionalArg(i),
|
||||
},
|
||||
BasicAuth: &promauth.BasicAuthConfig{
|
||||
Username: basicAuthUsername.GetOptionalArg(i),
|
||||
Password: promauth.NewSecret(basicAuthPassword.GetOptionalArg(i)),
|
||||
PasswordFile: basicAuthPasswordFile.GetOptionalArg(i),
|
||||
},
|
||||
BearerToken: promauth.NewSecret(bearerToken.GetOptionalArg(i)),
|
||||
BearerTokenFile: bearerTokenFile.GetOptionalArg(i),
|
||||
OAuth2: &promauth.OAuth2Config{
|
||||
ClientID: oauth2ClientID.GetOptionalArg(i),
|
||||
ClientSecret: promauth.NewSecret(oauth2ClientSecret.GetOptionalArg(i)),
|
||||
ClientSecretFile: oauth2ClientSecretFile.GetOptionalArg(i),
|
||||
Scopes: strings.Split(oauth2Scopes.GetOptionalArg(i), ";"),
|
||||
TokenURL: oauth2TokenURL.GetOptionalArg(i),
|
||||
},
|
||||
}
|
||||
|
||||
addr = strings.TrimSuffix(addr, "/")
|
||||
am, err := NewAlertManager(addr+alertManagerPath, gen, authCfg, time.Minute)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
notifiers = append(notifiers, am)
|
||||
}
|
||||
return notifiers, nil
|
||||
}
|
||||
|
||||
// Target represents a Notifier and optional
|
||||
// list of labels added during discovery.
|
||||
type Target struct {
|
||||
Notifier
|
||||
Labels []prompbmarshal.Label
|
||||
}
|
||||
|
||||
// TargetType defines how the Target was discovered
|
||||
type TargetType string
|
||||
|
||||
const (
|
||||
// TargetStatic is for targets configured statically
|
||||
TargetStatic TargetType = "static"
|
||||
// TargetConsul is for targets discovered via Consul
|
||||
TargetConsul TargetType = "consulSD"
|
||||
)
|
||||
|
||||
// GetTargets returns list of static or discovered targets
|
||||
// via notifier configuration.
|
||||
func GetTargets() map[TargetType][]Target {
|
||||
var targets = make(map[TargetType][]Target)
|
||||
|
||||
if staticNotifiersFn != nil {
|
||||
for _, ns := range staticNotifiersFn() {
|
||||
targets[TargetStatic] = append(targets[TargetStatic], Target{
|
||||
Notifier: ns,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if cw != nil {
|
||||
cw.targetsMu.RLock()
|
||||
for key, ns := range cw.targets {
|
||||
targets[key] = append(targets[key], ns...)
|
||||
}
|
||||
cw.targetsMu.RUnlock()
|
||||
}
|
||||
return targets
|
||||
}
|
||||
|
||||
@@ -10,4 +10,6 @@ type Notifier interface {
|
||||
Send(ctx context.Context, alerts []Alert) error
|
||||
// Addr returns address where alerts are sent.
|
||||
Addr() string
|
||||
// Close is a destructor for the Notifier
|
||||
Close()
|
||||
}
|
||||
|
||||
@@ -17,8 +17,10 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"net"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -26,6 +28,7 @@ import (
|
||||
textTpl "text/template"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
// metric is private copy of datasource.Metric,
|
||||
@@ -61,6 +64,7 @@ var tmplFunc textTpl.FuncMap
|
||||
|
||||
// InitTemplateFunc initiates template helper functions
|
||||
func InitTemplateFunc(externalURL *url.URL) {
|
||||
// See https://prometheus.io/docs/prometheus/latest/configuration/template_reference/
|
||||
tmplFunc = textTpl.FuncMap{
|
||||
/* Strings */
|
||||
|
||||
@@ -91,6 +95,24 @@ func InitTemplateFunc(externalURL *url.URL) {
|
||||
// alias for https://golang.org/pkg/strings/#ToLower
|
||||
"toLower": strings.ToLower,
|
||||
|
||||
// stripPort splits string into host and port, then returns only host.
|
||||
"stripPort": func(hostPort string) string {
|
||||
host, _, err := net.SplitHostPort(hostPort)
|
||||
if err != nil {
|
||||
return hostPort
|
||||
}
|
||||
return host
|
||||
},
|
||||
|
||||
// parseDuration parses a duration string such as "1h" into the number of seconds it represents
|
||||
"parseDuration": func(s string) (float64, error) {
|
||||
d, err := promutils.ParseDuration(s)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return d.Seconds(), nil
|
||||
},
|
||||
|
||||
/* Numbers */
|
||||
|
||||
// humanize converts given number to a human readable format
|
||||
@@ -261,6 +283,14 @@ func InitTemplateFunc(externalURL *url.URL) {
|
||||
return m.Labels[label]
|
||||
},
|
||||
|
||||
// sortByLabel sorts the given metrics by provided label key
|
||||
"sortByLabel": func(label string, metrics []metric) []metric {
|
||||
sort.SliceStable(metrics, func(i, j int) bool {
|
||||
return metrics[i].Labels[label] < metrics[j].Labels[label]
|
||||
})
|
||||
return metrics
|
||||
},
|
||||
|
||||
// value returns the value of the given metric.
|
||||
// usually used alongside with `query` template function.
|
||||
"value": func(m metric) float64 {
|
||||
|
||||
13
app/vmalert/notifier/testdata/consul.good.yaml
vendored
Normal file
13
app/vmalert/notifier/testdata/consul.good.yaml
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
consul_sd_configs:
|
||||
- server: localhost:8500
|
||||
scheme: http
|
||||
services:
|
||||
- alertmanager
|
||||
- server: localhost:8500
|
||||
services:
|
||||
- consul
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_consul_tags]
|
||||
regex: .*,__scheme__=([^,]+),.*
|
||||
replacement: '${1}'
|
||||
target_label: __scheme__
|
||||
18
app/vmalert/notifier/testdata/mixed.good.yaml
vendored
Normal file
18
app/vmalert/notifier/testdata/mixed.good.yaml
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:9093
|
||||
- localhost:9095
|
||||
|
||||
consul_sd_configs:
|
||||
- server: localhost:8500
|
||||
scheme: http
|
||||
services:
|
||||
- alertmanager
|
||||
- server: localhost:8500
|
||||
services:
|
||||
- consul
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_consul_tags]
|
||||
regex: .*,__scheme__=([^,]+),.*
|
||||
replacement: '${1}'
|
||||
target_label: __scheme__
|
||||
4
app/vmalert/notifier/testdata/static.good.yaml
vendored
Normal file
4
app/vmalert/notifier/testdata/static.good.yaml
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:9093
|
||||
- localhost:9095
|
||||
5
app/vmalert/notifier/testdata/unknownFields.bad.yaml
vendored
Normal file
5
app/vmalert/notifier/testdata/unknownFields.bad.yaml
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
scheme: https
|
||||
unknown: field
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:9093
|
||||
@@ -10,8 +10,8 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// RecordingRule is a Rule that supposed
|
||||
@@ -31,6 +31,8 @@ type RecordingRule struct {
|
||||
mu sync.RWMutex
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores the duration of the last Exec call
|
||||
lastExecDuration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
@@ -43,8 +45,8 @@ type RecordingRule struct {
|
||||
}
|
||||
|
||||
type recordingRuleMetrics struct {
|
||||
errors *gauge
|
||||
samples *gauge
|
||||
errors *utils.Gauge
|
||||
samples *utils.Gauge
|
||||
}
|
||||
|
||||
// String implements Stringer interface
|
||||
@@ -75,7 +77,7 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
|
||||
rr.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
||||
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
rr.mu.RLock()
|
||||
defer rr.mu.RUnlock()
|
||||
@@ -84,7 +86,7 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
||||
}
|
||||
return 1
|
||||
})
|
||||
rr.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
||||
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
rr.mu.RLock()
|
||||
defer rr.mu.RUnlock()
|
||||
@@ -95,8 +97,8 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
||||
|
||||
// Close unregisters rule metrics
|
||||
func (rr *RecordingRule) Close() {
|
||||
metrics.UnregisterMetric(rr.metrics.errors.name)
|
||||
metrics.UnregisterMetric(rr.metrics.samples.name)
|
||||
rr.metrics.errors.Unregister()
|
||||
rr.metrics.samples.Unregister()
|
||||
}
|
||||
|
||||
// ExecRange executes recording rule on the given time range similarly to Exec.
|
||||
@@ -122,12 +124,13 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
|
||||
}
|
||||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := rr.q.Query(ctx, rr.Expr)
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := rr.q.Query(ctx, rr.Expr, ts)
|
||||
rr.mu.Lock()
|
||||
defer rr.mu.Unlock()
|
||||
|
||||
rr.lastExecTime = time.Now()
|
||||
rr.lastExecTime = ts
|
||||
rr.lastExecDuration = time.Since(ts)
|
||||
rr.lastExecError = err
|
||||
rr.lastExecSamples = len(qMetrics)
|
||||
if err != nil {
|
||||
@@ -193,23 +196,27 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// RuleAPI returns Rule representation in form
|
||||
// of APIRecordingRule
|
||||
func (rr *RecordingRule) RuleAPI() APIRecordingRule {
|
||||
var lastErr string
|
||||
if rr.lastExecError != nil {
|
||||
lastErr = rr.lastExecError.Error()
|
||||
}
|
||||
return APIRecordingRule{
|
||||
// ToAPI returns Rule's representation in form
|
||||
// of APIRule
|
||||
func (rr *RecordingRule) ToAPI() APIRule {
|
||||
r := APIRule{
|
||||
Type: "recording",
|
||||
DatasourceType: rr.Type.String(),
|
||||
Name: rr.Name,
|
||||
Query: rr.Expr,
|
||||
Labels: rr.Labels,
|
||||
LastEvaluation: rr.lastExecTime,
|
||||
EvaluationTime: rr.lastExecDuration.Seconds(),
|
||||
Health: "ok",
|
||||
LastSamples: rr.lastExecSamples,
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
Name: rr.Name,
|
||||
Type: rr.Type.String(),
|
||||
Expression: rr.Expr,
|
||||
LastError: lastErr,
|
||||
LastSamples: rr.lastExecSamples,
|
||||
LastExec: rr.lastExecTime,
|
||||
Labels: rr.Labels,
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
}
|
||||
|
||||
if rr.lastExecError != nil {
|
||||
r.LastError = rr.lastExecError.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ func TestRecoridngRule_Exec(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
tc.rule.q = fq
|
||||
tss, err := tc.rule.Exec(context.TODO())
|
||||
tss, err := tc.rule.Exec(context.TODO(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
@@ -178,7 +178,7 @@ func TestRecoridngRule_ExecNegative(t *testing.T) {
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
rr.q = fq
|
||||
_, err := rr.Exec(context.TODO())
|
||||
_, err := rr.Exec(context.TODO(), time.Now())
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
@@ -193,7 +193,7 @@ func TestRecoridngRule_ExecNegative(t *testing.T) {
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.Exec(context.TODO())
|
||||
_, err = rr.Exec(context.TODO(), time.Now())
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
|
||||
@@ -13,11 +13,13 @@ var (
|
||||
addr = flag.String("remoteRead.url", "", "Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts "+
|
||||
"state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state. "+
|
||||
"E.g. http://127.0.0.1:8428. See also -remoteRead.disablePathAppend")
|
||||
|
||||
basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
|
||||
basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
|
||||
basicAuthPasswordFile = flag.String("remoteRead.basicAuth.passwordFile", "", "Optional path to basic auth password to use for -remoteRead.url")
|
||||
bearerToken = flag.String("remoteRead.bearerToken", "", "Optional bearer auth token to use for -remoteRead.url.")
|
||||
bearerTokenFile = flag.String("remoteRead.bearerTokenFile", "", "Optional path to bearer token file to use for -remoteRead.url.")
|
||||
|
||||
bearerToken = flag.String("remoteRead.bearerToken", "", "Optional bearer auth token to use for -remoteRead.url.")
|
||||
bearerTokenFile = flag.String("remoteRead.bearerTokenFile", "", "Optional path to bearer token file to use for -remoteRead.url.")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
|
||||
tlsCertFile = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
|
||||
@@ -26,6 +28,13 @@ var (
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
|
||||
"By default the server name from -remoteRead.url is used")
|
||||
|
||||
oauth2ClientID = flag.String("remoteRead.oauth2.clientID", "", "Optional OAuth2 clientID to use for -remoteRead.url.")
|
||||
oauth2ClientSecret = flag.String("remoteRead.oauth2.clientSecret", "", "Optional OAuth2 clientSecret to use for -remoteRead.url.")
|
||||
oauth2ClientSecretFile = flag.String("remoteRead.oauth2.clientSecretFile", "", "Optional OAuth2 clientSecretFile to use for -remoteRead.url.")
|
||||
oauth2TokenURL = flag.String("remoteRead.oauth2.tokenUrl", "", "Optional OAuth2 tokenURL to use for -remoteRead.url. ")
|
||||
oauth2Scopes = flag.String("remoteRead.oauth2.scopes", "", "Optional OAuth2 scopes to use for -remoteRead.url. Scopes must be delimited by ';'.")
|
||||
|
||||
disablePathAppend = flag.Bool("remoteRead.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/query' path to the configured -remoteRead.url.")
|
||||
)
|
||||
|
||||
@@ -39,7 +48,11 @@ func Init() (datasource.QuerierBuilder, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
authCfg, err := utils.AuthConfig(*basicAuthUsername, *basicAuthPassword, *basicAuthPasswordFile, *bearerToken, *bearerTokenFile)
|
||||
|
||||
authCfg, err := utils.AuthConfig(
|
||||
utils.WithBasicAuth(*basicAuthUsername, *basicAuthPassword, *basicAuthPasswordFile),
|
||||
utils.WithBearer(*bearerToken, *bearerTokenFile),
|
||||
utils.WithOAuth(*oauth2ClientID, *oauth2ClientSecret, *oauth2ClientSecretFile, *oauth2TokenURL, *oauth2Scopes))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to configure auth: %w", err)
|
||||
}
|
||||
|
||||
@@ -13,11 +13,13 @@ var (
|
||||
addr = flag.String("remoteWrite.url", "", "Optional URL to VictoriaMetrics or vminsert where to persist alerts state "+
|
||||
"and recording rules results in form of timeseries. For example, if -remoteWrite.url=http://127.0.0.1:8428 is specified, "+
|
||||
"then the alerts state will be written to http://127.0.0.1:8428/api/v1/write . See also -remoteWrite.disablePathAppend")
|
||||
|
||||
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
|
||||
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
|
||||
basicAuthPasswordFile = flag.String("remoteWrite.basicAuth.passwordFile", "", "Optional path to basic auth password to use for -remoteWrite.url")
|
||||
bearerToken = flag.String("remoteWrite.bearerToken", "", "Optional bearer auth token to use for -remoteWrite.url.")
|
||||
bearerTokenFile = flag.String("remoteWrite.bearerTokenFile", "", "Optional path to bearer token file to use for -remoteWrite.url.")
|
||||
|
||||
bearerToken = flag.String("remoteWrite.bearerToken", "", "Optional bearer auth token to use for -remoteWrite.url.")
|
||||
bearerTokenFile = flag.String("remoteWrite.bearerTokenFile", "", "Optional path to bearer token file to use for -remoteWrite.url.")
|
||||
|
||||
maxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
|
||||
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines defines max number of timeseries to be flushed at once")
|
||||
@@ -31,6 +33,13 @@ var (
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
|
||||
"By default the server name from -remoteWrite.url is used")
|
||||
|
||||
oauth2ClientID = flag.String("remoteWrite.oauth2.clientID", "", "Optional OAuth2 clientID to use for -remoteWrite.url.")
|
||||
oauth2ClientSecret = flag.String("remoteWrite.oauth2.clientSecret", "", "Optional OAuth2 clientSecret to use for -remoteWrite.url.")
|
||||
oauth2ClientSecretFile = flag.String("remoteWrite.oauth2.clientSecretFile", "", "Optional OAuth2 clientSecretFile to use for -remoteWrite.url.")
|
||||
oauth2TokenURL = flag.String("remoteWrite.oauth2.tokenUrl", "", "Optional OAuth2 tokenURL to use for -notifier.url.")
|
||||
oauth2Scopes = flag.String("remoteWrite.oauth2.scopes", "", "Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'.")
|
||||
|
||||
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
|
||||
)
|
||||
|
||||
@@ -46,7 +55,10 @@ func Init(ctx context.Context) (*Client, error) {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
|
||||
authCfg, err := utils.AuthConfig(*basicAuthUsername, *basicAuthPassword, *basicAuthPasswordFile, *bearerToken, *bearerTokenFile)
|
||||
authCfg, err := utils.AuthConfig(
|
||||
utils.WithBasicAuth(*basicAuthUsername, *basicAuthPassword, *basicAuthPasswordFile),
|
||||
utils.WithBearer(*bearerToken, *bearerTokenFile),
|
||||
utils.WithOAuth(*oauth2ClientID, *oauth2ClientSecret, *oauth2ClientSecretFile, *oauth2TokenURL, *oauth2Scopes))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to configure auth: %w", err)
|
||||
}
|
||||
|
||||
@@ -225,7 +225,7 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
|
||||
|
||||
droppedRows.Add(len(wr.Timeseries))
|
||||
droppedBytes.Add(len(b))
|
||||
logger.Errorf("all %d attempts to send request failed - dropping %d timeseries",
|
||||
logger.Errorf("all %d attempts to send request failed - dropping %d time series",
|
||||
attempts, len(wr.Timeseries))
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
type fakeReplayQuerier struct {
|
||||
@@ -83,7 +83,7 @@ func TestReplay(t *testing.T) {
|
||||
to: "2021-01-01T15:02:30.000Z",
|
||||
maxDP: 60,
|
||||
cfg: []config.Group{
|
||||
{Interval: utils.NewPromDuration(time.Minute), Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
|
||||
{Interval: promutils.NewDuration(time.Minute), Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
|
||||
},
|
||||
qb: &fakeReplayQuerier{
|
||||
registry: map[string]map[string]struct{}{
|
||||
|
||||
@@ -3,8 +3,9 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// Rule represents alerting or recording rule
|
||||
@@ -14,13 +15,15 @@ type Rule interface {
|
||||
// ID returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// Exec executes the rule with given context
|
||||
Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error)
|
||||
// Exec executes the rule with given context at the given timestamp
|
||||
Exec(ctx context.Context, ts time.Time) ([]prompbmarshal.TimeSeries, error)
|
||||
// ExecRange executes the rule on the given time range
|
||||
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
// ToAPI converts Rule into APIRule
|
||||
ToAPI() APIRule
|
||||
// Close performs the shutdown procedures for rule
|
||||
// such as metrics unregister
|
||||
Close()
|
||||
|
||||
@@ -30,3 +30,20 @@ func newTimeSeries(values []float64, timestamps []int64, labels map[string]strin
|
||||
}
|
||||
return ts
|
||||
}
|
||||
|
||||
// newTimeSeriesPB creates prompbmarshal.TimeSeries with given
|
||||
// values, timestamps and labels.
|
||||
// It expects that labels are already sorted.
|
||||
func newTimeSeriesPB(values []float64, timestamps []int64, labels []prompbmarshal.Label) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{
|
||||
Samples: make([]prompbmarshal.Sample, len(values)),
|
||||
}
|
||||
for i := range values {
|
||||
ts.Samples[i] = prompbmarshal.Sample{
|
||||
Value: values[i],
|
||||
Timestamp: time.Unix(timestamps[i], 0).UnixNano() / 1e6,
|
||||
}
|
||||
}
|
||||
ts.Labels = labels
|
||||
return ts
|
||||
}
|
||||
|
||||
@@ -1,18 +1,60 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
)
|
||||
|
||||
// AuthConfigOptions options which helps build promauth.Config
|
||||
type AuthConfigOptions func(config *promauth.HTTPClientConfig)
|
||||
|
||||
// AuthConfig returns promauth.Config based on the given params
|
||||
func AuthConfig(baUser, baPass, baFile, bearerToken, bearerTokenFile string) (*promauth.Config, error) {
|
||||
var baCfg *promauth.BasicAuthConfig
|
||||
if baUser != "" || baPass != "" || baFile != "" {
|
||||
baCfg = &promauth.BasicAuthConfig{
|
||||
Username: baUser,
|
||||
Password: promauth.NewSecret(baPass),
|
||||
PasswordFile: baFile,
|
||||
func AuthConfig(filterOptions ...AuthConfigOptions) (*promauth.Config, error) {
|
||||
authCfg := &promauth.HTTPClientConfig{}
|
||||
for _, option := range filterOptions {
|
||||
option(authCfg)
|
||||
}
|
||||
|
||||
return authCfg.NewConfig(".")
|
||||
}
|
||||
|
||||
// WithBasicAuth returns AuthConfigOptions and initialized promauth.BasicAuthConfig based on given params
|
||||
func WithBasicAuth(username, password, passwordFile string) AuthConfigOptions {
|
||||
return func(config *promauth.HTTPClientConfig) {
|
||||
if username != "" || password != "" || passwordFile != "" {
|
||||
config.BasicAuth = &promauth.BasicAuthConfig{
|
||||
Username: username,
|
||||
Password: promauth.NewSecret(password),
|
||||
PasswordFile: passwordFile,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// WithBearer returns AuthConfigOptions and set BearerToken or BearerTokenFile based on given params
|
||||
func WithBearer(token, tokenFile string) AuthConfigOptions {
|
||||
return func(config *promauth.HTTPClientConfig) {
|
||||
if token != "" {
|
||||
config.BearerToken = promauth.NewSecret(token)
|
||||
}
|
||||
if tokenFile != "" {
|
||||
config.BearerTokenFile = tokenFile
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// WithOAuth returns AuthConfigOptions and set OAuth params based on given params
|
||||
func WithOAuth(clientID, clientSecret, clientSecretFile, tokenURL, scopes string) AuthConfigOptions {
|
||||
return func(config *promauth.HTTPClientConfig) {
|
||||
if clientSecretFile != "" || clientSecret != "" {
|
||||
config.OAuth2 = &promauth.OAuth2Config{
|
||||
ClientID: clientID,
|
||||
ClientSecret: promauth.NewSecret(clientSecret),
|
||||
ClientSecretFile: clientSecretFile,
|
||||
TokenURL: tokenURL,
|
||||
Scopes: strings.Split(scopes, ";"),
|
||||
}
|
||||
}
|
||||
}
|
||||
return promauth.NewConfig(".", nil, baCfg, bearerToken, bearerTokenFile, nil, nil)
|
||||
}
|
||||
|
||||
54
app/vmalert/utils/metrics.go
Normal file
54
app/vmalert/utils/metrics.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package utils
|
||||
|
||||
import "github.com/VictoriaMetrics/metrics"
|
||||
|
||||
type namedMetric struct {
|
||||
Name string
|
||||
}
|
||||
|
||||
// Unregister removes the metric by name from default registry
|
||||
func (nm namedMetric) Unregister() {
|
||||
metrics.UnregisterMetric(nm.Name)
|
||||
}
|
||||
|
||||
// Gauge is a metrics.Gauge with Name
|
||||
type Gauge struct {
|
||||
namedMetric
|
||||
*metrics.Gauge
|
||||
}
|
||||
|
||||
// GetOrCreateGauge creates a new Gauge with the given name
|
||||
func GetOrCreateGauge(name string, f func() float64) *Gauge {
|
||||
return &Gauge{
|
||||
namedMetric: namedMetric{Name: name},
|
||||
Gauge: metrics.GetOrCreateGauge(name, f),
|
||||
}
|
||||
}
|
||||
|
||||
// Counter is a metrics.Counter with Name
|
||||
type Counter struct {
|
||||
namedMetric
|
||||
*metrics.Counter
|
||||
}
|
||||
|
||||
// GetOrCreateCounter creates a new Counter with the given name
|
||||
func GetOrCreateCounter(name string) *Counter {
|
||||
return &Counter{
|
||||
namedMetric: namedMetric{Name: name},
|
||||
Counter: metrics.GetOrCreateCounter(name),
|
||||
}
|
||||
}
|
||||
|
||||
// Summary is a metrics.Summary with Name
|
||||
type Summary struct {
|
||||
namedMetric
|
||||
*metrics.Summary
|
||||
}
|
||||
|
||||
// GetOrCreateSummary creates a new Summary with the given name
|
||||
func GetOrCreateSummary(name string) *Summary {
|
||||
return &Summary{
|
||||
namedMetric: namedMetric{Name: name},
|
||||
Summary: metrics.GetOrCreateSummary(name),
|
||||
}
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// PromDuration is Prometheus duration.
|
||||
type PromDuration struct {
|
||||
milliseconds int64
|
||||
}
|
||||
|
||||
// NewPromDuration returns PromDuration for given d.
|
||||
func NewPromDuration(d time.Duration) PromDuration {
|
||||
return PromDuration{
|
||||
milliseconds: d.Milliseconds(),
|
||||
}
|
||||
}
|
||||
|
||||
// MarshalYAML implements yaml.Marshaler interface.
|
||||
func (pd PromDuration) MarshalYAML() (interface{}, error) {
|
||||
return pd.Duration().String(), nil
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements yaml.Unmarshaler interface.
|
||||
func (pd *PromDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
var s string
|
||||
if err := unmarshal(&s); err != nil {
|
||||
return err
|
||||
}
|
||||
ms, err := metricsql.DurationValue(s, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
pd.milliseconds = ms
|
||||
return nil
|
||||
}
|
||||
|
||||
// Duration returns duration for pd.
|
||||
func (pd *PromDuration) Duration() time.Duration {
|
||||
return time.Duration(pd.milliseconds) * time.Millisecond
|
||||
}
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/tpl"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
@@ -24,8 +25,11 @@ var (
|
||||
|
||||
func initLinks() {
|
||||
pathPrefix := httpserver.GetPathPrefix()
|
||||
if pathPrefix == "" {
|
||||
pathPrefix = "/"
|
||||
}
|
||||
apiLinks = [][2]string{
|
||||
{path.Join(pathPrefix, "api/v1/groups"), "list all loaded groups and rules"},
|
||||
{path.Join(pathPrefix, "api/v1/rules"), "list all loaded groups and rules"},
|
||||
{path.Join(pathPrefix, "api/v1/alerts"), "list all active alerts"},
|
||||
{path.Join(pathPrefix, "api/v1/groupID/alertID/status"), "get alert status by ID"},
|
||||
{path.Join(pathPrefix, "flags"), "command-line flags"},
|
||||
@@ -33,9 +37,10 @@ func initLinks() {
|
||||
{path.Join(pathPrefix, "-/reload"), "reload configuration"},
|
||||
}
|
||||
navItems = []tpl.NavItem{
|
||||
{Name: "vmalert", Url: pathPrefix},
|
||||
{Name: "vmalert", Url: path.Join(pathPrefix, "/")},
|
||||
{Name: "Groups", Url: path.Join(pathPrefix, "groups")},
|
||||
{Name: "Alerts", Url: path.Join(pathPrefix, "alerts")},
|
||||
{Name: "Notifiers", Url: path.Join(pathPrefix, "notifiers")},
|
||||
{Name: "Docs", Url: "https://docs.victoriametrics.com/vmalert.html"},
|
||||
}
|
||||
}
|
||||
@@ -49,6 +54,11 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
initLinks()
|
||||
})
|
||||
|
||||
pathPrefix := httpserver.GetPathPrefix()
|
||||
if pathPrefix == "" {
|
||||
pathPrefix = "/"
|
||||
}
|
||||
|
||||
switch r.URL.Path {
|
||||
case "/":
|
||||
if r.Method != "GET" {
|
||||
@@ -57,12 +67,15 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
WriteWelcome(w)
|
||||
return true
|
||||
case "/alerts":
|
||||
WriteListAlerts(w, rh.groupAlerts())
|
||||
WriteListAlerts(w, pathPrefix, rh.groupAlerts())
|
||||
return true
|
||||
case "/groups":
|
||||
WriteListGroups(w, rh.groups())
|
||||
return true
|
||||
case "/api/v1/groups":
|
||||
case "/notifiers":
|
||||
WriteListTargets(w, notifier.GetTargets())
|
||||
return true
|
||||
case "/api/v1/rules":
|
||||
data, err := rh.listGroups()
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
@@ -108,16 +121,16 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
|
||||
// <groupID>/<alertID>/status
|
||||
WriteAlert(w, alert)
|
||||
WriteAlert(w, pathPrefix, alert)
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
type listGroupsResponse struct {
|
||||
Data struct {
|
||||
Status string `json:"status"`
|
||||
Data struct {
|
||||
Groups []APIGroup `json:"groups"`
|
||||
} `json:"data"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) groups() []APIGroup {
|
||||
@@ -136,6 +149,7 @@ func (rh *requestHandler) groups() []APIGroup {
|
||||
|
||||
return groups
|
||||
}
|
||||
|
||||
func (rh *requestHandler) listGroups() ([]byte, error) {
|
||||
lr := listGroupsResponse{Status: "success"}
|
||||
lr.Data.Groups = rh.groups()
|
||||
@@ -150,10 +164,10 @@ func (rh *requestHandler) listGroups() ([]byte, error) {
|
||||
}
|
||||
|
||||
type listAlertsResponse struct {
|
||||
Data struct {
|
||||
Status string `json:"status"`
|
||||
Data struct {
|
||||
Alerts []*APIAlert `json:"alerts"`
|
||||
} `json:"data"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) groupAlerts() []GroupAlerts {
|
||||
@@ -168,7 +182,7 @@ func (rh *requestHandler) groupAlerts() []GroupAlerts {
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
alerts = append(alerts, a.AlertsAPI()...)
|
||||
alerts = append(alerts, a.AlertsToAPI()...)
|
||||
}
|
||||
if len(alerts) > 0 {
|
||||
groupAlerts = append(groupAlerts, GroupAlerts{
|
||||
@@ -191,7 +205,7 @@ func (rh *requestHandler) listAlerts() ([]byte, error) {
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsAPI()...)
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsToAPI()...)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,8 +3,10 @@
|
||||
{% import (
|
||||
"time"
|
||||
"sort"
|
||||
"path"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/tpl"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
) %}
|
||||
|
||||
|
||||
@@ -29,14 +31,7 @@
|
||||
rOk := make(map[string]int)
|
||||
rNotOk := make(map[string]int)
|
||||
for _, g := range groups {
|
||||
for _, r := range g.AlertingRules{
|
||||
if r.LastError != "" {
|
||||
rNotOk[g.Name]++
|
||||
} else {
|
||||
rOk[g.Name]++
|
||||
}
|
||||
}
|
||||
for _, r := range g.RecordingRules{
|
||||
for _, r := range g.Rules {
|
||||
if r.LastError != "" {
|
||||
rNotOk[g.Name]++
|
||||
} else {
|
||||
@@ -50,7 +45,7 @@
|
||||
{% for _, g := range groups %}
|
||||
<div class="group-heading{% if rNotOk[g.Name] > 0 %} alert-danger{% endif %}" data-bs-target="rules-{%s g.ID %}">
|
||||
<span class="anchor" id="group-{%s g.ID %}"></span>
|
||||
<a href="#group-{%s g.ID %}">{%s g.Name %}{% if g.Type != "prometheus" %} ({%s g.Type %}){% endif %} (every {%s g.Interval %})</a>
|
||||
<a href="#group-{%s g.ID %}">{%s g.Name %}{% if g.Type != "prometheus" %} ({%s g.Type %}){% endif %} (every {%f.0 g.Interval %}s)</a>
|
||||
{% if rNotOk[g.Name] > 0 %}<span class="badge bg-danger" title="Number of rules with status Error">{%d rNotOk[g.Name] %}</span> {% endif %}
|
||||
<span class="badge bg-success" title="Number of rules withs status Ok">{%d rOk[g.Name] %}</span>
|
||||
<p class="fs-6 fw-lighter">{%s g.File %}</p>
|
||||
@@ -73,34 +68,24 @@
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for _, ar := range g.AlertingRules %}
|
||||
<tr{% if ar.LastError != "" %} class="alert-danger"{% endif %}>
|
||||
{% for _, r := range g.Rules %}
|
||||
<tr{% if r.LastError != "" %} class="alert-danger"{% endif %}>
|
||||
<td>
|
||||
<b>alert:</b> {%s ar.Name %} (for: {%v ar.For %})<br>
|
||||
<code><pre>{%s ar.Expression %}</pre></code><br>
|
||||
{% if len(ar.Labels) > 0 %} <b>Labels:</b>{% endif %}
|
||||
{% for k, v := range ar.Labels %}
|
||||
{% if r.Type == "alerting" %}
|
||||
<b>alert:</b> (for: {%v r.Duration %})
|
||||
{% else %}
|
||||
<b>record:</b> {%s r.Name %}
|
||||
{% endif %}
|
||||
<br>
|
||||
<code><pre>{%s r.Query %}</pre></code><br>
|
||||
{% if len(r.Labels) > 0 %} <b>Labels:</b>{% endif %}
|
||||
{% for k, v := range r.Labels %}
|
||||
<span class="ms-1 badge bg-primary">{%s k %}={%s v %}</span>
|
||||
{% endfor %}
|
||||
</td>
|
||||
<td><div class="error-cell">{%s ar.LastError %}</div></td>
|
||||
<td>{%d ar.LastSamples %}</td>
|
||||
<td>{%f.3 time.Since(ar.LastExec).Seconds() %}s ago</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% for _, rr := range g.RecordingRules %}
|
||||
<tr>
|
||||
<td>
|
||||
<b>record:</b> {%s rr.Name %}<br>
|
||||
<code><pre>{%s rr.Expression %}</pre></code>
|
||||
{% if len(rr.Labels) > 0 %} <b>Labels:</b>{% endif %}
|
||||
{% for k, v := range rr.Labels %}
|
||||
<span class="ms-1 badge bg-primary">{%s k %}={%s v %}</span>
|
||||
{% endfor %}
|
||||
</td>
|
||||
<td><div class="error-cell">{%s rr.LastError %}</div></td>
|
||||
<td>{%d rr.LastSamples %}</td>
|
||||
<td>{%f.3 time.Since(rr.LastExec).Seconds() %}s ago</td>
|
||||
<td><div class="error-cell">{%s r.LastError %}</div></td>
|
||||
<td>{%d r.LastSamples %}</td>
|
||||
<td>{%f.3 time.Since(r.LastEvaluation).Seconds() %}s ago</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
@@ -119,7 +104,7 @@
|
||||
{% endfunc %}
|
||||
|
||||
|
||||
{% func ListAlerts(groupAlerts []GroupAlerts) %}
|
||||
{% func ListAlerts(pathPrefix string, groupAlerts []GroupAlerts) %}
|
||||
{%= tpl.Header("Alerts", navItems) %}
|
||||
{% if len(groupAlerts) > 0 %}
|
||||
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
||||
@@ -184,7 +169,7 @@
|
||||
</td>
|
||||
<td>{%s ar.Value %}</td>
|
||||
<td>
|
||||
<a href="/{%s g.ID %}/{%s ar.ID %}/status">Details</a>
|
||||
<a href="{%s path.Join(pathPrefix, g.ID, ar.ID, "status") %}">Details</a>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
@@ -205,7 +190,63 @@
|
||||
|
||||
{% endfunc %}
|
||||
|
||||
{% func Alert(alert *APIAlert) %}
|
||||
{% func ListTargets(targets map[notifier.TargetType][]notifier.Target) %}
|
||||
{%= tpl.Header("Notifiers", navItems) %}
|
||||
{% if len(targets) > 0 %}
|
||||
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
||||
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
||||
|
||||
{%code
|
||||
var keys []string
|
||||
for key := range targets {
|
||||
keys = append(keys, string(key))
|
||||
}
|
||||
sort.Strings(keys)
|
||||
%}
|
||||
|
||||
{% for i := range keys %}
|
||||
{%code typeK, ns := keys[i], targets[notifier.TargetType(keys[i])]
|
||||
count := len(ns)
|
||||
%}
|
||||
<div class="group-heading data-bs-target="rules-{%s typeK %}">
|
||||
<span class="anchor" id="notifiers-{%s typeK %}"></span>
|
||||
<a href="#notifiers-{%s typeK %}">{%s typeK %} ({%d count %})</a>
|
||||
</div>
|
||||
<div class="collapse show" id="notifiers-{%s typeK %}">
|
||||
<table class="table table-striped table-hover table-sm">
|
||||
<thead>
|
||||
<tr>
|
||||
<th scope="col">Labels</th>
|
||||
<th scope="col">Address</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for _, n := range ns %}
|
||||
<tr>
|
||||
<td>
|
||||
{% for _, l := range n.Labels %}
|
||||
<span class="ms-1 badge bg-primary">{%s l.Name %}={%s l.Value %}</span>
|
||||
{% endfor %}
|
||||
</td>
|
||||
<td>{%s n.Notifier.Addr() %}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endfor %}
|
||||
|
||||
{% else %}
|
||||
<div>
|
||||
<p>No items...</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{%= tpl.Footer() %}
|
||||
|
||||
{% endfunc %}
|
||||
|
||||
{% func Alert(pathPrefix string, alert *APIAlert) %}
|
||||
{%= tpl.Header("", navItems) %}
|
||||
{%code
|
||||
var labelKeys []string
|
||||
@@ -272,7 +313,7 @@
|
||||
Group
|
||||
</div>
|
||||
<div class="col">
|
||||
<a target="_blank" href="/groups#group-{%s alert.GroupID %}">{%s alert.GroupID %}</a>
|
||||
<a target="_blank" href="{%s path.Join(pathPrefix,"groups") %}#group-{%s alert.GroupID %}">{%s alert.GroupID %}</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -14,7 +14,7 @@ func TestHandler(t *testing.T) {
|
||||
ar := &AlertingRule{
|
||||
Name: "alert",
|
||||
alerts: map[uint64]*notifier.Alert{
|
||||
0: {},
|
||||
0: {State: notifier.StateFiring},
|
||||
},
|
||||
}
|
||||
g := &Group{
|
||||
@@ -54,9 +54,9 @@ func TestHandler(t *testing.T) {
|
||||
t.Errorf("expected 1 alert got %d", length)
|
||||
}
|
||||
})
|
||||
t.Run("/api/v1/groups", func(t *testing.T) {
|
||||
t.Run("/api/v1/rules", func(t *testing.T) {
|
||||
lr := listGroupsResponse{}
|
||||
getResp(ts.URL+"/api/v1/groups", &lr, 200)
|
||||
getResp(ts.URL+"/api/v1/rules", &lr, 200)
|
||||
if length := len(lr.Data.Groups); length != 1 {
|
||||
t.Errorf("expected 1 group got %d", length)
|
||||
}
|
||||
|
||||
@@ -4,63 +4,61 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// APIAlert represents an notifier.AlertingRule state
|
||||
// APIAlert represents a notifier.AlertingRule state
|
||||
// for WEB view
|
||||
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIAlert struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
RuleID string `json:"rule_id"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
State string `json:"state"`
|
||||
Name string `json:"name"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
ActiveAt time.Time `json:"activeAt"`
|
||||
SourceLink string `json:"source"`
|
||||
Restored bool `json:"restored"`
|
||||
|
||||
// Additional fields
|
||||
|
||||
// ID is an unique Alert's ID within a group
|
||||
ID string `json:"id"`
|
||||
// RuleID is an unique Rule's ID within a group
|
||||
RuleID string `json:"rule_id"`
|
||||
// GroupID is an unique Group's ID
|
||||
GroupID string `json:"group_id"`
|
||||
// Expression contains the PromQL/MetricsQL expression
|
||||
// for Rule's evaluation
|
||||
Expression string `json:"expression"`
|
||||
// SourceLink contains a link to a system which should show
|
||||
// why Alert was generated
|
||||
SourceLink string `json:"source"`
|
||||
// Restored shows whether Alert's state was restored on restart
|
||||
Restored bool `json:"restored"`
|
||||
}
|
||||
|
||||
// APIGroup represents Group for WEB view
|
||||
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIGroup struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
ID string `json:"id"`
|
||||
File string `json:"file"`
|
||||
Interval string `json:"interval"`
|
||||
Concurrency int `json:"concurrency"`
|
||||
Params []string `json:"params"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
AlertingRules []APIAlertingRule `json:"alerting_rules"`
|
||||
RecordingRules []APIRecordingRule `json:"recording_rules"`
|
||||
}
|
||||
// Name is the group name as present in the config
|
||||
Name string `json:"name"`
|
||||
// Rules contains both recording and alerting rules
|
||||
Rules []APIRule `json:"rules"`
|
||||
// Interval is the Group's evaluation interval in float seconds as present in the file.
|
||||
Interval float64 `json:"interval"`
|
||||
// LastEvaluation is the timestamp of the last time the Group was executed
|
||||
LastEvaluation time.Time `json:"lastEvaluation"`
|
||||
|
||||
// APIAlertingRule represents AlertingRule for WEB view
|
||||
type APIAlertingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
For string `json:"for"`
|
||||
LastError string `json:"last_error"`
|
||||
LastSamples int `json:"last_samples"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
}
|
||||
// Additional fields
|
||||
|
||||
// APIRecordingRule represents RecordingRule for WEB view
|
||||
type APIRecordingRule struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
GroupID string `json:"group_id"`
|
||||
Expression string `json:"expression"`
|
||||
LastError string `json:"last_error"`
|
||||
LastSamples int `json:"last_samples"`
|
||||
LastExec time.Time `json:"last_exec"`
|
||||
Labels map[string]string `json:"labels"`
|
||||
// Type shows the datasource type (prometheus or graphite) of the Group
|
||||
Type string `json:"type"`
|
||||
// ID is a unique Group ID
|
||||
ID string `json:"id"`
|
||||
// File contains a path to the file with Group's config
|
||||
File string `json:"file"`
|
||||
// Concurrency shows how many rules may be evaluated simultaneously
|
||||
Concurrency int `json:"concurrency"`
|
||||
// Params contains HTTP URL parameters added to each Rule's request
|
||||
Params []string `json:"params,omitempty"`
|
||||
// Labels is a set of label value pairs, that will be added to every rule.
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
}
|
||||
|
||||
// GroupAlerts represents a group of alerts for WEB view
|
||||
@@ -68,3 +66,43 @@ type GroupAlerts struct {
|
||||
Group APIGroup
|
||||
Alerts []*APIAlert
|
||||
}
|
||||
|
||||
// APIRule represents a Rule for WEB view
|
||||
// see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIRule struct {
|
||||
// State must be one of these under following scenarios
|
||||
// "pending": at least 1 alert in the rule in pending state and no other alert in firing state.
|
||||
// "firing": at least 1 alert in the rule in firing state.
|
||||
// "inactive": no alert in the rule in firing or pending state.
|
||||
State string `json:"state"`
|
||||
Name string `json:"name"`
|
||||
// Query represents Rule's `expression` field
|
||||
Query string `json:"query"`
|
||||
// Duration represents Rule's `for` field
|
||||
Duration float64 `json:"duration"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Annotations map[string]string `json:"annotations,omitempty"`
|
||||
// LastError contains the error faced while executing the rule.
|
||||
LastError string `json:"lastError"`
|
||||
// EvaluationTime is the time taken to completely evaluate the rule in float seconds.
|
||||
EvaluationTime float64 `json:"evaluationTime"`
|
||||
// LastEvaluation is the timestamp of the last time the rule was executed
|
||||
LastEvaluation time.Time `json:"lastEvaluation"`
|
||||
// Alerts is the list of all the alerts in this rule that are currently pending or firing
|
||||
Alerts []*APIAlert `json:"alerts,omitempty"`
|
||||
// Health is the health of rule evaluation.
|
||||
// It MUST be one of "ok", "err", "unknown"
|
||||
Health string `json:"health"`
|
||||
// Type of the rule: recording or alerting
|
||||
Type string `json:"type"`
|
||||
|
||||
// Additional fields
|
||||
|
||||
// Type of the rule: recording or alerting
|
||||
DatasourceType string `json:"datasourceType"`
|
||||
LastSamples int `json:"lastSamples"`
|
||||
// ID is an unique Alert's ID within a group
|
||||
ID string `json:"id"`
|
||||
// GroupID is an unique Group's ID
|
||||
GroupID string `json:"group_id"`
|
||||
}
|
||||
|
||||
@@ -27,6 +27,15 @@ vmauth-ppc64le-prod:
|
||||
vmauth-386-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-386
|
||||
|
||||
vmauth-darwin-amd64-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
vmauth-darwin-arm64-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
vmauth-windows-amd64-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
package-vmauth:
|
||||
APP_NAME=vmauth $(MAKE) package-via-docker
|
||||
|
||||
@@ -80,6 +89,3 @@ vmauth-pure:
|
||||
|
||||
vmauth-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmauth $(MAKE) app-local-windows-with-goarch
|
||||
|
||||
vmauth-windows-amd64-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# vmauth
|
||||
|
||||
`vmauth` is a simple auth proxy, router and [load balancer](#load-balancing) for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It reads auth credentials from `Authorization` http header ([Basic Auth](https://en.wikipedia.org/wiki/Basic_access_authentication) and `Bearer token` is supported),
|
||||
It reads auth credentials from `Authorization` http header ([Basic Auth](https://en.wikipedia.org/wiki/Basic_access_authentication), `Bearer token` and [InfluxDB authorization](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1897) is supported),
|
||||
matches them against configs pointed by [-auth.config](#auth-config) command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
|
||||
The `-auth.config` can point to either local file or to http url.
|
||||
|
||||
@@ -10,7 +10,7 @@ The `-auth.config` can point to either local file or to http url.
|
||||
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
|
||||
and pass the following flag to `vmauth` binary in order to start authorizing and routing requests:
|
||||
|
||||
```
|
||||
```bash
|
||||
/path/to/vmauth -auth.config=/path/to/auth/config.yml
|
||||
```
|
||||
|
||||
@@ -39,7 +39,8 @@ Each `url_prefix` in the [-auth.config](#auth-config) may contain either a singl
|
||||
# Username and bearer_token values must be unique.
|
||||
|
||||
users:
|
||||
# Requests with the 'Authorization: Bearer XXXX' header are proxied to http://localhost:8428 .
|
||||
# Requests with the 'Authorization: Bearer XXXX' and 'Authorization: Token XXXX'
|
||||
# header are proxied to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
|
||||
# Requests with the Basic Auth username=XXXX are proxied to http://localhost:8428 as well.
|
||||
- bearer_token: "XXXX"
|
||||
@@ -124,15 +125,17 @@ This may be useful for passing secrets to the config.
|
||||
|
||||
## Security
|
||||
|
||||
It is expected that all the backend services protected by `vmauth` are located in an isolated private network, so they can be accessed by external users only via `vmauth`.
|
||||
|
||||
Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable https. This can be done by passing the following `-tls*` command-line flags to `vmauth`:
|
||||
|
||||
```
|
||||
```bash
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
```
|
||||
|
||||
Alternatively, [https termination proxy](https://en.wikipedia.org/wiki/TLS_termination_proxy) may be put in front of `vmauth`.
|
||||
@@ -159,7 +162,7 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.17.
|
||||
2. Run `make vmauth` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmauth` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -186,18 +189,26 @@ ROOT_IMAGE=scratch make package-vmauth
|
||||
|
||||
`vmauth` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
* Memory profile. It can be collected with the following command:
|
||||
* Memory profile. It can be collected with the following command (replace `0.0.0.0` with hostname if needed):
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```bash
|
||||
curl -s http://<vmauth-host>:8427/debug/pprof/heap > mem.pprof
|
||||
curl http://0.0.0.0:8427/debug/pprof/heap > mem.pprof
|
||||
```
|
||||
|
||||
* CPU profile. It can be collected with the following command:
|
||||
</div>
|
||||
|
||||
* CPU profile. It can be collected with the following command (replace `0.0.0.0` with hostname if needed):
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```bash
|
||||
curl -s http://<vmauth-host>:8427/debug/pprof/profile > cpu.pprof
|
||||
curl http://0.0.0.0:8427/debug/pprof/profile > cpu.pprof
|
||||
```
|
||||
|
||||
</div>
|
||||
|
||||
The command for collecting CPU profile waits for 30 seconds before returning.
|
||||
|
||||
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).
|
||||
@@ -206,7 +217,7 @@ The collected profiles may be analyzed with [go tool pprof](https://github.com/g
|
||||
|
||||
Pass `-help` command-line arg to `vmauth` in order to see all the configuration options:
|
||||
|
||||
```
|
||||
```bash
|
||||
./vmauth -help
|
||||
|
||||
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
|
||||
@@ -214,68 +225,70 @@ vmauth authenticates and authorizes incoming requests and proxies them to Victor
|
||||
See the docs at https://docs.victoriametrics.com/vmauth.html .
|
||||
|
||||
-auth.config string
|
||||
Path to auth config. It can point either to local file or to http url. See https://docs.victoriametrics.com/vmauth.html for details on the format of this auth config
|
||||
Path to auth config. It can point either to local file or to http url. See https://docs.victoriametrics.com/vmauth.html for details on the format of this auth config
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
TCP address to listen for http connections (default ":8427")
|
||||
TCP address to listen for http connections (default ":8427")
|
||||
-logInvalidAuthTokens
|
||||
Whether to log requests with invalid auth tokens. Such requests are always counted at vmauth_http_request_errors_total{reason="invalid_auth_token"} metric, which is exposed at /metrics page
|
||||
Whether to log requests with invalid auth tokens. Such requests are always counted at vmauth_http_request_errors_total{reason="invalid_auth_token"} metric, which is exposed at /metrics page
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxIdleConnsPerBackend int
|
||||
The maximum number of idle connections vmauth can open per each backend host (default 100)
|
||||
The maximum number of idle connections vmauth can open per each backend host (default 100)
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-reloadAuthKey string
|
||||
Auth key for /-/reload http endpoint. It must be passed as authKey=...
|
||||
Auth key for /-/reload http endpoint. It must be passed as authKey=...
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower. The provided certificate file is automatically re-read every second, so it can be dynamically updated
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
Path to file with TLS key. Used only if -tls is set. The provided key file is automatically re-read every second, so it can be dynamically updated
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
# Username and bearer_token values must be unique.
|
||||
|
||||
users:
|
||||
# Requests with the 'Authorization: Bearer XXXX' header are proxied to http://localhost:8428 .
|
||||
# Requests with the 'Authorization: Bearer XXXX' and 'Authorization: Token XXXX'
|
||||
# header are proxied to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
|
||||
# Requests with the Basic Auth username=XXXX are proxied to http://localhost:8428 as well.
|
||||
- bearer_token: "XXXX"
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"net/http/httputil"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -72,6 +73,11 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
http.Error(w, "missing `Authorization` request header", http.StatusUnauthorized)
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(authToken, "Token ") {
|
||||
// Handle InfluxDB's proprietary token authentication scheme as a bearer token authentication
|
||||
// See https://docs.influxdata.com/influxdb/v2.0/api/
|
||||
authToken = strings.Replace(authToken, "Token", "Bearer", 1)
|
||||
}
|
||||
ac := authConfig.Load().(map[string]*UserInfo)
|
||||
ui := ac[authToken]
|
||||
if ui == nil {
|
||||
|
||||
@@ -27,6 +27,12 @@ vmbackup-ppc64le-prod:
|
||||
vmbackup-386-prod:
|
||||
APP_NAME=vmbackup $(MAKE) app-via-docker-386
|
||||
|
||||
vmbackup-darwin-amd64-prod:
|
||||
APP_NAME=vmbackup $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
vmbackup-darwin-arm64-prod:
|
||||
APP_NAME=vmbackup $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
package-vmbackup:
|
||||
APP_NAME=vmbackup $(MAKE) package-via-docker
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ Supported storage systems for backups:
|
||||
|
||||
* [GCS](https://cloud.google.com/storage/). Example: `gs://<bucket>/<path/to/backup>`
|
||||
* [S3](https://aws.amazon.com/s3/). Example: `s3://<bucket>/<path/to/backup>`
|
||||
* Any S3-compatible storage such as [MinIO](https://github.com/minio/minio), [Ceph](https://docs.ceph.com/docs/mimic/radosgw/s3/) or [Swift](https://www.swiftstack.com/docs/admin/middleware/s3_middleware.html). See [these docs](#advanced-usage) for details.
|
||||
* Any S3-compatible storage such as [MinIO](https://github.com/minio/minio), [Ceph](https://docs.ceph.com/en/pacific/radosgw/s3/) or [Swift](https://platform.swiftstack.com/docs/admin/middleware/s3_middleware.html). See [these docs](#advanced-usage) for details.
|
||||
* Local filesystem. Example: `fs://</absolute/path/to/backup>`
|
||||
|
||||
`vmbackup` supports incremental and full backups. Incremental backups are created automatically if the destination path already contains data from the previous backup.
|
||||
@@ -22,14 +22,13 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
See also [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html) tool built on top of `vmbackup`. This tool simplifies
|
||||
creation of hourly, daily, weekly and monthly backups.
|
||||
|
||||
|
||||
## Use cases
|
||||
|
||||
### Regular backups
|
||||
|
||||
Regular backup can be performed with the following command:
|
||||
|
||||
```
|
||||
```bash
|
||||
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gs://<bucket>/<path/to/new/backup>
|
||||
```
|
||||
|
||||
@@ -39,36 +38,33 @@ vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-
|
||||
* `<bucket>` is an already existing name for [GCS bucket](https://cloud.google.com/storage/docs/creating-buckets).
|
||||
* `<path/to/new/backup>` is the destination path where new backup will be placed.
|
||||
|
||||
|
||||
### Regular backups with server-side copy from existing backup
|
||||
|
||||
If the destination GCS bucket already contains the previous backup at `-origin` path, then new backup can be sped up
|
||||
with the following command:
|
||||
|
||||
```
|
||||
```bash
|
||||
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gs://<bucket>/<path/to/new/backup> -origin=gs://<bucket>/<path/to/existing/backup>
|
||||
```
|
||||
|
||||
It saves time and network bandwidth costs by performing server-side copy for the shared data from the `-origin` to `-dst`.
|
||||
|
||||
|
||||
### Incremental backups
|
||||
|
||||
Incremental backups are performed if `-dst` points to an already existing backup. In this case only new data is uploaded to remote storage.
|
||||
It saves time and network bandwidth costs when working with big backups:
|
||||
|
||||
```
|
||||
```bash
|
||||
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gs://<bucket>/<path/to/existing/backup>
|
||||
```
|
||||
|
||||
|
||||
### Smart backups
|
||||
|
||||
Smart backups mean storing full daily backups into `YYYYMMDD` folders and creating incremental hourly backup into `latest` folder:
|
||||
|
||||
* Run the following command every hour:
|
||||
|
||||
```
|
||||
```bash
|
||||
vmbackup -snapshotName=<latest-snapshot> -dst=gs://<bucket>/latest
|
||||
```
|
||||
|
||||
@@ -77,13 +73,12 @@ The command will upload only changed data to `gs://<bucket>/latest`.
|
||||
|
||||
* Run the following command once a day:
|
||||
|
||||
```
|
||||
```bash
|
||||
vmbackup -snapshotName=<daily-snapshot> -dst=gs://<bucket>/<YYYYMMDD> -origin=gs://<bucket>/latest
|
||||
```
|
||||
|
||||
Where `<daily-snapshot>` is the snapshot for the last day `<YYYYMMDD>`.
|
||||
|
||||
|
||||
This apporach saves network bandwidth costs on hourly backups (since they are incremental) and allows recovering data from either the last hour (`latest` backup)
|
||||
or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when creating daily backup.
|
||||
|
||||
@@ -91,7 +86,6 @@ Do not forget to remove old snapshots and backups when they are no longer needed
|
||||
|
||||
See also [vmbackupmanager tool](https://docs.victoriametrics.com/vmbackupmanager.html) for automating smart backups.
|
||||
|
||||
|
||||
## How does it work?
|
||||
|
||||
The backup algorithm is the following:
|
||||
@@ -108,16 +102,15 @@ Such splitting minimizes the amounts of data to re-transfer after temporary erro
|
||||
|
||||
`vmbackup` relies on [instant snapshot](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282) properties:
|
||||
|
||||
- All the files in the snapshot are immutable.
|
||||
- Old files are periodically merged into new files.
|
||||
- Smaller files have higher probability to be merged.
|
||||
- Consecutive snapshots share many identical files.
|
||||
* All the files in the snapshot are immutable.
|
||||
* Old files are periodically merged into new files.
|
||||
* Smaller files have higher probability to be merged.
|
||||
* Consecutive snapshots share many identical files.
|
||||
|
||||
These properties allow performing fast and cheap incremental backups and server-side copying from `-origin` paths.
|
||||
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
|
||||
`vmbackup` can work improperly or slowly when these properties are violated.
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
|
||||
@@ -126,15 +119,14 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
* Backups created from [single-node VictoriaMetrics](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html) cannot be restored
|
||||
at [cluster VictoriaMetrics](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) and vice versa.
|
||||
|
||||
|
||||
## Advanced usage
|
||||
|
||||
|
||||
* Obtaining credentials from a file.
|
||||
|
||||
Add flag `-credsFilePath=/etc/credentials` with the following content:
|
||||
|
||||
for s3 (aws, minio or other s3 compatible storages):
|
||||
|
||||
```bash
|
||||
[default]
|
||||
aws_access_key_id=theaccesskey
|
||||
@@ -142,6 +134,7 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
```
|
||||
|
||||
for gce cloud storage:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "service_account",
|
||||
@@ -159,7 +152,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
|
||||
* Usage with s3 custom url endpoint. It is possible to use `vmbackup` with s3 compatible storages like minio, cloudian, etc.
|
||||
You have to add a custom url endpoint via flag:
|
||||
```
|
||||
|
||||
```bash
|
||||
# for minio
|
||||
-customS3Endpoint=http://localhost:9000
|
||||
|
||||
@@ -169,105 +163,103 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
|
||||
* Run `vmbackup -help` in order to see all the available options:
|
||||
|
||||
```
|
||||
```bash
|
||||
-concurrency int
|
||||
The number of concurrent workers. Higher concurrency may reduce backup duration (default 10)
|
||||
The number of concurrent workers. Higher concurrency may reduce backup duration (default 10)
|
||||
-configFilePath string
|
||||
Path to file with S3 configs. Configs are loaded from default location if not set.
|
||||
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
Path to file with S3 configs. Configs are loaded from default location if not set.
|
||||
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
-configProfile string
|
||||
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used
|
||||
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used
|
||||
-credsFilePath string
|
||||
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
|
||||
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
|
||||
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
-customS3Endpoint string
|
||||
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
|
||||
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
|
||||
-dst string
|
||||
Where to put the backup on the remote storage. Example: gs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded
|
||||
Where to put the backup on the remote storage. Example: gs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
TCP address for exporting metrics at /metrics page (default ":8420")
|
||||
TCP address for exporting metrics at /metrics page (default ":8420")
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxBytesPerSecond size
|
||||
The maximum upload speed. There is no limit if it is set to 0
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
The maximum upload speed. There is no limit if it is set to 0
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-origin string
|
||||
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
|
||||
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-s3ForcePathStyle
|
||||
Prefixing endpoint with bucket name when set false, true by default. (default true)
|
||||
Prefixing endpoint with bucket name when set false, true by default. (default true)
|
||||
-snapshot.createURL string
|
||||
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create . There is no need in setting -snapshotName if -snapshot.createURL is set
|
||||
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create . There is no need in setting -snapshotName if -snapshot.createURL is set
|
||||
-snapshot.deleteURL string
|
||||
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete
|
||||
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete
|
||||
-snapshotName string
|
||||
Name for the snapshot to backup. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots. There is no need in setting -snapshotName if -snapshot.createURL is set
|
||||
Name for the snapshot to backup. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots. There is no need in setting -snapshotName if -snapshot.createURL is set
|
||||
-storageDataPath string
|
||||
Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage (default "victoria-metrics-data")
|
||||
Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage (default "victoria-metrics-data")
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
|
||||
## How to build from sources
|
||||
|
||||
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - see `vmutils-*` archives there.
|
||||
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.17.
|
||||
2. Run `make vmbackup` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmbackup` binary and puts it into the `bin` folder.
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
## vmbackupmanager
|
||||
|
||||
***vmbackupmanager is a part of [enterprise package](https://victoriametrics.com/enterprise.html). It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)***
|
||||
***vmbackupmanager is a part of [enterprise package](https://victoriametrics.com/products/enterprise/). It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)***
|
||||
|
||||
The VictoriaMetrics backup manager automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed.
|
||||
The backup service makes a backup every hour and puts it to the latest folder and then copies data to the folders which represent the backup intervals (hourly, daily, weekly and monthly)
|
||||
@@ -9,11 +9,10 @@ The required flags for running the service are as follows:
|
||||
|
||||
* -eula - should be true and means that you have the legal right to run a backup manager. That can either be a signed contract or an email with confirmation to run the service in a trial period
|
||||
* -storageDataPath - path to VictoriaMetrics or vmstorage data path to make backup from
|
||||
* -snapshot.createURL - VictoriaMetrics creates snapshot URL which will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create
|
||||
* -snapshot.createURL - VictoriaMetrics creates snapshot URL which will automatically be created during backup. Example: <http://victoriametrics:8428/snapshot/create>
|
||||
* -dst - backup destination at s3, gcs or local filesystem
|
||||
* -credsFilePath - path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set. See [https://cloud.google.com/iam/docs/creating-managing-service-account-keys](https://cloud.google.com/iam/docs/creating-managing-service-account-keys) and [https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html](https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html)
|
||||
|
||||
|
||||
Backup schedule is controlled by the following flags:
|
||||
|
||||
* -disableHourly - disable hourly run. Default false
|
||||
@@ -23,7 +22,6 @@ Backup schedule is controlled by the following flags:
|
||||
|
||||
By default, all flags are turned on and Backup Manager backups data every hour for every interval (hourly, daily, weekly and monthly).
|
||||
|
||||
|
||||
The backup manager creates the following directory hierarchy at **-dst**:
|
||||
|
||||
* /latest/ - contains the latest backup
|
||||
@@ -32,7 +30,6 @@ The backup manager creates the following directory hierarchy at **-dst**:
|
||||
* /weekly/ - contains weekly backups. Each backup is named as *YYYY-WW*
|
||||
* /monthly/ - contains monthly backups. Each backup is named as *YYYY-MM*
|
||||
|
||||
|
||||
To get the full list of supported flags please run the following command:
|
||||
|
||||
```console
|
||||
@@ -48,7 +45,6 @@ There are two flags which could help with performance tuning:
|
||||
* -maxBytesPerSecond - the maximum upload speed. There is no limit if it is set to 0
|
||||
* -concurrency - The number of concurrent workers. Higher concurrency may improve upload speed (default 10)
|
||||
|
||||
|
||||
## Example of Usage
|
||||
|
||||
GCS and cluster version. You need to have a credentials file in json format with following structure
|
||||
@@ -96,11 +92,11 @@ info VictoriaMetrics/lib/storage/storage.go:319 deleted snapshot "/vmstora
|
||||
|
||||
The result on the GCS bucket
|
||||
|
||||
- The root folder
|
||||
* The root folder
|
||||
|
||||

|
||||
|
||||
- The latest folder
|
||||
* The latest folder
|
||||
|
||||

|
||||
|
||||
@@ -119,7 +115,6 @@ Let’s assume we have a backup manager collecting daily backups for the past 10
|
||||
|
||||

|
||||
|
||||
|
||||
We enable backup retention policy for backup manager by using following configuration:
|
||||
|
||||
```console
|
||||
|
||||
@@ -27,6 +27,15 @@ vmctl-ppc64le-prod:
|
||||
vmctl-386-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-386
|
||||
|
||||
vmctl-darwin-amd64-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
vmctl-darwin-arm64-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
vmctl-windows-amd64-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
package-vmctl:
|
||||
APP_NAME=vmctl $(MAKE) package-via-docker
|
||||
|
||||
@@ -75,5 +84,3 @@ vmctl-pure:
|
||||
vmctl-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmctl $(MAKE) app-local-windows-with-goarch
|
||||
|
||||
vmctl-windows-amd64-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
@@ -2,19 +2,20 @@
|
||||
|
||||
VictoriaMetrics command-line tool
|
||||
|
||||
Features:
|
||||
- [x] Prometheus: migrate data from Prometheus to VictoriaMetrics using snapshot API
|
||||
- [x] Thanos: migrate data from Thanos to VictoriaMetrics
|
||||
- [ ] ~~Prometheus: migrate data from Prometheus to VictoriaMetrics by query~~(discarded)
|
||||
- [x] InfluxDB: migrate data from InfluxDB to VictoriaMetrics
|
||||
- [x] OpenTSDB: migrate data from OpenTSDB to VictoriaMetrics
|
||||
- [ ] Storage Management: data re-balancing between nodes
|
||||
vmctl provides various useful actions with VictoriaMetrics components.
|
||||
|
||||
vmctl acts as a proxy between data source ([Prometheus](#migrating-data-from-prometheus),
|
||||
[InfluxDB](#migrating-data-from-influxdb-1x), [VictoriaMetrics](##migrating-data-from-victoriametrics), etc.)
|
||||
and destination - VictoriaMetrics single or cluster version. To see the full list of supported modes
|
||||
Features:
|
||||
- migrate data from [Prometheus](#migrating-data-from-prometheus) to VictoriaMetrics using snapshot API
|
||||
- migrate data from [Thanos](#migrating-data-from-thanos) to VictoriaMetrics
|
||||
- migrate data from [InfluxDB](#migrating-data-from-influxdb-1x) to VictoriaMetrics
|
||||
- migrate data from [OpenTSDB](#migrating-data-from-opentsdb) to VictoriaMetrics
|
||||
- migrate data between [VictoriaMetrics](#migrating-data-from-victoriametrics) single or cluster version.
|
||||
- [verify](#verifying-exported-blocks-from-victoriametrics) exported blocks from VictoriaMetrics single or cluster version.
|
||||
|
||||
To see the full list of supported modes
|
||||
run the following command:
|
||||
```
|
||||
|
||||
```bash
|
||||
./vmctl --help
|
||||
NAME:
|
||||
vmctl - VictoriaMetrics command-line tool
|
||||
@@ -27,10 +28,12 @@ COMMANDS:
|
||||
influx Migrate timeseries from InfluxDB
|
||||
prometheus Migrate timeseries from Prometheus
|
||||
vm-native Migrate time series between VictoriaMetrics installations via native binary format
|
||||
verify-block Verifies correctness of data blocks exported via VictoriaMetrics Native format. See https://docs.victoriametrics.com/#how-to-export-data-in-native-format
|
||||
```
|
||||
|
||||
Each mode has its own unique set of flags specific (e.g. prefixed with `influx` for influx mode)
|
||||
to the data source and common list of flags for destination (prefixed with `vm` for VictoriaMetrics):
|
||||
|
||||
```
|
||||
./vmctl influx --help
|
||||
OPTIONS:
|
||||
@@ -46,10 +49,11 @@ Please note, that vmctl performs initial readiness check for the given address b
|
||||
```
|
||||
|
||||
When doing a migration user needs to specify flags for source (where and how to fetch data) and for
|
||||
destination (where to migrate data). Every mode has additional details and nuances, please see
|
||||
destination (where to migrate data). Every mode has additional details and nuances, please see
|
||||
them below in corresponding sections.
|
||||
|
||||
For the destination flags see the full description by running the following command:
|
||||
|
||||
```
|
||||
./vmctl influx --help | grep vm-
|
||||
```
|
||||
@@ -59,14 +63,13 @@ has additional sections with description below. Details about tweaking and adjus
|
||||
are explained in [Tuning](#tuning) section.
|
||||
|
||||
Please note, that if you're going to import data into VictoriaMetrics cluster do not
|
||||
forget to specify the `--vm-account-id` flag. See more details for cluster version
|
||||
forget to specify the `--vm-account-id` flag. See more details for cluster version
|
||||
[here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
## Articles
|
||||
|
||||
* [How to migrate data from Prometheus](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043)
|
||||
* [How to migrate data from Prometheus. Filtering and modifying time series](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-filtering-and-modifying-time-series-6d40cea4bf21)
|
||||
|
||||
- [How to migrate data from Prometheus](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043)
|
||||
- [How to migrate data from Prometheus. Filtering and modifying time series](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-filtering-and-modifying-time-series-6d40cea4bf21)
|
||||
|
||||
## Migrating data from OpenTSDB
|
||||
|
||||
@@ -79,16 +82,21 @@ See `./vmctl opentsdb --help` for details and full list of flags.
|
||||
OpenTSDB migration works like so:
|
||||
|
||||
1. Find metrics based on selected filters (or the default filter set ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'])
|
||||
* e.g. `curl -Ss "http://opentsdb:4242/api/suggest?type=metrics&q=sys"`
|
||||
|
||||
- e.g. `curl -Ss "http://opentsdb:4242/api/suggest?type=metrics&q=sys"`
|
||||
|
||||
2. Find series associated with each returned metric
|
||||
* e.g. `curl -Ss "http://opentsdb:4242/api/search/lookup?m=system.load5&limit=1000000"`
|
||||
|
||||
- e.g. `curl -Ss "http://opentsdb:4242/api/search/lookup?m=system.load5&limit=1000000"`
|
||||
|
||||
3. Download data for each series in chunks defined in the CLI switches
|
||||
* e.g. `-retention=sum-1m-avg:1h:90d` ==
|
||||
* `curl -Ss "http://opentsdb:4242/api/query?start=1h-ago&end=now&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
* `curl -Ss "http://opentsdb:4242/api/query?start=2h-ago&end=1h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
* `curl -Ss "http://opentsdb:4242/api/query?start=3h-ago&end=2h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
* ...
|
||||
* `curl -Ss "http://opentsdb:4242/api/query?start=2160h-ago&end=2159h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
|
||||
- e.g. `-retention=sum-1m-avg:1h:90d` ==
|
||||
- `curl -Ss "http://opentsdb:4242/api/query?start=1h-ago&end=now&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
- `curl -Ss "http://opentsdb:4242/api/query?start=2h-ago&end=1h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
- `curl -Ss "http://opentsdb:4242/api/query?start=3h-ago&end=2h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
- ...
|
||||
- `curl -Ss "http://opentsdb:4242/api/query?start=2160h-ago&end=2159h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
|
||||
This means that we must stream data from OpenTSDB to VictoriaMetrics in chunks. This is where concurrency for OpenTSDB comes in. We can query multiple chunks at once, but we shouldn't perform too many chunks at a time to avoid overloading the OpenTSDB cluster.
|
||||
|
||||
@@ -107,6 +115,7 @@ Found 9 metrics to import. Continue? [Y/n]
|
||||
Starting with a relatively simple retention string (`sum-1m-avg:1h:30d`), let's describe how this is converted into actual queries.
|
||||
|
||||
There are two essential parts of a retention string:
|
||||
|
||||
1. [aggregation](#aggregation)
|
||||
2. [windows/time ranges](#windows)
|
||||
|
||||
@@ -115,8 +124,9 @@ There are two essential parts of a retention string:
|
||||
Retention strings essentially define the two levels of aggregation for our collected series.
|
||||
|
||||
`sum-1m-avg` would become:
|
||||
* First order: `sum`
|
||||
* Second order: `1m-avg-none`
|
||||
|
||||
- First order: `sum`
|
||||
- Second order: `1m-avg-none`
|
||||
|
||||
##### First Order Aggregations
|
||||
|
||||
@@ -137,6 +147,7 @@ We do not allow for defining the "null value" portion of the rollup window (e.g.
|
||||
#### Windows
|
||||
|
||||
There are two important windows we define in a retention string:
|
||||
|
||||
1. the "chunk" range of each query
|
||||
2. The time range we will be querying on with that "chunk"
|
||||
|
||||
@@ -182,8 +193,8 @@ See `./vmctl influx --help` for details and full list of flags.
|
||||
|
||||
To use migration tool please specify the InfluxDB address `--influx-addr`, the database `--influx-database` and VictoriaMetrics address `--vm-addr`.
|
||||
Flag `--vm-addr` for single-node VM is usually equal to `--httpListenAddr`, and for cluster version
|
||||
is equal to `--httpListenAddr` flag of vminsert component. Please note, that vmctl performs initial readiness check for the given address
|
||||
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
|
||||
is equal to `--httpListenAddr` flag of vminsert component. Please note, that vmctl performs initial readiness check for the given address
|
||||
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
|
||||
See more details for cluster version [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
As soon as required flags are provided and all endpoints are accessible, `vmctl` will start the InfluxDB scheme exploration.
|
||||
@@ -191,8 +202,9 @@ Basically, it just fetches all fields and timeseries from the provided database
|
||||
Then `vmctl` sends fetch requests for each timeseries to InfluxDB one by one and pass results to VM importer.
|
||||
VM importer then accumulates received samples in batches and sends import requests to VM.
|
||||
|
||||
The importing process example for local installation of InfluxDB(`http://localhost:8086`)
|
||||
The importing process example for local installation of InfluxDB(`http://localhost:8086`)
|
||||
and single-node VictoriaMetrics(`http://localhost:8428`):
|
||||
|
||||
```
|
||||
./vmctl influx --influx-database benchmark
|
||||
InfluxDB import mode
|
||||
@@ -212,25 +224,27 @@ Found 40000 timeseries to import. Continue? [Y/n] y
|
||||
bytes/s: 5.4 MB;
|
||||
import requests: 40001;
|
||||
2020/01/18 21:19:00 Total time: 31m48.467044016s
|
||||
```
|
||||
```
|
||||
|
||||
### Data mapping
|
||||
|
||||
Vmctl maps InfluxDB data the same way as VictoriaMetrics does by using the following rules:
|
||||
|
||||
* `influx-database` arg is mapped into `db` label value unless `db` tag exists in the InfluxDB line.
|
||||
* Field names are mapped to time series names prefixed with {measurement}{separator} value,
|
||||
where {separator} equals to _ by default.
|
||||
- `influx-database` arg is mapped into `db` label value unless `db` tag exists in the InfluxDB line.
|
||||
- Field names are mapped to time series names prefixed with {measurement}{separator} value,
|
||||
where {separator} equals to _ by default.
|
||||
It can be changed with `--influx-measurement-field-separator` command-line flag.
|
||||
* Field values are mapped to time series values.
|
||||
* Tags are mapped to Prometheus labels format as-is.
|
||||
- Field values are mapped to time series values.
|
||||
- Tags are mapped to Prometheus labels format as-is.
|
||||
|
||||
For example, the following InfluxDB line:
|
||||
|
||||
```
|
||||
foo,tag1=value1,tag2=value2 field1=12,field2=40
|
||||
```
|
||||
|
||||
is converted into the following Prometheus format data points:
|
||||
|
||||
```
|
||||
foo_field1{tag1="value1", tag2="value2"} 12
|
||||
foo_field2{tag1="value1", tag2="value2"} 40
|
||||
@@ -238,7 +252,7 @@ foo_field2{tag1="value1", tag2="value2"} 40
|
||||
|
||||
### Configuration
|
||||
|
||||
The configuration flags should contain self-explanatory descriptions.
|
||||
The configuration flags should contain self-explanatory descriptions.
|
||||
|
||||
### Filtering
|
||||
|
||||
@@ -246,6 +260,7 @@ The filtering consists of two parts: timeseries and time.
|
||||
The first step of application is to select all available timeseries
|
||||
for given database and retention. User may specify additional filtering
|
||||
condition via `--influx-filter-series` flag. For example:
|
||||
|
||||
```
|
||||
./vmctl influx --influx-database benchmark \
|
||||
--influx-filter-series "on benchmark from cpu where hostname='host_1703'"
|
||||
@@ -256,13 +271,15 @@ InfluxDB import mode
|
||||
2020/01/26 14:23:29 fetching series: command: "show series on benchmark from cpu where hostname='host_1703'"; database: "benchmark"; retention: "autogen"
|
||||
Found 10 timeseries to import. Continue? [Y/n]
|
||||
```
|
||||
|
||||
The timeseries select query would be following:
|
||||
`fetching series: command: "show series on benchmark from cpu where hostname='host_1703'"; database: "benchmark"; retention: "autogen"`
|
||||
|
||||
|
||||
The second step of filtering is a time filter and it applies when fetching the datapoints from Influx.
|
||||
Time filtering may be configured with two flags:
|
||||
* --influx-filter-time-start
|
||||
* --influx-filter-time-end
|
||||
|
||||
- --influx-filter-time-start
|
||||
- --influx-filter-time-end
|
||||
Here's an example of importing timeseries for one day only:
|
||||
`./vmctl influx --influx-database benchmark --influx-filter-series "where hostname='host_1703'" --influx-filter-time-start "2020-01-01T10:07:00Z" --influx-filter-time-end "2020-01-01T15:07:00Z"`
|
||||
|
||||
@@ -271,36 +288,36 @@ Please see more about time filtering [here](https://docs.influxdata.com/influxdb
|
||||
## Migrating data from InfluxDB (2.x)
|
||||
|
||||
Migrating data from InfluxDB v2.x is not supported yet ([#32](https://github.com/VictoriaMetrics/vmctl/issues/32)).
|
||||
You may find useful a 3rd party solution for this - https://github.com/jonppe/influx_to_victoriametrics.
|
||||
|
||||
You may find useful a 3rd party solution for this - <https://github.com/jonppe/influx_to_victoriametrics>.
|
||||
|
||||
## Migrating data from Prometheus
|
||||
|
||||
`vmctl` supports the `prometheus` mode for migrating data from Prometheus to VictoriaMetrics time-series database.
|
||||
Migration is based on reading Prometheus snapshot, which is basically a hard-link to Prometheus data files.
|
||||
Migration is based on reading Prometheus snapshot, which is basically a hard-link to Prometheus data files.
|
||||
|
||||
See `./vmctl prometheus --help` for details and full list of flags. Also see Prometheus related articles [here](#articles).
|
||||
|
||||
To use migration tool please specify the file path to Prometheus snapshot `--prom-snapshot` (see how to make a snapshot [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data)) and VictoriaMetrics address `--vm-addr`.
|
||||
Please note, that `vmctl` *do not make a snapshot from Prometheus*, it uses an already prepared snapshot. More about Prometheus snapshots may be found [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data) and [here](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043).
|
||||
Flag `--vm-addr` for single-node VM is usually equal to `--httpListenAddr`, and for cluster version
|
||||
is equal to `--httpListenAddr` flag of vminsert component. Please note, that vmctl performs initial readiness check for the given address
|
||||
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
|
||||
is equal to `--httpListenAddr` flag of vminsert component. Please note, that vmctl performs initial readiness check for the given address
|
||||
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
|
||||
See more details for cluster version [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
As soon as required flags are provided and all endpoints are accessible, `vmctl` will start the Prometheus snapshot exploration.
|
||||
Basically, it just fetches all available blocks in provided snapshot and read the metadata. It also does initial filtering by time
|
||||
if flags `--prom-filter-time-start` or `--prom-filter-time-end` were set. The exploration procedure prints some stats from read blocks.
|
||||
Please note that stats are not taking into account timeseries or samples filtering. This will be done during importing process.
|
||||
|
||||
|
||||
The importing process takes the snapshot blocks revealed from Explore procedure and processes them one by one
|
||||
accumulating timeseries and samples. Please note, that `vmctl` relies on responses from InfluxDB on this stage,
|
||||
so ensure that Explore queries are executed without errors or limits. Please see this
|
||||
so ensure that Explore queries are executed without errors or limits. Please see this
|
||||
[issue](https://github.com/VictoriaMetrics/vmctl/issues/30) for details.
|
||||
The data processed in chunks and then sent to VM.
|
||||
|
||||
The importing process example for local installation of Prometheus
|
||||
The importing process example for local installation of Prometheus
|
||||
and single-node VictoriaMetrics(`http://localhost:8428`):
|
||||
|
||||
```
|
||||
./vmctl prometheus --prom-snapshot=/path/to/snapshot \
|
||||
--vm-concurrency=1 \
|
||||
@@ -327,7 +344,7 @@ Found 14 blocks to import. Continue? [Y/n] y
|
||||
import requests: 323;
|
||||
import requests retries: 0;
|
||||
2020/02/23 15:50:03 Total time: 51.077451066s
|
||||
```
|
||||
```
|
||||
|
||||
### Data mapping
|
||||
|
||||
@@ -336,7 +353,7 @@ So no data changes will be applied.
|
||||
|
||||
### Configuration
|
||||
|
||||
The configuration flags should contain self-explanatory descriptions.
|
||||
The configuration flags should contain self-explanatory descriptions.
|
||||
|
||||
### Filtering
|
||||
|
||||
@@ -347,6 +364,7 @@ in in RFC3339 format. This filter applied twice: to drop blocks out of range and
|
||||
overlapping time range.
|
||||
|
||||
Example of applying time filter:
|
||||
|
||||
```
|
||||
./vmctl prometheus --prom-snapshot=/path/to/snapshot \
|
||||
--prom-filter-time-start=2020-02-07T00:07:01Z \
|
||||
@@ -366,12 +384,13 @@ Please notice, that total amount of blocks in provided snapshot is 14, but only
|
||||
time range. So other 12 blocks were marked as `skipped`. The amount of samples and series is not taken into account,
|
||||
since this is heavy operation and will be done during import process.
|
||||
|
||||
Filtering by timeseries is configured with following flags:
|
||||
|
||||
Filtering by timeseries is configured with following flags:
|
||||
* `--prom-filter-label` - the label name, e.g. `__name__` or `instance`;
|
||||
* `--prom-filter-label-value` - the regular expression to filter the label value. By default matches all `.*`
|
||||
- `--prom-filter-label` - the label name, e.g. `__name__` or `instance`;
|
||||
- `--prom-filter-label-value` - the regular expression to filter the label value. By default matches all `.*`
|
||||
|
||||
For example:
|
||||
|
||||
```
|
||||
./vmctl prometheus --prom-snapshot=/path/to/snapshot \
|
||||
--prom-filter-label="__name__" \
|
||||
@@ -405,38 +424,44 @@ Found 2 blocks to import. Continue? [Y/n] y
|
||||
|
||||
Thanos uses the same storage engine as Prometheus and the data layout on-disk should be the same. That means
|
||||
`vmctl` in mode `prometheus` may be used for Thanos historical data migration as well.
|
||||
These instructions may vary based on the details of your Thanos configuration.
|
||||
Please read carefully and verify as you go. We assume you're using Thanos Sidecar on your Prometheus pods,
|
||||
These instructions may vary based on the details of your Thanos configuration.
|
||||
Please read carefully and verify as you go. We assume you're using Thanos Sidecar on your Prometheus pods,
|
||||
and that you have a separate Thanos Store installation.
|
||||
|
||||
### Current data
|
||||
|
||||
1. For now, keep your Thanos Sidecar and Thanos-related Prometheus configuration, but add this to also stream
|
||||
1. For now, keep your Thanos Sidecar and Thanos-related Prometheus configuration, but add this to also stream
|
||||
metrics to VictoriaMetrics:
|
||||
|
||||
```
|
||||
remote_write:
|
||||
- url: http://victoria-metrics:8428/api/v1/write
|
||||
```
|
||||
2. Make sure VM is running, of course. Now check the logs to make sure that Prometheus is sending and VM is receiving.
|
||||
|
||||
2. Make sure VM is running, of course. Now check the logs to make sure that Prometheus is sending and VM is receiving.
|
||||
In Prometheus, make sure there are no errors. On the VM side, you should see messages like this:
|
||||
|
||||
```
|
||||
2020-04-27T18:38:46.474Z info VictoriaMetrics/lib/storage/partition.go:207 creating a partition "2020_04" with smallPartsPath="/victoria-metrics-data/data/small/2020_04", bigPartsPath="/victoria-metrics-data/data/big/2020_04"
|
||||
2020-04-27T18:38:46.506Z info VictoriaMetrics/lib/storage/partition.go:222 partition "2020_04" has been created
|
||||
2020-04-27T18:38:46.474Z info VictoriaMetrics/lib/storage/partition.go:207 creating a partition "2020_04" with smallPartsPath="/victoria-metrics-data/data/small/2020_04", bigPartsPath="/victoria-metrics-data/data/big/2020_04"
|
||||
2020-04-27T18:38:46.506Z info VictoriaMetrics/lib/storage/partition.go:222 partition "2020_04" has been created
|
||||
```
|
||||
|
||||
3. Now just wait. Within two hours, Prometheus should finish its current data file and hand it off to Thanos Store for long term
|
||||
storage.
|
||||
|
||||
### Historical data
|
||||
|
||||
Let's assume your data is stored on S3 served by minio. You first need to copy that out to a local filesystem,
|
||||
Let's assume your data is stored on S3 served by minio. You first need to copy that out to a local filesystem,
|
||||
then import it into VM using `vmctl` in `prometheus` mode.
|
||||
|
||||
1. Copy data from minio.
|
||||
1. Run the `minio/mc` Docker container.
|
||||
1. `mc config host add minio http://minio:9000 accessKey secretKey`, substituting appropriate values for the last 3 items.
|
||||
1. `mc cp -r minio/prometheus thanos-data`
|
||||
1. Import using `vmctl`.
|
||||
1. Follow the [instructions](#how-to-build) to compile `vmctl` on your machine.
|
||||
1. Use [prometheus](#migrating-data-from-prometheus) mode to import data:
|
||||
1. Use [prometheus](#migrating-data-from-prometheus) mode to import data:
|
||||
|
||||
```
|
||||
vmctl prometheus --prom-snapshot thanos-data --vm-addr http://victoria-metrics:8428
|
||||
```
|
||||
@@ -453,8 +478,8 @@ or higher.
|
||||
|
||||
See `./vmctl vm-native --help` for details and full list of flags.
|
||||
|
||||
In this mode `vmctl` acts as a proxy between two VM instances, where time series filtering is done by "source" (`src`)
|
||||
and processing is done by "destination" (`dst`). Because of that, `vmctl` doesn't actually know how much data will be
|
||||
In this mode `vmctl` acts as a proxy between two VM instances, where time series filtering is done by "source" (`src`)
|
||||
and processing is done by "destination" (`dst`). Because of that, `vmctl` doesn't actually know how much data will be
|
||||
processed and can't show the progress bar. It will show the current processing speed and total number of processed bytes:
|
||||
|
||||
```
|
||||
@@ -468,20 +493,36 @@ Initing export pipe from "http://localhost:8528" with filters:
|
||||
Initing import process to "http://localhost:8428":
|
||||
Total: 336.75 KiB ↖ Speed: 454.46 KiB p/s
|
||||
2020/10/13 17:04:59 Total time: 952.143376ms
|
||||
```
|
||||
```
|
||||
|
||||
Importing tips:
|
||||
1. Migrating all the metrics from one VM to another may collide with existing application metrics
|
||||
(prefixed with `vm_`) at destination and lead to confusion when using
|
||||
[official Grafana dashboards](https://grafana.com/orgs/victoriametrics/dashboards).
|
||||
|
||||
1. Migrating all the metrics from one VM to another may collide with existing application metrics
|
||||
(prefixed with `vm_`) at destination and lead to confusion when using
|
||||
[official Grafana dashboards](https://grafana.com/orgs/victoriametrics/dashboards).
|
||||
To avoid such situation try to filter out VM process metrics via `--vm-native-filter-match` flag.
|
||||
2. Migration is a backfilling process, so it is recommended to read
|
||||
2. Migration is a backfilling process, so it is recommended to read
|
||||
[Backfilling tips](https://github.com/VictoriaMetrics/VictoriaMetrics#backfilling) section.
|
||||
3. `vmctl` doesn't provide relabeling or other types of labels management in this mode.
|
||||
Instead, use [relabeling in VictoriaMetrics](https://github.com/VictoriaMetrics/vmctl/issues/4#issuecomment-683424375).
|
||||
4. When importing in or from cluster version remember to use correct [URL format](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format)
|
||||
and specify `accountID` param.
|
||||
|
||||
## Verifying exported blocks from VictoriaMetrics
|
||||
|
||||
In this mode, `vmctl` allows verifying correctness and integrity of data exported via [native format](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-export-data-in-native-format) from VictoriaMetrics.
|
||||
You can verify exported data at disk before uploading it by `vmctl verify-block` command:
|
||||
|
||||
```bash
|
||||
# export blocks from VictoriaMetrics
|
||||
curl localhost:8428/api/v1/export/native -g -d 'match[]={__name__!=""}' -o exported_data_block
|
||||
# verify block content
|
||||
./vmctl verify-block exported_data_block
|
||||
2022/03/30 18:04:50 verifying block at path="exported_data_block"
|
||||
2022/03/30 18:04:50 successfully verified block at path="exported_data_block", blockCount=123786
|
||||
2022/03/30 18:04:50 Total time: 100.108ms
|
||||
```
|
||||
|
||||
## Tuning
|
||||
|
||||
### InfluxDB mode
|
||||
@@ -491,7 +532,7 @@ timeseries. Please set it wisely to avoid InfluxDB overwhelming.
|
||||
|
||||
The flag `--influx-chunk-size` controls the max amount of datapoints to return in single chunk from fetch requests.
|
||||
Please see more details [here](https://docs.influxdata.com/influxdb/v1.7/guides/querying_data/#chunking).
|
||||
The chunk size is used to control InfluxDB memory usage, so it won't OOM on processing large timeseries with
|
||||
The chunk size is used to control InfluxDB memory usage, so it won't OOM on processing large timeseries with
|
||||
billions of datapoints.
|
||||
|
||||
### Prometheus mode
|
||||
@@ -507,17 +548,18 @@ Please note that each import request can load up to a single vCPU core on Victor
|
||||
to allocated CPU resources of your VictoriMetrics installation.
|
||||
|
||||
The flag `--vm-batch-size` controls max amount of samples collected before sending the import request.
|
||||
For example, if `--influx-chunk-size=500` and `--vm-batch-size=2000` then importer will process not more
|
||||
than 4 chunks before sending the request.
|
||||
For example, if `--influx-chunk-size=500` and `--vm-batch-size=2000` then importer will process not more
|
||||
than 4 chunks before sending the request.
|
||||
|
||||
### Importer stats
|
||||
|
||||
After successful import `vmctl` prints some statistics for details.
|
||||
After successful import `vmctl` prints some statistics for details.
|
||||
The important numbers to watch are following:
|
||||
- `idle duration` - shows time that importer spent while waiting for data from InfluxDB/Prometheus
|
||||
|
||||
- `idle duration` - shows time that importer spent while waiting for data from InfluxDB/Prometheus
|
||||
to fill up `--vm-batch-size` batch size. Value shows total duration across all workers configured
|
||||
via `--vm-concurrency`. High value may be a sign of too slow InfluxDB/Prometheus fetches or too
|
||||
high `--vm-concurrency` value. Try to improve it by increasing `--<mode>-concurrency` value or
|
||||
high `--vm-concurrency` value. Try to improve it by increasing `--<mode>-concurrency` value or
|
||||
decreasing `--vm-concurrency` value.
|
||||
- `import requests` - shows how many import requests were issued to VM server.
|
||||
The import request is issued once the batch size(`--vm-batch-size`) is full and ready to be sent.
|
||||
@@ -529,6 +571,7 @@ a sign of network issues or VM being overloaded. See the logs during import for
|
||||
|
||||
By default `vmctl` waits confirmation from user before starting the import. If this is unwanted
|
||||
behavior and no user interaction required - pass `-s` flag to enable "silence" mode:
|
||||
|
||||
```
|
||||
-s Whether to run in silent mode. If set to true no confirmation prompts will appear. (default: false)
|
||||
```
|
||||
@@ -537,18 +580,18 @@ behavior and no user interaction required - pass `-s` flag to enable "silence" m
|
||||
|
||||
`vmctl` allows to limit the number of [significant figures](https://en.wikipedia.org/wiki/Significant_figures)
|
||||
before importing. For example, the average value for response size is `102.342305` bytes and it has 9 significant figures.
|
||||
If you ask a human to pronounce this value then with high probability value will be rounded to first 4 or 5 figures
|
||||
because the rest aren't really that important to mention. In most cases, such a high precision is too much.
|
||||
Moreover, such values may be just a result of [floating point arithmetic](https://en.wikipedia.org/wiki/Floating-point_arithmetic),
|
||||
create a [false precision](https://en.wikipedia.org/wiki/False_precision) and result into bad compression ratio
|
||||
according to [information theory](https://en.wikipedia.org/wiki/Information_theory).
|
||||
If you ask a human to pronounce this value then with high probability value will be rounded to first 4 or 5 figures
|
||||
because the rest aren't really that important to mention. In most cases, such a high precision is too much.
|
||||
Moreover, such values may be just a result of [floating point arithmetic](https://en.wikipedia.org/wiki/Floating-point_arithmetic),
|
||||
create a [false precision](https://en.wikipedia.org/wiki/False_precision) and result into bad compression ratio
|
||||
according to [information theory](https://en.wikipedia.org/wiki/Information_theory).
|
||||
|
||||
`vmctl` provides the following flags for improving data compression:
|
||||
|
||||
* `--vm-round-digits` flag for rounding processed values to the given number of decimal digits after the point.
|
||||
- `--vm-round-digits` flag for rounding processed values to the given number of decimal digits after the point.
|
||||
For example, `--vm-round-digits=2` would round `1.2345` to `1.23`. By default the rounding is disabled.
|
||||
|
||||
* `--vm-significant-figures` flag for limiting the number of significant figures in processed values. It takes no effect if set
|
||||
- `--vm-significant-figures` flag for limiting the number of significant figures in processed values. It takes no effect if set
|
||||
to 0 (by default), but set `--vm-significant-figures=5` and `102.342305` will be rounded to `102.34`.
|
||||
|
||||
The most common case for using these flags is to improve data compression for time series storing aggregation
|
||||
@@ -556,18 +599,25 @@ results such as `average`, `rate`, etc.
|
||||
|
||||
### Adding extra labels
|
||||
|
||||
`vmctl` allows to add extra labels to all imported series. It can be achived with flag `--vm-extra-label label=value`.
|
||||
`vmctl` allows to add extra labels to all imported series. It can be achived with flag `--vm-extra-label label=value`.
|
||||
If multiple labels needs to be added, set flag for each label, for example, `--vm-extra-label label1=value1 --vm-extra-label label2=value2`.
|
||||
If timeseries already have label, that must be added with `--vm-extra-label` flag, flag has priority and will override label value from timeseries.
|
||||
|
||||
### Rate limiting
|
||||
|
||||
Limiting the rate of data transfer could help to reduce pressure on disk or on destination database.
|
||||
The rate limit may be set in bytes-per-second via `--vm-rate-limit` flag.
|
||||
|
||||
Please note, you can also use [vmagent](https://docs.victoriametrics.com/vmagent.html)
|
||||
as a proxy between `vmctl` and destination with `-remoteWrite.rateLimit` flag enabled.
|
||||
|
||||
## How to build
|
||||
|
||||
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmctl` is located in `vmutils-*` archives there.
|
||||
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.17.
|
||||
2. Run `make vmctl` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmctl` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -596,7 +646,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
#### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.17.
|
||||
2. Run `make vmctl-arm` or `make vmctl-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmctl-arm` or `vmctl-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
|
||||
@@ -7,7 +7,8 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
globalSilent = "s"
|
||||
globalSilent = "s"
|
||||
globalVerbose = "verbose"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -17,6 +18,11 @@ var (
|
||||
Value: false,
|
||||
Usage: "Whether to run in silent mode. If set to true no confirmation prompts will appear.",
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: globalVerbose,
|
||||
Value: false,
|
||||
Usage: "Whether to enable verbosity in logs output.",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
@@ -30,7 +36,10 @@ const (
|
||||
vmBatchSize = "vm-batch-size"
|
||||
vmSignificantFigures = "vm-significant-figures"
|
||||
vmRoundDigits = "vm-round-digits"
|
||||
vmExtraLabel = "vm-extra-label"
|
||||
|
||||
// also used in vm-native
|
||||
vmExtraLabel = "vm-extra-label"
|
||||
vmRateLimit = "vm-rate-limit"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -95,6 +104,11 @@ var (
|
||||
Usage: "Extra labels, that will be added to imported timeseries. In case of collision, label value defined by flag" +
|
||||
"will have priority. Flag can be set multiple times, to add few additional labels.",
|
||||
},
|
||||
&cli.Int64Flag{
|
||||
Name: vmRateLimit,
|
||||
Usage: "Optional data transfer rate limit in bytes per second.\n" +
|
||||
"By default the rate limit is disabled. It can be useful for limiting load on configured via '--vmAddr' destination.",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
@@ -354,6 +368,11 @@ var (
|
||||
Usage: "Extra labels, that will be added to imported timeseries. In case of collision, label value defined by flag" +
|
||||
"will have priority. Flag can be set multiple times, to add few additional labels.",
|
||||
},
|
||||
&cli.Int64Flag{
|
||||
Name: vmRateLimit,
|
||||
Usage: "Optional data transfer rate limit in bytes per second.\n" +
|
||||
"By default the rate limit is disabled. It can be useful for limiting load on source or destination databases.",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ func newInfluxProcessor(ic *influx.Client, im *vm.Importer, cc int, separator st
|
||||
}
|
||||
}
|
||||
|
||||
func (ip *influxProcessor) run(silent bool) error {
|
||||
func (ip *influxProcessor) run(silent, verbose bool) error {
|
||||
series, err := ip.ic.Explore()
|
||||
if err != nil {
|
||||
return fmt.Errorf("explore query failed: %s", err)
|
||||
@@ -70,7 +70,7 @@ func (ip *influxProcessor) run(silent bool) error {
|
||||
case infErr := <-errCh:
|
||||
return fmt.Errorf("influx error: %s", infErr)
|
||||
case vmErr := <-ip.im.Errors():
|
||||
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
|
||||
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, verbose))
|
||||
case seriesCh <- s:
|
||||
}
|
||||
}
|
||||
@@ -80,7 +80,9 @@ func (ip *influxProcessor) run(silent bool) error {
|
||||
ip.im.Close()
|
||||
// drain import errors channel
|
||||
for vmErr := range ip.im.Errors() {
|
||||
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
|
||||
if vmErr.Err != nil {
|
||||
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, verbose))
|
||||
}
|
||||
}
|
||||
bar.Finish()
|
||||
log.Println("Import finished!")
|
||||
|
||||
53
app/vmctl/limiter/limiter.go
Normal file
53
app/vmctl/limiter/limiter.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package limiter
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
|
||||
)
|
||||
|
||||
// NewLimiter creates a Limiter object
|
||||
// for the given perSecondLimit
|
||||
func NewLimiter(perSecondLimit int64) *Limiter {
|
||||
return &Limiter{perSecondLimit: perSecondLimit}
|
||||
}
|
||||
|
||||
// Limiter controls the amount of budget
|
||||
// that can be spent according to configured perSecondLimit
|
||||
type Limiter struct {
|
||||
perSecondLimit int64
|
||||
|
||||
// mu protects budget and deadline from concurrent access.
|
||||
mu sync.Mutex
|
||||
|
||||
// The current budget. It is increased by perSecondLimit every second.
|
||||
budget int64
|
||||
|
||||
// The next deadline for increasing the budget by perSecondLimit
|
||||
deadline time.Time
|
||||
}
|
||||
|
||||
// Register blocks for amount of time
|
||||
// needed to process the given dataLen according
|
||||
// to the configured perSecondLimit.
|
||||
func (l *Limiter) Register(dataLen int) {
|
||||
limit := l.perSecondLimit
|
||||
if limit <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
|
||||
for l.budget <= 0 {
|
||||
if d := time.Until(l.deadline); d > 0 {
|
||||
t := timerpool.Get(d)
|
||||
<-t.C
|
||||
timerpool.Put(t)
|
||||
}
|
||||
l.budget += limit
|
||||
l.deadline = time.Now().Add(time.Second)
|
||||
}
|
||||
l.budget -= int64(dataLen)
|
||||
}
|
||||
37
app/vmctl/limiter/writer.go
Normal file
37
app/vmctl/limiter/writer.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package limiter
|
||||
|
||||
import (
|
||||
"io"
|
||||
)
|
||||
|
||||
// NewWriteLimiter creates a new WriteLimiter object
|
||||
// for the give writer and Limiter.
|
||||
func NewWriteLimiter(w io.Writer, limiter *Limiter) *WriteLimiter {
|
||||
return &WriteLimiter{
|
||||
writer: w,
|
||||
limiter: limiter,
|
||||
}
|
||||
}
|
||||
|
||||
// WriteLimiter limits the amount of bytes written
|
||||
// per second via Write() method.
|
||||
// Must be created via NewWriteLimiter.
|
||||
type WriteLimiter struct {
|
||||
writer io.Writer
|
||||
limiter *Limiter
|
||||
}
|
||||
|
||||
// Close implements io.Closer
|
||||
// also calls Close for wrapped io.WriteCloser
|
||||
func (wl *WriteLimiter) Close() error {
|
||||
if c, ok := wl.writer.(io.Closer); ok {
|
||||
return c.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Write implements io.Writer
|
||||
func (wl *WriteLimiter) Write(p []byte) (n int, err error) {
|
||||
wl.limiter.Register(len(p))
|
||||
return wl.writer.Write(p)
|
||||
}
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"os"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
@@ -14,10 +15,17 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/prometheus"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
|
||||
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/native"
|
||||
"github.com/urfave/cli/v2"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var (
|
||||
err error
|
||||
importer *vm.Importer
|
||||
)
|
||||
|
||||
start := time.Now()
|
||||
app := &cli.App{
|
||||
Name: "vmctl",
|
||||
@@ -53,7 +61,7 @@ func main() {
|
||||
}
|
||||
|
||||
otsdbProcessor := newOtsdbProcessor(otsdbClient, importer, c.Int(otsdbConcurrency))
|
||||
return otsdbProcessor.run(c.Bool(globalSilent))
|
||||
return otsdbProcessor.run(c.Bool(globalSilent), c.Bool(globalVerbose))
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -82,14 +90,14 @@ func main() {
|
||||
}
|
||||
|
||||
vmCfg := initConfigVM(c)
|
||||
importer, err := vm.NewImporter(vmCfg)
|
||||
importer, err = vm.NewImporter(vmCfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create VM importer: %s", err)
|
||||
}
|
||||
|
||||
processor := newInfluxProcessor(influxClient, importer,
|
||||
c.Int(influxConcurrency), c.String(influxMeasurementFieldSeparator))
|
||||
return processor.run(c.Bool(globalSilent))
|
||||
return processor.run(c.Bool(globalSilent), c.Bool(globalVerbose))
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -100,7 +108,7 @@ func main() {
|
||||
fmt.Println("Prometheus import mode")
|
||||
|
||||
vmCfg := initConfigVM(c)
|
||||
importer, err := vm.NewImporter(vmCfg)
|
||||
importer, err = vm.NewImporter(vmCfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create VM importer: %s", err)
|
||||
}
|
||||
@@ -123,7 +131,7 @@ func main() {
|
||||
im: importer,
|
||||
cc: c.Int(promConcurrency),
|
||||
}
|
||||
return pp.run(c.Bool(globalSilent))
|
||||
return pp.run(c.Bool(globalSilent), c.Bool(globalVerbose))
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -138,6 +146,7 @@ func main() {
|
||||
}
|
||||
|
||||
p := vmNativeProcessor{
|
||||
rateLimit: c.Int64(vmRateLimit),
|
||||
filter: filter{
|
||||
match: c.String(vmNativeFilterMatch),
|
||||
timeStart: c.String(vmNativeFilterTimeStart),
|
||||
@@ -158,6 +167,39 @@ func main() {
|
||||
return p.run()
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "verify-block",
|
||||
Usage: "Verifies exported block with VictoriaMetrics Native format",
|
||||
Flags: []cli.Flag{
|
||||
&cli.BoolFlag{
|
||||
Name: "gunzip",
|
||||
Usage: "Use GNU zip decompression for exported block",
|
||||
Value: false,
|
||||
},
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
common.StartUnmarshalWorkers()
|
||||
blockPath := c.Args().First()
|
||||
isBlockGzipped := c.Bool("gunzip")
|
||||
if len(blockPath) == 0 {
|
||||
return cli.Exit("you must provide path for exported data block", 1)
|
||||
}
|
||||
log.Printf("verifying block at path=%q", blockPath)
|
||||
f, err := os.OpenFile(blockPath, os.O_RDONLY, 0600)
|
||||
if err != nil {
|
||||
return cli.Exit(fmt.Errorf("cannot open exported block at path=%q err=%w", blockPath, err), 1)
|
||||
}
|
||||
var blocksCount uint64
|
||||
if err := parser.ParseStream(f, isBlockGzipped, func(block *parser.Block) error {
|
||||
atomic.AddUint64(&blocksCount, 1)
|
||||
return nil
|
||||
}); err != nil {
|
||||
return cli.Exit(fmt.Errorf("cannot parse block at path=%q, blocksCount=%d, err=%w", blockPath, blocksCount, err), 1)
|
||||
}
|
||||
log.Printf("successfully verified block at path=%q, blockCount=%d", blockPath, blocksCount)
|
||||
return nil
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -166,12 +208,14 @@ func main() {
|
||||
go func() {
|
||||
<-c
|
||||
fmt.Println("\r- Execution cancelled")
|
||||
os.Exit(0)
|
||||
if importer != nil {
|
||||
importer.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
err := app.Run(os.Args)
|
||||
err = app.Run(os.Args)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
log.Println(err)
|
||||
}
|
||||
log.Printf("Total time: %v", time.Since(start))
|
||||
}
|
||||
@@ -188,5 +232,6 @@ func initConfigVM(c *cli.Context) vm.Config {
|
||||
SignificantFigures: c.Int(vmSignificantFigures),
|
||||
RoundDigits: c.Int(vmRoundDigits),
|
||||
ExtraLabels: c.StringSlice(vmExtraLabel),
|
||||
RateLimit: c.Int64(vmRateLimit),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ func newOtsdbProcessor(oc *opentsdb.Client, im *vm.Importer, otsdbcc int) *otsdb
|
||||
}
|
||||
}
|
||||
|
||||
func (op *otsdbProcessor) run(silent bool) error {
|
||||
func (op *otsdbProcessor) run(silent, verbose bool) error {
|
||||
log.Println("Loading all metrics from OpenTSDB for filters: ", op.oc.Filters)
|
||||
var metrics []string
|
||||
for _, filter := range op.oc.Filters {
|
||||
@@ -111,7 +111,7 @@ func (op *otsdbProcessor) run(silent bool) error {
|
||||
case otsdbErr := <-errCh:
|
||||
return fmt.Errorf("opentsdb error: %s", otsdbErr)
|
||||
case vmErr := <-op.im.Errors():
|
||||
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
|
||||
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, verbose))
|
||||
case seriesCh <- queryObj{
|
||||
Tr: tr, StartTime: startTime,
|
||||
Series: series, Rt: opentsdb.RetentionMeta{
|
||||
@@ -133,7 +133,9 @@ func (op *otsdbProcessor) run(silent bool) error {
|
||||
}
|
||||
op.im.Close()
|
||||
for vmErr := range op.im.Errors() {
|
||||
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
|
||||
if vmErr.Err != nil {
|
||||
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, verbose))
|
||||
}
|
||||
}
|
||||
log.Println("Import finished!")
|
||||
log.Print(op.im.Stats())
|
||||
@@ -143,7 +145,7 @@ func (op *otsdbProcessor) run(silent bool) error {
|
||||
func (op *otsdbProcessor) do(s queryObj) error {
|
||||
start := s.StartTime - s.Tr.Start
|
||||
end := s.StartTime - s.Tr.End
|
||||
data, err := op.oc.GetData(s.Series, s.Rt, start, end)
|
||||
data, err := op.oc.GetData(s.Series, s.Rt, start, end, op.oc.MsecsTime)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to collect data for %v in %v:%v :: %v", s.Series, s.Rt, s.Tr, err)
|
||||
}
|
||||
|
||||
@@ -46,6 +46,7 @@ type Client struct {
|
||||
Filters []string
|
||||
Normalize bool
|
||||
HardTS int64
|
||||
MsecsTime bool
|
||||
}
|
||||
|
||||
// Config contains fields required
|
||||
@@ -82,9 +83,9 @@ type MetaResults struct {
|
||||
// Meta A meta object about a metric
|
||||
// only contain the tags/etc. and no data
|
||||
type Meta struct {
|
||||
//tsuid string
|
||||
Metric string `json:"metric"`
|
||||
Tags map[string]string `json:"tags"`
|
||||
//tsuid string
|
||||
}
|
||||
|
||||
// OtsdbMetric is a single series in OpenTSDB's returned format
|
||||
@@ -152,7 +153,7 @@ func (c Client) FindSeries(metric string) ([]Meta, error) {
|
||||
|
||||
// GetData actually retrieves data for a series at a specified time range
|
||||
// e.g. /api/query?start=1&end=200&m=sum:1m-avg-none:system.load5{host=host1}
|
||||
func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64) (Metric, error) {
|
||||
func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, mSecs bool) (Metric, error) {
|
||||
/*
|
||||
First, build our tag string.
|
||||
It's literally just key=value,key=value,...
|
||||
@@ -195,7 +196,7 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64) (
|
||||
3. bad format of response body
|
||||
*/
|
||||
if resp.StatusCode != 200 {
|
||||
log.Println(fmt.Sprintf("bad response code from OpenTSDB query %v...skipping", resp.StatusCode))
|
||||
log.Println(fmt.Sprintf("bad response code from OpenTSDB query %v for %q...skipping", resp.StatusCode, q))
|
||||
return Metric{}, nil
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
@@ -272,7 +273,11 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64) (
|
||||
then convert the timestamp back to something reasonable.
|
||||
*/
|
||||
for ts, val := range output[0].Dps {
|
||||
data.Timestamps = append(data.Timestamps, ts)
|
||||
if !mSecs {
|
||||
data.Timestamps = append(data.Timestamps, ts*1000)
|
||||
} else {
|
||||
data.Timestamps = append(data.Timestamps, ts)
|
||||
}
|
||||
data.Values = append(data.Values, val)
|
||||
}
|
||||
return data, nil
|
||||
@@ -283,6 +288,7 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64) (
|
||||
func NewClient(cfg Config) (*Client, error) {
|
||||
var retentions []Retention
|
||||
offsetPrint := int64(time.Now().Unix())
|
||||
// convert a number of days to seconds
|
||||
offsetSecs := cfg.Offset * 24 * 60 * 60
|
||||
if cfg.MsecsTime {
|
||||
// 1000000 == Nanoseconds -> Milliseconds difference
|
||||
@@ -318,6 +324,7 @@ func NewClient(cfg Config) (*Client, error) {
|
||||
Filters: cfg.Filters,
|
||||
Normalize: cfg.Normalize,
|
||||
HardTS: cfg.HardTS,
|
||||
MsecsTime: cfg.MsecsTime,
|
||||
}
|
||||
return client, nil
|
||||
}
|
||||
|
||||
@@ -87,6 +87,34 @@ func convertRetention(retention string, offset int64, msecTime bool) (Retention,
|
||||
if len(chunks) != 3 {
|
||||
return Retention{}, fmt.Errorf("invalid retention string: %q", retention)
|
||||
}
|
||||
queryLengthDuration, err := convertDuration(chunks[2])
|
||||
if err != nil {
|
||||
return Retention{}, fmt.Errorf("invalid ttl (second order) duration string: %q: %s", chunks[2], err)
|
||||
}
|
||||
// set ttl in milliseconds, unless we aren't using millisecond time in OpenTSDB...then use seconds
|
||||
queryLength := queryLengthDuration.Milliseconds()
|
||||
if !msecTime {
|
||||
queryLength = queryLength / 1000
|
||||
}
|
||||
queryRange := queryLength
|
||||
// bump by the offset so we don't look at empty ranges any time offset > ttl
|
||||
queryLength += offset
|
||||
|
||||
// first/second order aggregations for queries defined in chunk 0...
|
||||
aggregates := strings.Split(chunks[0], "-")
|
||||
if len(aggregates) != 3 {
|
||||
return Retention{}, fmt.Errorf("invalid aggregation string: %q", chunks[0])
|
||||
}
|
||||
|
||||
aggTimeDuration, err := convertDuration(aggregates[1])
|
||||
if err != nil {
|
||||
return Retention{}, fmt.Errorf("invalid aggregation time duration string: %q: %s", aggregates[1], err)
|
||||
}
|
||||
aggTime := aggTimeDuration.Milliseconds()
|
||||
if !msecTime {
|
||||
aggTime = aggTime / 1000
|
||||
}
|
||||
|
||||
rowLengthDuration, err := convertDuration(chunks[1])
|
||||
if err != nil {
|
||||
return Retention{}, fmt.Errorf("invalid row length (first order) duration string: %q: %s", chunks[1], err)
|
||||
@@ -96,27 +124,35 @@ func convertRetention(retention string, offset int64, msecTime bool) (Retention,
|
||||
if !msecTime {
|
||||
rowLength = rowLength / 1000
|
||||
}
|
||||
ttlDuration, err := convertDuration(chunks[2])
|
||||
if err != nil {
|
||||
return Retention{}, fmt.Errorf("invalid ttl (second order) duration string: %q: %s", chunks[2], err)
|
||||
|
||||
var querySize int64
|
||||
/*
|
||||
The idea here is to ensure each individual query sent to OpenTSDB is *at least*
|
||||
large enough to ensure no single query requests essentially 0 data.
|
||||
*/
|
||||
if rowLength > aggTime {
|
||||
/*
|
||||
We'll look at 2x the row size for each query we perform
|
||||
This is a strange function, but the logic works like this:
|
||||
1. we discover the "number" of ranges we should split the time range into
|
||||
This is found with queryRange / (rowLength * 4)...kind of a percentage query
|
||||
2. we discover the actual size of each "chunk"
|
||||
This is second division step
|
||||
*/
|
||||
querySize = int64(queryRange / (queryRange / (rowLength * 4)))
|
||||
} else {
|
||||
/*
|
||||
Unless the aggTime (how long a range of data we're requesting per individual point)
|
||||
is greater than the row size. Then we'll need to use that to determine
|
||||
how big each individual query should be
|
||||
*/
|
||||
querySize = int64(queryRange / (queryRange / (aggTime * 4)))
|
||||
}
|
||||
// set ttl in milliseconds, unless we aren't using millisecond time in OpenTSDB...then use seconds
|
||||
ttl := ttlDuration.Milliseconds()
|
||||
if !msecTime {
|
||||
ttl = ttl / 1000
|
||||
}
|
||||
// bump by the offset so we don't look at empty ranges any time offset > ttl
|
||||
ttl += offset
|
||||
|
||||
var timeChunks []TimeRange
|
||||
var i int64
|
||||
for i = offset; i <= ttl; i = i + rowLength {
|
||||
timeChunks = append(timeChunks, TimeRange{Start: i + rowLength, End: i})
|
||||
}
|
||||
// first/second order aggregations for queries defined in chunk 0...
|
||||
aggregates := strings.Split(chunks[0], "-")
|
||||
if len(aggregates) != 3 {
|
||||
return Retention{}, fmt.Errorf("invalid aggregation string: %q", chunks[0])
|
||||
for i = offset; i <= queryLength; i = i + querySize {
|
||||
timeChunks = append(timeChunks, TimeRange{Start: i + querySize, End: i})
|
||||
}
|
||||
|
||||
ret := Retention{FirstOrder: aggregates[0],
|
||||
|
||||
@@ -8,7 +8,7 @@ func TestConvertRetention(t *testing.T) {
|
||||
/*
|
||||
2592000 seconds in 30 days
|
||||
3600 in one hour
|
||||
2592000 / 3600 = 720 individual query "ranges" should exist, plus one because time ranges can be weird
|
||||
2592000 / 14400 = 180 individual query "ranges" should exist, plus one because time ranges can be weird
|
||||
First order should == "sum"
|
||||
Second order should == "avg"
|
||||
AggTime should == "1m"
|
||||
@@ -17,8 +17,8 @@ func TestConvertRetention(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("Error parsing valid retention string: %v", err)
|
||||
}
|
||||
if len(res.QueryRanges) != 721 {
|
||||
t.Fatalf("Found %v query ranges. Should have found 720", len(res.QueryRanges))
|
||||
if len(res.QueryRanges) != 181 {
|
||||
t.Fatalf("Found %v query ranges. Should have found 181", len(res.QueryRanges))
|
||||
}
|
||||
if res.FirstOrder != "sum" {
|
||||
t.Fatalf("Incorrect first order aggregation %q. Should have been 'sum'", res.FirstOrder)
|
||||
|
||||
@@ -25,7 +25,7 @@ type prometheusProcessor struct {
|
||||
cc int
|
||||
}
|
||||
|
||||
func (pp *prometheusProcessor) run(silent bool) error {
|
||||
func (pp *prometheusProcessor) run(silent, verbose bool) error {
|
||||
blocks, err := pp.cl.Explore()
|
||||
if err != nil {
|
||||
return fmt.Errorf("explore failed: %s", err)
|
||||
@@ -66,7 +66,7 @@ func (pp *prometheusProcessor) run(silent bool) error {
|
||||
return fmt.Errorf("prometheus error: %s", promErr)
|
||||
case vmErr := <-pp.im.Errors():
|
||||
close(blockReadersCh)
|
||||
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
|
||||
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, verbose))
|
||||
case blockReadersCh <- br:
|
||||
}
|
||||
}
|
||||
@@ -77,7 +77,9 @@ func (pp *prometheusProcessor) run(silent bool) error {
|
||||
pp.im.Close()
|
||||
// drain import errors channel
|
||||
for vmErr := range pp.im.Errors() {
|
||||
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
|
||||
if vmErr.Err != nil {
|
||||
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, verbose))
|
||||
}
|
||||
}
|
||||
bar.Finish()
|
||||
log.Println("Import finished!")
|
||||
|
||||
@@ -23,11 +23,29 @@ func prompt(question string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func wrapErr(vmErr *vm.ImportError) error {
|
||||
func wrapErr(vmErr *vm.ImportError, verbose bool) error {
|
||||
var errTS string
|
||||
var maxTS, minTS int64
|
||||
for _, ts := range vmErr.Batch {
|
||||
errTS += fmt.Sprintf("%s for timestamps range %d - %d\n",
|
||||
ts.String(), ts.Timestamps[0], ts.Timestamps[len(ts.Timestamps)-1])
|
||||
if minTS < ts.Timestamps[0] || minTS == 0 {
|
||||
minTS = ts.Timestamps[0]
|
||||
}
|
||||
if maxTS < ts.Timestamps[len(ts.Timestamps)-1] {
|
||||
maxTS = ts.Timestamps[len(ts.Timestamps)-1]
|
||||
}
|
||||
if verbose {
|
||||
errTS += fmt.Sprintf("%s for timestamps range %d - %d\n",
|
||||
ts.String(), ts.Timestamps[0], ts.Timestamps[len(ts.Timestamps)-1])
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("%s with error: %s", errTS, vmErr.Err)
|
||||
var verboseMsg string
|
||||
if !verbose {
|
||||
verboseMsg = "(enable `--verbose` output to get more details)"
|
||||
}
|
||||
if vmErr.Err == nil {
|
||||
return fmt.Errorf("%s\n\tLatest delivered batch for timestamps range %d - %d %s\n%s",
|
||||
vmErr.Err, minTS, maxTS, verboseMsg, errTS)
|
||||
}
|
||||
return fmt.Errorf("%s\n\tImporting batch failed for timestamps range %d - %d %s\n%s",
|
||||
vmErr.Err, minTS, maxTS, verboseMsg, errTS)
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/limiter"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
)
|
||||
|
||||
@@ -47,6 +48,9 @@ type Config struct {
|
||||
RoundDigits int
|
||||
// ExtraLabels that will be added to all imported series. Must be in label=value format.
|
||||
ExtraLabels []string
|
||||
// RateLimit defines a data transfer speed in bytes per second.
|
||||
// Is applied to each worker (see Concurrency) independently.
|
||||
RateLimit int64
|
||||
}
|
||||
|
||||
// Importer performs insertion of timeseries
|
||||
@@ -63,6 +67,8 @@ type Importer struct {
|
||||
input chan *TimeSeries
|
||||
errors chan *ImportError
|
||||
|
||||
rl *limiter.Limiter
|
||||
|
||||
wg sync.WaitGroup
|
||||
once sync.Once
|
||||
|
||||
@@ -123,6 +129,7 @@ func NewImporter(cfg Config) (*Importer, error) {
|
||||
compress: cfg.Compress,
|
||||
user: cfg.User,
|
||||
password: cfg.Password,
|
||||
rl: limiter.NewLimiter(cfg.RateLimit),
|
||||
close: make(chan struct{}),
|
||||
input: make(chan *TimeSeries, cfg.Concurrency*4),
|
||||
errors: make(chan *ImportError, cfg.Concurrency),
|
||||
@@ -149,9 +156,11 @@ func NewImporter(cfg Config) (*Importer, error) {
|
||||
// ImportError is type of error generated
|
||||
// in case of unsuccessful import request
|
||||
type ImportError struct {
|
||||
// The batch of timeseries that failed
|
||||
// The batch of timeseries processed by importer at the moment
|
||||
Batch []*TimeSeries
|
||||
// The error that appeared during insert
|
||||
// If err is nil - no error happened and Batch
|
||||
// Is the latest delivered Batch.
|
||||
Err error
|
||||
}
|
||||
|
||||
@@ -180,12 +189,13 @@ func (im *Importer) startWorker(batchSize, significantFigures, roundDigits int)
|
||||
for {
|
||||
select {
|
||||
case <-im.close:
|
||||
if err := im.Import(batch); err != nil {
|
||||
im.errors <- &ImportError{
|
||||
Batch: batch,
|
||||
Err: err,
|
||||
}
|
||||
exitErr := &ImportError{
|
||||
Batch: batch,
|
||||
}
|
||||
if err := im.Import(batch); err != nil {
|
||||
exitErr.Err = err
|
||||
}
|
||||
im.errors <- exitErr
|
||||
return
|
||||
case ts := <-im.input:
|
||||
// init waitForBatch when first
|
||||
@@ -301,12 +311,13 @@ func (im *Importer) Import(tsBatch []*TimeSeries) error {
|
||||
|
||||
w := io.Writer(pw)
|
||||
if im.compress {
|
||||
zw, err := gzip.NewWriterLevel(pw, 1)
|
||||
zw, err := gzip.NewWriterLevel(w, 1)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unexpected error when creating gzip writer: %s", err)
|
||||
}
|
||||
w = zw
|
||||
}
|
||||
w = limiter.NewWriteLimiter(w, im.rl)
|
||||
bw := bufio.NewWriterSize(w, 16*1024)
|
||||
|
||||
var totalSamples, totalBytes int
|
||||
@@ -321,8 +332,8 @@ func (im *Importer) Import(tsBatch []*TimeSeries) error {
|
||||
if err := bw.Flush(); err != nil {
|
||||
return err
|
||||
}
|
||||
if im.compress {
|
||||
err := w.(*gzip.Writer).Close()
|
||||
if closer, ok := w.(io.Closer); ok {
|
||||
err := closer.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -7,12 +7,15 @@ import (
|
||||
"log"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/limiter"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
|
||||
)
|
||||
|
||||
type vmNativeProcessor struct {
|
||||
filter filter
|
||||
filter filter
|
||||
rateLimit int64
|
||||
|
||||
dst *vmNativeClient
|
||||
src *vmNativeClient
|
||||
@@ -84,7 +87,12 @@ func (p *vmNativeProcessor) run() error {
|
||||
bar := pb.ProgressBarTemplate(barTpl).Start64(0)
|
||||
barReader := bar.NewProxyReader(exportReader)
|
||||
|
||||
_, err = io.Copy(pw, barReader)
|
||||
w := io.Writer(pw)
|
||||
if p.rateLimit > 0 {
|
||||
rl := limiter.NewLimiter(p.rateLimit)
|
||||
w = limiter.NewWriteLimiter(pw, rl)
|
||||
}
|
||||
_, err = io.Copy(w, barReader)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to write into %q: %s", p.dst.addr, err)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# vmgateway
|
||||
|
||||
***vmgateway is a part of [enterprise package](https://victoriametrics.com/enterprise.html). It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)***
|
||||
|
||||
***vmgateway is a part of [enterprise package](https://victoriametrics.com/products/enterprise/). It is available for download and evaluation at [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)***
|
||||
|
||||
<img alt="vmgateway" src="vmgateway-overview.jpeg">
|
||||
|
||||
@@ -14,8 +13,7 @@
|
||||
* Provides access by tenantID in the Cluster version
|
||||
* Allows for separate write/read/admin access to data
|
||||
|
||||
`vmgateway` is included in our [enterprise packages](https://victoriametrics.com/enterprise.html).
|
||||
|
||||
`vmgateway` is included in our [enterprise packages](https://victoriametrics.com/products/enterprise/).
|
||||
|
||||
## Access Control
|
||||
|
||||
@@ -24,6 +22,7 @@
|
||||
`vmgateway` supports jwt based authentication. With jwt payload can be configured to give access to specific tenants and labels as well as to read/write.
|
||||
|
||||
jwt token must be in following format:
|
||||
|
||||
```json
|
||||
{
|
||||
"exp": 1617304574,
|
||||
@@ -36,16 +35,20 @@ jwt token must be in following format:
|
||||
"team": "dev",
|
||||
"project": "mobile"
|
||||
},
|
||||
"extra_filters": ["{env~=\"prod|dev\",team!=\"test\"}"],
|
||||
"mode": 1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Where:
|
||||
- `exp` - required, expire time in unix_timestamp. If the token expires then `vmgateway` rejects the request.
|
||||
- `vm_access` - required, dict with claim info, minimum form: `{"vm_access": {"tenand_id": {}}`
|
||||
- `tenant_id` - optional, for cluster mode, routes requests to the corresponding tenant.
|
||||
- `extra_labels` - optional, key-value pairs for label filters added to the ingested or selected metrics.
|
||||
- `mode` - optional, access mode for api - read, write, or full. Supported values: 0 - full (default value), 1 - read, 2 - write.
|
||||
|
||||
* `exp` - required, expire time in unix_timestamp. If the token expires then `vmgateway` rejects the request.
|
||||
* `vm_access` - required, dict with claim info, minimum form: `{"vm_access": {"tenand_id": {}}`
|
||||
* `tenant_id` - optional, for cluster mode, routes requests to the corresponding tenant.
|
||||
* `extra_labels` - optional, key-value pairs for label filters added to the ingested or selected metrics. Multiple filters are added with `and` operation. If defined, `extra_label` from original request removed.
|
||||
* `extra_filters` - optional, [series selectors](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) added to the select query requests. Multiple selectors are added with `or` operation. If defined, `extra_filter` from original request removed.
|
||||
* `mode` - optional, access mode for api - read, write, or full. Supported values: 0 - full (default value), 1 - read, 2 - write.
|
||||
|
||||
## QuickStart
|
||||
|
||||
@@ -64,18 +67,19 @@ Start vmgateway
|
||||
```
|
||||
|
||||
Retrieve data from the database
|
||||
|
||||
```bash
|
||||
curl 'http://localhost:8431/api/v1/series/count' -H 'Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ2bV9hY2Nlc3MiOnsidGVuYW50X2lkIjp7fSwicm9sZSI6MX0sImV4cCI6MTkzOTM0NjIxMH0.5WUxEfdcV9hKo4CtQdtuZYOGpGXWwaqM9VuVivMMrVg'
|
||||
```
|
||||
|
||||
A request with an incorrect token or without any token will be rejected:
|
||||
|
||||
```bash
|
||||
curl 'http://localhost:8431/api/v1/series/count'
|
||||
|
||||
curl 'http://localhost:8431/api/v1/series/count' -H 'Authorization: Bearer incorrect-token'
|
||||
```
|
||||
|
||||
|
||||
## Rate Limiter
|
||||
|
||||
<img alt="vmgateway-rl" src="vmgateway-rate-limiting.jpg">
|
||||
@@ -86,14 +90,16 @@ Limits incoming requests by given, pre-configured limits. It supports read and w
|
||||
The metrics that you want to rate limit must be scraped from the cluster.
|
||||
|
||||
List of supported limit types:
|
||||
- `queries` - count of api requests made at tenant to read the api, such as `/api/v1/query`, `/api/v1/series` and others.
|
||||
- `active_series` - count of current active series at any given tenant.
|
||||
- `new_series` - count of created series; aka churn rate
|
||||
- `rows_inserted` - count of inserted rows per tenant.
|
||||
|
||||
* `queries` - count of api requests made at tenant to read the api, such as `/api/v1/query`, `/api/v1/series` and others.
|
||||
* `active_series` - count of current active series at any given tenant.
|
||||
* `new_series` - count of created series; aka churn rate
|
||||
* `rows_inserted` - count of inserted rows per tenant.
|
||||
|
||||
List of supported time windows:
|
||||
- `minute`
|
||||
- `hour`
|
||||
|
||||
* `minute`
|
||||
* `hour`
|
||||
|
||||
Limits can be specified per tenant or at a global level if you omit `project_id` and `account_id`.
|
||||
|
||||
@@ -117,6 +123,7 @@ limits:
|
||||
## QuickStart
|
||||
|
||||
cluster version of VictoriaMetrics is required for rate limiting.
|
||||
|
||||
```bash
|
||||
# start datasource for cluster metrics
|
||||
|
||||
@@ -167,6 +174,7 @@ curl 'http://localhost:8431/api/v1/labels' -H 'Authorization: Bearer eyJhbGciOiJ
|
||||
## Configuration
|
||||
|
||||
The shortlist of configuration flags include the following:
|
||||
|
||||
```console
|
||||
-clusterMode
|
||||
enable this for the cluster version
|
||||
@@ -205,7 +213,7 @@ The shortlist of configuration flags include the following:
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/legal/eula/
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches as they cannot read data files larger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-http.connTimeout duration
|
||||
@@ -274,12 +282,11 @@ The shortlist of configuration flags include the following:
|
||||
## TroubleShooting
|
||||
|
||||
* Access control:
|
||||
* incorrect `jwt` format, try https://jwt.io/#debugger-io with our tokens
|
||||
* incorrect `jwt` format, try <https://jwt.io/#debugger-io> with our tokens
|
||||
* expired token, check `exp` field.
|
||||
* Rate Limiting:
|
||||
* `scrape_interval` at datasource, reduce it to apply limits faster.
|
||||
|
||||
|
||||
## Limitations
|
||||
|
||||
* Access Control:
|
||||
|
||||
@@ -23,6 +23,7 @@ var (
|
||||
measurementFieldSeparator = flag.String("influxMeasurementFieldSeparator", "_", "Separator for '{measurement}{separator}{field_name}' metric name when inserted via InfluxDB line protocol")
|
||||
skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if InfluxDB line contains only a single field")
|
||||
skipMeasurement = flag.Bool("influxSkipMeasurement", false, "Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'")
|
||||
dbLabel = flag.String("influxDBLabel", "db", "Default label for the DB name sent over '?db={db_name}' query parameter")
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -80,13 +81,13 @@ func insertRows(db string, rows []parser.Row, extraLabels []prompbmarshal.Label)
|
||||
hasDBKey := false
|
||||
for j := range r.Tags {
|
||||
tag := &r.Tags[j]
|
||||
if tag.Key == "db" {
|
||||
if tag.Key == *dbLabel {
|
||||
hasDBKey = true
|
||||
}
|
||||
ic.AddLabel(tag.Key, tag.Value)
|
||||
}
|
||||
if !hasDBKey {
|
||||
ic.AddLabel("db", db)
|
||||
ic.AddLabel(*dbLabel, db)
|
||||
}
|
||||
for j := range extraLabels {
|
||||
label := &extraLabels[j]
|
||||
|
||||
@@ -148,6 +148,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
return true
|
||||
case "/influx/write", "/influx/api/v2/write", "/write", "/api/v2/write":
|
||||
influxWriteRequests.Inc()
|
||||
addInfluxResponseHeaders(w)
|
||||
if err := influx.InsertHandlerForHTTP(r); err != nil {
|
||||
influxWriteErrors.Inc()
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
@@ -157,6 +158,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
return true
|
||||
case "/influx/query", "/query":
|
||||
influxQueryRequests.Inc()
|
||||
addInfluxResponseHeaders(w)
|
||||
influxutils.WriteDatabaseNames(w)
|
||||
return true
|
||||
case "/datadog/api/v1/series":
|
||||
@@ -199,6 +201,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
state := r.FormValue("state")
|
||||
promscrape.WriteAPIV1Targets(w, state)
|
||||
return true
|
||||
case "/prometheus/target_response", "/target_response":
|
||||
promscrapeTargetResponseRequests.Inc()
|
||||
if err := promscrape.WriteTargetResponse(w, r); err != nil {
|
||||
promscrapeTargetResponseErrors.Inc()
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
return true
|
||||
case "/prometheus/config", "/config":
|
||||
if *configAuthKey != "" && r.FormValue("authKey") != *configAuthKey {
|
||||
err := &httpserver.ErrorWithStatusCode{
|
||||
@@ -233,6 +243,12 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func addInfluxResponseHeaders(w http.ResponseWriter) {
|
||||
// This is needed for some clients, which expect InfluxDB version header.
|
||||
// See, for example, https://github.com/ntop/ntopng/issues/5449#issuecomment-1005347597
|
||||
w.Header().Set("X-Influxdb-Version", "1.8.0")
|
||||
}
|
||||
|
||||
var (
|
||||
requestDuration = metrics.NewHistogram(`vminsert_request_duration_seconds`)
|
||||
|
||||
@@ -266,6 +282,9 @@ var (
|
||||
promscrapeTargetsRequests = metrics.NewCounter(`vm_http_requests_total{path="/targets"}`)
|
||||
promscrapeAPIV1TargetsRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/targets"}`)
|
||||
|
||||
promscrapeTargetResponseRequests = metrics.NewCounter(`vm_http_requests_total{path="/target_response"}`)
|
||||
promscrapeTargetResponseErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/target_response"}`)
|
||||
|
||||
promscrapeConfigRequests = metrics.NewCounter(`vm_http_requests_total{path="/config"}`)
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vm_http_requests_total{path="/-/reload"}`)
|
||||
|
||||
@@ -27,8 +27,9 @@ func InsertHandler(req *http.Request) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
isGzip := req.Header.Get("Content-Encoding") == "gzip"
|
||||
return writeconcurrencylimiter.Do(func() error {
|
||||
return parser.ParseStream(req, func(block *parser.Block) error {
|
||||
return parser.ParseStream(req.Body, isGzip, func(block *parser.Block) error {
|
||||
return insertRows(block, extraLabels)
|
||||
})
|
||||
})
|
||||
|
||||
@@ -27,6 +27,12 @@ vmrestore-ppc64le-prod:
|
||||
vmrestore-386-prod:
|
||||
APP_NAME=vmrestore $(MAKE) app-via-docker-386
|
||||
|
||||
vmrestore-darwin-amd64-prod:
|
||||
APP_NAME=vmrestore $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
vmrestore-darwin-arm64-prod:
|
||||
APP_NAME=vmrestore $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
package-vmrestore:
|
||||
APP_NAME=vmrestore $(MAKE) package-via-docker
|
||||
|
||||
|
||||
@@ -6,12 +6,11 @@ VictoriaMetrics `v1.29.0` and newer versions must be used for working with the r
|
||||
Restore process can be interrupted at any time. It is automatically resumed from the interruption point
|
||||
when restarting `vmrestore` with the same args.
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
VictoriaMetrics must be stopped during the restore process.
|
||||
|
||||
```
|
||||
```bash
|
||||
vmrestore -src=gs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/restore>
|
||||
|
||||
```
|
||||
@@ -24,13 +23,11 @@ vmrestore -src=gs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/re
|
||||
The original `-storageDataPath` directory may contain old files. They will be substituted by the files from backup,
|
||||
i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/questions/476041/how-do-i-make-rsync-delete-files-that-have-been-deleted-from-the-source-folder).
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
* If `vmrestore` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
|
||||
* If `vmrestore` has been interrupted due to temporary error, then just restart it with the same args. It will resume the restore process.
|
||||
|
||||
|
||||
## Advanced usage
|
||||
|
||||
* Obtaining credentials from a file.
|
||||
@@ -38,6 +35,7 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
|
||||
Add flag `-credsFilePath=/etc/credentials` with following content:
|
||||
|
||||
for s3 (aws, minio or other s3 compatible storages):
|
||||
|
||||
```bash
|
||||
[default]
|
||||
aws_access_key_id=theaccesskey
|
||||
@@ -45,6 +43,7 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
|
||||
```
|
||||
|
||||
for gce cloud storage:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "service_account",
|
||||
@@ -62,7 +61,8 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
|
||||
|
||||
* Usage with s3 custom url endpoint. It is possible to use `vmrestore` with s3 api compatible storages, like minio, cloudian and other.
|
||||
You have to add custom url endpoint with a flag:
|
||||
```
|
||||
|
||||
```bash
|
||||
# for minio:
|
||||
-customS3Endpoint=http://localhost:9000
|
||||
|
||||
@@ -70,100 +70,98 @@ i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/q
|
||||
-customS3Endpoint=https://s3-fips.us-gov-west-1.amazonaws.com
|
||||
```
|
||||
|
||||
* Run `vmrestore -help` in order to see all the available options:
|
||||
* Run `vmrestore -help` in order to see all the available options:
|
||||
|
||||
```
|
||||
```bash
|
||||
-concurrency int
|
||||
The number of concurrent workers. Higher concurrency may reduce restore duration (default 10)
|
||||
The number of concurrent workers. Higher concurrency may reduce restore duration (default 10)
|
||||
-configFilePath string
|
||||
Path to file with S3 configs. Configs are loaded from default location if not set.
|
||||
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
Path to file with S3 configs. Configs are loaded from default location if not set.
|
||||
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
-configProfile string
|
||||
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used
|
||||
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used
|
||||
-credsFilePath string
|
||||
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
|
||||
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
|
||||
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
|
||||
-customS3Endpoint string
|
||||
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
|
||||
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set. See https://docs.victoriametrics.com/#environment-variables for more details
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
TCP address for exporting metrics at /metrics page (default ":8421")
|
||||
TCP address for exporting metrics at /metrics page (default ":8421")
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxBytesPerSecond size
|
||||
The maximum download speed. There is no limit if it is set to 0
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
The maximum download speed. There is no limit if it is set to 0
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /metrics. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
Auth key for /debug/pprof. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-s3ForcePathStyle
|
||||
Prefixing endpoint with bucket name when set false, true by default. (default true)
|
||||
Prefixing endpoint with bucket name when set false, true by default. (default true)
|
||||
-skipBackupCompleteCheck
|
||||
Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file
|
||||
Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file
|
||||
-src string
|
||||
Source path with backup on the remote storage. Example: gs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
Source path with backup on the remote storage. Example: gs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
|
||||
-storageDataPath string
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir is synchronized with -src contents, i.e. it works like 'rsync --delete' (default "victoria-metrics-data")
|
||||
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir is synchronized with -src contents, i.e. it works like 'rsync --delete' (default "victoria-metrics-data")
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
|
||||
## How to build from sources
|
||||
|
||||
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - see `vmutils-*` archives there.
|
||||
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.17.
|
||||
2. Run `make vmrestore` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmrestore` binary and puts it into the `bin` folder.
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
@@ -59,7 +60,7 @@ func (bw *Writer) Write(p []byte) (int, error) {
|
||||
return 0, bw.err
|
||||
}
|
||||
n, err := bw.bw.Write(p)
|
||||
if err != nil {
|
||||
if err != nil && !isTrivialNetworkError(err) {
|
||||
bw.err = fmt.Errorf("cannot send %d bytes to client: %w", len(p), err)
|
||||
}
|
||||
return n, bw.err
|
||||
@@ -72,7 +73,7 @@ func (bw *Writer) Flush() error {
|
||||
if bw.err != nil {
|
||||
return bw.err
|
||||
}
|
||||
if err := bw.bw.Flush(); err != nil {
|
||||
if err := bw.bw.Flush(); err != nil && !isTrivialNetworkError(err) {
|
||||
bw.err = fmt.Errorf("cannot flush data to client: %w", err)
|
||||
}
|
||||
return bw.err
|
||||
@@ -84,3 +85,12 @@ func (bw *Writer) Error() error {
|
||||
defer bw.lock.Unlock()
|
||||
return bw.err
|
||||
}
|
||||
|
||||
func isTrivialNetworkError(err error) bool {
|
||||
// Suppress trivial network errors, which could occur at remote side.
|
||||
s := err.Error()
|
||||
if strings.Contains(s, "broken pipe") || strings.Contains(s, "reset by peer") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -54,7 +54,7 @@ func TagsDelSeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.Re
|
||||
})
|
||||
}
|
||||
tfss := joinTagFilterss(tfs, etfs)
|
||||
sq := storage.NewSearchQuery(0, ct, tfss)
|
||||
sq := storage.NewSearchQuery(0, ct, tfss, 0)
|
||||
n, err := netstorage.DeleteSeries(sq, deadline)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot delete series for %q: %w", sq, err)
|
||||
@@ -196,7 +196,7 @@ func TagsAutoCompleteValuesHandler(startTime time.Time, w http.ResponseWriter, r
|
||||
}
|
||||
} else {
|
||||
// Slow path: use netstorage.SearchMetricNames for applying `expr` filters.
|
||||
sq, err := getSearchQueryForExprs(startTime, etfs, exprs)
|
||||
sq, err := getSearchQueryForExprs(startTime, etfs, exprs, limit*10)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -282,7 +282,7 @@ func TagsAutoCompleteTagsHandler(startTime time.Time, w http.ResponseWriter, r *
|
||||
}
|
||||
} else {
|
||||
// Slow path: use netstorage.SearchMetricNames for applying `expr` filters.
|
||||
sq, err := getSearchQueryForExprs(startTime, etfs, exprs)
|
||||
sq, err := getSearchQueryForExprs(startTime, etfs, exprs, limit*10)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -349,7 +349,7 @@ func TagsFindSeriesHandler(startTime time.Time, w http.ResponseWriter, r *http.R
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot setup tag filters: %w", err)
|
||||
}
|
||||
sq, err := getSearchQueryForExprs(startTime, etfs, exprs)
|
||||
sq, err := getSearchQueryForExprs(startTime, etfs, exprs, limit*10)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -474,14 +474,14 @@ func getInt(r *http.Request, argName string) (int, error) {
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func getSearchQueryForExprs(startTime time.Time, etfs [][]storage.TagFilter, exprs []string) (*storage.SearchQuery, error) {
|
||||
func getSearchQueryForExprs(startTime time.Time, etfs [][]storage.TagFilter, exprs []string, maxMetrics int) (*storage.SearchQuery, error) {
|
||||
tfs, err := exprsToTagFilters(exprs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ct := startTime.UnixNano() / 1e6
|
||||
tfss := joinTagFilterss(tfs, etfs)
|
||||
sq := storage.NewSearchQuery(0, ct, tfss)
|
||||
sq := storage.NewSearchQuery(0, ct, tfss, maxMetrics)
|
||||
return sq, nil
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user