mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-01 08:02:53 +03:00
Compare commits
567 Commits
docs/vmano
...
v1.95.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3545633934 | ||
|
|
e9d86d7e52 | ||
|
|
aefd744abb | ||
|
|
aae06e003e | ||
|
|
84b3c1d3bc | ||
|
|
2ea03cf80d | ||
|
|
1fbd0dd9d8 | ||
|
|
8924fea33a | ||
|
|
8dfc874be3 | ||
|
|
61035419d5 | ||
|
|
2cbdb1db22 | ||
|
|
d389a4fcf3 | ||
|
|
ec7cac3641 | ||
|
|
0a28c8e91b | ||
|
|
98e73a4022 | ||
|
|
17c45d1206 | ||
|
|
85bf63078c | ||
|
|
7d4873bcef | ||
|
|
a79a439dac | ||
|
|
4c25ee3597 | ||
|
|
ebd76588da | ||
|
|
9b248e3b2f | ||
|
|
0ea35215f6 | ||
|
|
91f5c24f82 | ||
|
|
8d9e365512 | ||
|
|
741013a33f | ||
|
|
d9a7dea9a1 | ||
|
|
e837968e49 | ||
|
|
5bfa2a3e97 | ||
|
|
68d2cb203d | ||
|
|
02624c41eb | ||
|
|
ef2113e5df | ||
|
|
7b2e2a23c2 | ||
|
|
efd5d1f2dd | ||
|
|
201fb6ec5c | ||
|
|
3076c1f400 | ||
|
|
6a533023b1 | ||
|
|
0bf48a5d2a | ||
|
|
feff13851c | ||
|
|
3121d76bee | ||
|
|
cb106bdf39 | ||
|
|
5d61a7327d | ||
|
|
d3ae2b2f62 | ||
|
|
d6ae082598 | ||
|
|
fbc6289a21 | ||
|
|
f9bd265249 | ||
|
|
7cf7740d18 | ||
|
|
0116a333fb | ||
|
|
43e3302803 | ||
|
|
0e056ddb2d | ||
|
|
12aefa8a4b | ||
|
|
37997abd14 | ||
|
|
cef7a39ba3 | ||
|
|
89dcbc2fe7 | ||
|
|
8eed04b2c6 | ||
|
|
0feaeca3c1 | ||
|
|
8af56ea2ed | ||
|
|
cf23dc6480 | ||
|
|
b7fb7c5f77 | ||
|
|
3e93fa61ad | ||
|
|
c8d901424f | ||
|
|
66527c5981 | ||
|
|
6340911d38 | ||
|
|
4722b70c89 | ||
|
|
ba058a4514 | ||
|
|
25ff811d78 | ||
|
|
eded218e8c | ||
|
|
61594d2bd8 | ||
|
|
ec72b750f2 | ||
|
|
bfec8a3751 | ||
|
|
0df8489e0a | ||
|
|
230230cf0b | ||
|
|
80213f07fa | ||
|
|
2db1a664e1 | ||
|
|
010dc15d16 | ||
|
|
22498c5087 | ||
|
|
dc668b8246 | ||
|
|
a9a26c20b5 | ||
|
|
d407d13e7b | ||
|
|
2474281f1b | ||
|
|
73a1862182 | ||
|
|
930d26b2ff | ||
|
|
2b15f2fc8c | ||
|
|
746cc93e95 | ||
|
|
bffd30b57a | ||
|
|
5afc6a5765 | ||
|
|
b51d16e74c | ||
|
|
e4f44bad91 | ||
|
|
ea4ca70be1 | ||
|
|
4d2b98b755 | ||
|
|
c07dc45786 | ||
|
|
f90d2ec843 | ||
|
|
054367c421 | ||
|
|
323f3720ed | ||
|
|
da24c129db | ||
|
|
f2b373dd06 | ||
|
|
815fda8995 | ||
|
|
65db6609eb | ||
|
|
b5254199c6 | ||
|
|
6eb205f8b0 | ||
|
|
90d45574bf | ||
|
|
eeb25bc4a5 | ||
|
|
dd33fc0c76 | ||
|
|
536c1301fd | ||
|
|
87a86ec9db | ||
|
|
ed70a40669 | ||
|
|
828ddd4e4f | ||
|
|
4497a08e3d | ||
|
|
f59dda3223 | ||
|
|
b8739bc00b | ||
|
|
b44b74d118 | ||
|
|
324ecbeed6 | ||
|
|
da887b49e7 | ||
|
|
e482eeff58 | ||
|
|
92826b0b4a | ||
|
|
0189081490 | ||
|
|
3e76c3bd50 | ||
|
|
c4c6ee9485 | ||
|
|
a70818f72f | ||
|
|
ea81f6fc36 | ||
|
|
fba93dbe0b | ||
|
|
41a0fdaf39 | ||
|
|
c96fc05f3e | ||
|
|
51aab7bb17 | ||
|
|
714af89b13 | ||
|
|
98699f203b | ||
|
|
efb6ac27c2 | ||
|
|
68f82b1c06 | ||
|
|
1020faa7b4 | ||
|
|
aade16f534 | ||
|
|
a71efce784 | ||
|
|
4ac95b6f49 | ||
|
|
7ac49162c6 | ||
|
|
f6208965ce | ||
|
|
a950873fff | ||
|
|
a8051d48c4 | ||
|
|
0638bbe69c | ||
|
|
aaf9e3d526 | ||
|
|
9866974a53 | ||
|
|
8874b525b7 | ||
|
|
ca7457d906 | ||
|
|
23369321f1 | ||
|
|
abcb21aa5e | ||
|
|
e964df8039 | ||
|
|
a64b37cf24 | ||
|
|
ad839aa492 | ||
|
|
29cebd82fb | ||
|
|
9149353a36 | ||
|
|
613b545dfd | ||
|
|
90427abc65 | ||
|
|
632d788b63 | ||
|
|
7c90ce39cb | ||
|
|
3aec7eb44f | ||
|
|
b60bb1d98a | ||
|
|
68b1b3c4d4 | ||
|
|
cdbc06a639 | ||
|
|
8b41b506c2 | ||
|
|
5464376d16 | ||
|
|
ac933cc423 | ||
|
|
612dcf231a | ||
|
|
d5a599badc | ||
|
|
c22e3e7b1d | ||
|
|
eed5206376 | ||
|
|
4afcb2a689 | ||
|
|
cb34d4440c | ||
|
|
42dd71bb63 | ||
|
|
305c96e384 | ||
|
|
c07909a20b | ||
|
|
eed0c3c6b0 | ||
|
|
a216fe6728 | ||
|
|
c9375cac5e | ||
|
|
4e0a779efe | ||
|
|
003ef3a518 | ||
|
|
685f9c3c98 | ||
|
|
3e0236b4a3 | ||
|
|
33484d3365 | ||
|
|
239849db5d | ||
|
|
2f21c0c119 | ||
|
|
b8b6e120ff | ||
|
|
ad9e2b21ee | ||
|
|
38b8872a47 | ||
|
|
ad871dd9ed | ||
|
|
ea4758f5cd | ||
|
|
0a6932acfe | ||
|
|
fd2d07ba33 | ||
|
|
188cfe3a85 | ||
|
|
e16d3f5639 | ||
|
|
3e2f09541e | ||
|
|
c2d252c045 | ||
|
|
edba9f6266 | ||
|
|
f89051f83f | ||
|
|
14f3d844fe | ||
|
|
daaf2b0e61 | ||
|
|
da77f4deeb | ||
|
|
484b5ed12f | ||
|
|
6c3dd16a16 | ||
|
|
bdb743c88d | ||
|
|
1d3d989be5 | ||
|
|
4f102ff945 | ||
|
|
b8c267075e | ||
|
|
e25318e8ca | ||
|
|
fc98b62760 | ||
|
|
6f98b9c221 | ||
|
|
56b9c0b717 | ||
|
|
07150359b2 | ||
|
|
b248413a07 | ||
|
|
8b6ccad41d | ||
|
|
7abdbbc8e2 | ||
|
|
97a7128e4b | ||
|
|
2c334ed953 | ||
|
|
ddbe713470 | ||
|
|
3594214a16 | ||
|
|
8dff1c696f | ||
|
|
a200aaf0ef | ||
|
|
b808fe959a | ||
|
|
065a6b020b | ||
|
|
f5c46b8176 | ||
|
|
3d7a77bf82 | ||
|
|
83d64aa621 | ||
|
|
8d3b732ce0 | ||
|
|
60eab8e2ef | ||
|
|
fe8cc573d1 | ||
|
|
1ca0b86397 | ||
|
|
84cf96228f | ||
|
|
b246b0e55f | ||
|
|
d1638db849 | ||
|
|
3f5a41e35e | ||
|
|
36b0cea3f3 | ||
|
|
dc28196237 | ||
|
|
98a5007d32 | ||
|
|
4f4bc515e0 | ||
|
|
964e6ccc17 | ||
|
|
930a36df40 | ||
|
|
0b66840520 | ||
|
|
d984598e30 | ||
|
|
31f7ef0811 | ||
|
|
8c4ce186d3 | ||
|
|
d43566605b | ||
|
|
2fc7e9f47e | ||
|
|
6dc5306c9b | ||
|
|
a6d06474f4 | ||
|
|
1f91f22b5f | ||
|
|
2aa0f5fc41 | ||
|
|
244c887825 | ||
|
|
b57756734e | ||
|
|
c5044cdba9 | ||
|
|
636d4ea196 | ||
|
|
7183621d84 | ||
|
|
dc79b25771 | ||
|
|
cb13ce9147 | ||
|
|
b3cc22b159 | ||
|
|
a8345bb1b9 | ||
|
|
7cf615a73b | ||
|
|
f60c08a7bd | ||
|
|
cc7d5b7bab | ||
|
|
3d5d62e38a | ||
|
|
b576acff5e | ||
|
|
09e6606b38 | ||
|
|
490694bb01 | ||
|
|
8b90bb3bc0 | ||
|
|
c606d3efde | ||
|
|
590bd50df4 | ||
|
|
10677af9df | ||
|
|
cd0683a3af | ||
|
|
ade64b5783 | ||
|
|
d863d11b5a | ||
|
|
75dd7b30ba | ||
|
|
94bdc91a56 | ||
|
|
ce17f81b00 | ||
|
|
c21863a956 | ||
|
|
61935e3387 | ||
|
|
960f72368a | ||
|
|
2dd9b69101 | ||
|
|
de651165bd | ||
|
|
7b35eaa853 | ||
|
|
e330ab7ad9 | ||
|
|
b296c8e95a | ||
|
|
612ec2df6b | ||
|
|
3d99d4b6c0 | ||
|
|
53d2e9ac98 | ||
|
|
861d91dbea | ||
|
|
f13a96f42c | ||
|
|
0387b0b39f | ||
|
|
5c28923c11 | ||
|
|
5eac0cdf42 | ||
|
|
55e9a9e3a0 | ||
|
|
f39045eca6 | ||
|
|
a4bd73ec7e | ||
|
|
133425f0c0 | ||
|
|
34961dd4b8 | ||
|
|
859977d591 | ||
|
|
71668637ce | ||
|
|
10f297e1e7 | ||
|
|
8dce4eb189 | ||
|
|
60694606f1 | ||
|
|
74301cdbf5 | ||
|
|
7b33a27874 | ||
|
|
10d9214980 | ||
|
|
256d800200 | ||
|
|
62de314510 | ||
|
|
7e744f86cb | ||
|
|
da9ef90277 | ||
|
|
d41841c0c9 | ||
|
|
bf6ebc86fd | ||
|
|
3ca6fea858 | ||
|
|
7373d04d54 | ||
|
|
7af9be92cc | ||
|
|
cc3b1267f5 | ||
|
|
5e49b72126 | ||
|
|
dfb82bd126 | ||
|
|
74ca4d09e7 | ||
|
|
859859aa1c | ||
|
|
896c85a4a4 | ||
|
|
94627113db | ||
|
|
8a23d08c21 | ||
|
|
9310e9f584 | ||
|
|
f0e33700fc | ||
|
|
0adec48182 | ||
|
|
1a834b4210 | ||
|
|
af53799332 | ||
|
|
c53b5788b4 | ||
|
|
34a9d1f818 | ||
|
|
4d1b572f46 | ||
|
|
f897d5241d | ||
|
|
223ef96198 | ||
|
|
89a5e27216 | ||
|
|
8e722e10ee | ||
|
|
15dfd94f3b | ||
|
|
717c53af27 | ||
|
|
e453069dcd | ||
|
|
3b9605dba5 | ||
|
|
a740159541 | ||
|
|
34d7a670d0 | ||
|
|
ec50375991 | ||
|
|
33c17a5a2b | ||
|
|
dd98385a10 | ||
|
|
72960732c2 | ||
|
|
8d99c12a7d | ||
|
|
3140ef7261 | ||
|
|
455077cd67 | ||
|
|
71a17dfcbe | ||
|
|
760cdcec68 | ||
|
|
f16cbc726e | ||
|
|
462c918251 | ||
|
|
0c60228fea | ||
|
|
bea3431ed1 | ||
|
|
28aed4d098 | ||
|
|
bdbe616408 | ||
|
|
7b99781b09 | ||
|
|
582f1f8fda | ||
|
|
e9647bb669 | ||
|
|
30a645cd82 | ||
|
|
03fece44e0 | ||
|
|
76af32d869 | ||
|
|
4d01bc6d52 | ||
|
|
f93a7b8457 | ||
|
|
f267733d9a | ||
|
|
f32711e614 | ||
|
|
8b01bc4a5c | ||
|
|
d570763c91 | ||
|
|
eeb862f3ff | ||
|
|
f7dda12b4d | ||
|
|
b6ad581b45 | ||
|
|
264ffe3fa1 | ||
|
|
8d3e574c31 | ||
|
|
2a362e7397 | ||
|
|
f04eb762c1 | ||
|
|
d5f9619984 | ||
|
|
a09c680170 | ||
|
|
9f3f085d7f | ||
|
|
d6cdfd231a | ||
|
|
cbcfbaf488 | ||
|
|
eebef296d4 | ||
|
|
9902418524 | ||
|
|
151f363552 | ||
|
|
a7409500fc | ||
|
|
bb8eda0b0f | ||
|
|
fef0c232e8 | ||
|
|
9bdcf483a7 | ||
|
|
0bbc6a5b43 | ||
|
|
a315694dd9 | ||
|
|
bf4936bbf0 | ||
|
|
6351d07da8 | ||
|
|
b80d338287 | ||
|
|
dd10f94951 | ||
|
|
9de440c803 | ||
|
|
87fea7d8ac | ||
|
|
78dfbe1622 | ||
|
|
9ddb2d8010 | ||
|
|
3151adda2a | ||
|
|
cf6fc2a6b7 | ||
|
|
3da493ff62 | ||
|
|
72d3063bef | ||
|
|
9bccc5aab2 | ||
|
|
2dc33e0ddc | ||
|
|
5f85dd7f80 | ||
|
|
56a94d3b63 | ||
|
|
448baf12a3 | ||
|
|
40c94b26dd | ||
|
|
45c0e4bb31 | ||
|
|
138e02da37 | ||
|
|
081476f3d6 | ||
|
|
7b92f1d038 | ||
|
|
2e04ddbd32 | ||
|
|
7be3848ee6 | ||
|
|
42baf5fe3f | ||
|
|
e04c435875 | ||
|
|
f9e47a9abe | ||
|
|
d19072a2d9 | ||
|
|
c0246b2e17 | ||
|
|
24d61bf193 | ||
|
|
0c7d46d637 | ||
|
|
26cee36499 | ||
|
|
ff4bc9d18e | ||
|
|
e009550312 | ||
|
|
7a716095dc | ||
|
|
82ccae1c02 | ||
|
|
b9a5ea03fa | ||
|
|
8632683990 | ||
|
|
377627c4c9 | ||
|
|
8847fbd34f | ||
|
|
f075977045 | ||
|
|
c112dd7367 | ||
|
|
4e5a68ed08 | ||
|
|
edee262ecc | ||
|
|
e0e856d2e7 | ||
|
|
4bcc086965 | ||
|
|
99384b96ba | ||
|
|
b18eed3427 | ||
|
|
2853fac3f5 | ||
|
|
2a3fa14ad7 | ||
|
|
1c42154785 | ||
|
|
dc4b974a48 | ||
|
|
00685b627f | ||
|
|
3222780707 | ||
|
|
1c0e065216 | ||
|
|
1bba4c5118 | ||
|
|
137fa19d9c | ||
|
|
3f7bde5bac | ||
|
|
c37d7dd567 | ||
|
|
3453e0c455 | ||
|
|
3e963debf8 | ||
|
|
039b8667c4 | ||
|
|
c7887296e5 | ||
|
|
281a37f6f2 | ||
|
|
d087334049 | ||
|
|
5f182cc2c2 | ||
|
|
317a273c6d | ||
|
|
e8db78eaa4 | ||
|
|
8d50032dd6 | ||
|
|
379b92cc10 | ||
|
|
154c691f47 | ||
|
|
5d848363f0 | ||
|
|
eabcfc9bcd | ||
|
|
9d2260ed3c | ||
|
|
0e31415b34 | ||
|
|
a20234d2d0 | ||
|
|
58a6bb7bd1 | ||
|
|
5b96a96535 | ||
|
|
1b6d37b8e2 | ||
|
|
d8a4f01fe9 | ||
|
|
8aa29e9a84 | ||
|
|
8e5958448b | ||
|
|
e4fff13697 | ||
|
|
cde5029bce | ||
|
|
501d8e1978 | ||
|
|
278d3c20a4 | ||
|
|
ef6468584c | ||
|
|
6e8611f301 | ||
|
|
e865989fa9 | ||
|
|
72167a697e | ||
|
|
65415b56af | ||
|
|
f4577005be | ||
|
|
0b0e0bb50e | ||
|
|
ffbebfdfe6 | ||
|
|
4d316a23ae | ||
|
|
f1c2508243 | ||
|
|
4ebe8bb1d5 | ||
|
|
6788704152 | ||
|
|
757ae4275b | ||
|
|
992a1c0a3a | ||
|
|
6dc66ce35b | ||
|
|
ddf87b32ed | ||
|
|
2122eb18fe | ||
|
|
59dee2e714 | ||
|
|
df6d27650e | ||
|
|
6abd575cbe | ||
|
|
946e370b26 | ||
|
|
c5aac34b68 | ||
|
|
0df506de54 | ||
|
|
7961479900 | ||
|
|
ca44b8da1f | ||
|
|
8287749c05 | ||
|
|
ea2fbcf0e6 | ||
|
|
297ef605ef | ||
|
|
9169f65521 | ||
|
|
8ea4ae7dbb | ||
|
|
ee747f4f42 | ||
|
|
7349f18c55 | ||
|
|
e9d246f367 | ||
|
|
7d8b6fbe20 | ||
|
|
b7d07e5acf | ||
|
|
1d7eaf0f5b | ||
|
|
54f522ac25 | ||
|
|
9137703729 | ||
|
|
f5685f1c54 | ||
|
|
cd9f86afe1 | ||
|
|
1e1a30ed7f | ||
|
|
39623ae428 | ||
|
|
6da32a27ac | ||
|
|
f3ef5d16eb | ||
|
|
251d3d54a4 | ||
|
|
a27c2f3773 | ||
|
|
c884311cf5 | ||
|
|
93109842c6 | ||
|
|
481a2c70fd | ||
|
|
507879380b | ||
|
|
fdae53a75b | ||
|
|
63e3571e8c | ||
|
|
214be01dfa | ||
|
|
ac6c40e896 | ||
|
|
a0f695f5de | ||
|
|
3d25a82372 | ||
|
|
4fa0725dc9 | ||
|
|
88b620b8c8 | ||
|
|
11329c3d16 | ||
|
|
fae59146ad | ||
|
|
1a57815769 | ||
|
|
f96804658b | ||
|
|
59f7d810c9 | ||
|
|
e1235267a0 | ||
|
|
05f109ad58 | ||
|
|
d144e39592 | ||
|
|
8faa17493b | ||
|
|
f111ddb862 | ||
|
|
072d891ed9 | ||
|
|
d7067c46d0 | ||
|
|
e8bcb17c8a | ||
|
|
acbe327fdf | ||
|
|
b007f78a2e | ||
|
|
1bd7637fe1 | ||
|
|
e0017b4d47 | ||
|
|
a19a65b3a5 | ||
|
|
4c4bcdf0b1 | ||
|
|
271743f892 | ||
|
|
be5c4818f5 | ||
|
|
ac0b7e0421 | ||
|
|
2328e4cabc | ||
|
|
38e72ea344 | ||
|
|
7ec4ccc005 | ||
|
|
b50ed5ddd1 | ||
|
|
6d0163cca9 | ||
|
|
efb81185a7 | ||
|
|
e49e4f372b | ||
|
|
f2df8ad480 | ||
|
|
7602b95bcb | ||
|
|
6411191158 | ||
|
|
c36259fca5 | ||
|
|
a4a1884237 | ||
|
|
252643d100 | ||
|
|
86f1459ca6 | ||
|
|
cc7bfaca6c | ||
|
|
0a93abfeed | ||
|
|
bd8ecfb551 | ||
|
|
835c03fb47 | ||
|
|
bc5065fd14 | ||
|
|
262c888d4b |
2
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
2
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
@@ -6,7 +6,7 @@ body:
|
||||
attributes:
|
||||
value: |
|
||||
Before filling a bug report it would be great to [upgrade](https://docs.victoriametrics.com/#how-to-upgrade)
|
||||
to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
|
||||
and verify whether the bug is reproducible there.
|
||||
It's also recommended to read the [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting.html) first.
|
||||
- type: textarea
|
||||
|
||||
2
.github/workflows/check-licenses.yml
vendored
2
.github/workflows/check-licenses.yml
vendored
@@ -17,7 +17,7 @@ jobs:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@main
|
||||
with:
|
||||
go-version: 1.20.7
|
||||
go-version: 1.21.4
|
||||
id: go
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@master
|
||||
|
||||
2
.github/workflows/codeql-analysis-js.yml
vendored
2
.github/workflows/codeql-analysis-js.yml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v2
|
||||
|
||||
4
.github/workflows/codeql-analysis.yml
vendored
4
.github/workflows/codeql-analysis.yml
vendored
@@ -52,12 +52,12 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.7
|
||||
go-version: 1.21.4
|
||||
check-latest: true
|
||||
cache: true
|
||||
if: ${{ matrix.language == 'go' }}
|
||||
|
||||
12
.github/workflows/main.yml
vendored
12
.github/workflows/main.yml
vendored
@@ -27,12 +27,12 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.7
|
||||
go-version: 1.21.4
|
||||
check-latest: true
|
||||
cache: true
|
||||
|
||||
@@ -51,12 +51,12 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.7
|
||||
go-version: 1.21.4
|
||||
check-latest: true
|
||||
cache: true
|
||||
|
||||
@@ -75,13 +75,13 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Go
|
||||
id: go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.20.7
|
||||
go-version: 1.21.4
|
||||
check-latest: true
|
||||
cache: true
|
||||
|
||||
|
||||
19
.github/workflows/sync-docs.yml
vendored
19
.github/workflows/sync-docs.yml
vendored
@@ -6,6 +6,9 @@ on:
|
||||
paths:
|
||||
- 'docs/**'
|
||||
workflow_dispatch: {}
|
||||
env:
|
||||
PAGEFIND_VERSION: "1.0.3"
|
||||
HUGO_VERSION: "latest"
|
||||
permissions:
|
||||
contents: read # This is required for actions/checkout and to commit back image update
|
||||
deployments: write
|
||||
@@ -15,16 +18,25 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: main
|
||||
- name: Checkout private code
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
repository: VictoriaMetrics/vmdocs
|
||||
token: ${{ secrets.VM_BOT_GH_TOKEN }}
|
||||
path: docs
|
||||
|
||||
- uses: peaceiris/actions-hugo@v2
|
||||
with:
|
||||
hugo-version: ${{env.HUGO_VERSION}}
|
||||
extended: true
|
||||
- name: Install PageFind #install the static search engine for index build
|
||||
uses: supplypike/setup-bin@v3
|
||||
with:
|
||||
uri: "https://github.com/CloudCannon/pagefind/releases/download/v${{env.PAGEFIND_VERSION}}/pagefind-v${{env.PAGEFIND_VERSION}}-x86_64-unknown-linux-musl.tar.gz"
|
||||
name: "pagefind"
|
||||
version: ${{env.PAGEFIND_VERSION}}
|
||||
- name: Import GPG key
|
||||
uses: crazy-max/ghaction-import-gpg@v5
|
||||
with:
|
||||
@@ -45,6 +57,7 @@ jobs:
|
||||
rm -rf content
|
||||
cp -r ../main/docs content
|
||||
make clean-after-copy
|
||||
make build-search-index
|
||||
git config --global user.name "${{ steps.import-gpg.outputs.email }}"
|
||||
git config --global user.email "${{ steps.import-gpg.outputs.email }}"
|
||||
git add .
|
||||
|
||||
80
.github/workflows/update-sandbox.yml
vendored
80
.github/workflows/update-sandbox.yml
vendored
@@ -1,80 +0,0 @@
|
||||
name: sandbox-release
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
permissions:
|
||||
contents: write
|
||||
jobs:
|
||||
deploy-sandbox:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: check inputs
|
||||
if: github.event.release.tag_name == ''
|
||||
run: exit 1
|
||||
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
repository: VictoriaMetrics/ops
|
||||
token: ${{ secrets.VM_BOT_GH_TOKEN }}
|
||||
|
||||
- name: Import GPG key
|
||||
id: import-gpg
|
||||
uses: crazy-max/ghaction-import-gpg@v5
|
||||
with:
|
||||
gpg_private_key: ${{ secrets.VM_BOT_GPG_PRIVATE_KEY }}
|
||||
passphrase: ${{ secrets.VM_BOT_PASSPHRASE }}
|
||||
git_user_signingkey: true
|
||||
git_commit_gpgsign: true
|
||||
|
||||
- name: update image tag
|
||||
uses: fjogeleit/yaml-update-action@main
|
||||
with:
|
||||
valueFile: 'gcp-test/sandbox/manifests/benchmark-vm/vmcluster.yaml'
|
||||
commitChange: false
|
||||
createPR: false
|
||||
changes: |
|
||||
{
|
||||
"gcp-test/sandbox/manifests/benchmark-vm/vmcluster.yaml": {
|
||||
"spec.vminsert.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
|
||||
"spec.vmselect.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
|
||||
"spec.vmstorage.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster"
|
||||
},
|
||||
"gcp-test/sandbox/manifests/benchmark-vm/vmsingle.yaml": {
|
||||
"spec.image.tag": "${{ github.event.release.tag_name }}-enterprise"
|
||||
},
|
||||
"gcp-test/sandbox/manifests/monitoring/monitoring-vmagent.yaml": {
|
||||
"spec.image.tag": "${{ github.event.release.tag_name }}"
|
||||
},
|
||||
"gcp-test/sandbox/manifests/monitoring/monitoring-vmcluster.yaml": {
|
||||
"spec.vminsert.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
|
||||
"spec.vmselect.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster",
|
||||
"spec.vmstorage.image.tag": "${{ github.event.release.tag_name }}-enterprise-cluster"
|
||||
},
|
||||
"gcp-test/sandbox/manifests/monitoring/vmalert.yaml": {
|
||||
"spec.image.tag": "${{ github.event.release.tag_name }}-enterprise"
|
||||
}
|
||||
}
|
||||
|
||||
- name: commit changes
|
||||
run: |
|
||||
git config --global user.name "${{ steps.import-gpg.outputs.email }}"
|
||||
git config --global user.email "${{ steps.import-gpg.outputs.email }}"
|
||||
git add .
|
||||
git commit -S -m "Deploy image tag ${RELEASE_TAG} to sandbox"
|
||||
env:
|
||||
RELEASE_TAG: ${{ github.event.release.tag_name }}
|
||||
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v5
|
||||
with:
|
||||
author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>
|
||||
branch: release-automation
|
||||
token: ${{ secrets.VM_BOT_GH_TOKEN }}
|
||||
delete-branch: true
|
||||
title: "release ${{ github.event.release.tag_name }}"
|
||||
body: |
|
||||
Release [${{ github.event.release.tag_name }}](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/${{ github.event.release.tag_name }}) to sandbox
|
||||
|
||||
> Auto-generated by `Github Actions Bot`
|
||||
|
||||
34
Makefile
34
Makefile
@@ -16,6 +16,7 @@ GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(DATEINFO_TA
|
||||
|
||||
include app/*/Makefile
|
||||
include deployment/*/Makefile
|
||||
include dashboards/Makefile
|
||||
include snap/local/Makefile
|
||||
include package/release/Makefile
|
||||
|
||||
@@ -24,6 +25,7 @@ all: \
|
||||
victoria-logs-prod \
|
||||
vmagent-prod \
|
||||
vmalert-prod \
|
||||
vmalert-tool-prod \
|
||||
vmauth-prod \
|
||||
vmbackup-prod \
|
||||
vmrestore-prod \
|
||||
@@ -32,11 +34,11 @@ all: \
|
||||
clean:
|
||||
rm -rf bin/*
|
||||
|
||||
publish: package-base \
|
||||
publish: \
|
||||
publish-victoria-metrics \
|
||||
publish-victoria-logs \
|
||||
publish-vmagent \
|
||||
publish-vmalert \
|
||||
publish-vmalert-tool \
|
||||
publish-vmauth \
|
||||
publish-vmbackup \
|
||||
publish-vmrestore \
|
||||
@@ -47,6 +49,7 @@ package: \
|
||||
package-victoria-logs \
|
||||
package-vmagent \
|
||||
package-vmalert \
|
||||
package-vmalert-tool \
|
||||
package-vmauth \
|
||||
package-vmbackup \
|
||||
package-vmrestore \
|
||||
@@ -55,6 +58,7 @@ package: \
|
||||
vmutils: \
|
||||
vmagent \
|
||||
vmalert \
|
||||
vmalert-tool \
|
||||
vmauth \
|
||||
vmbackup \
|
||||
vmrestore \
|
||||
@@ -63,6 +67,7 @@ vmutils: \
|
||||
vmutils-pure: \
|
||||
vmagent-pure \
|
||||
vmalert-pure \
|
||||
vmalert-tool-pure \
|
||||
vmauth-pure \
|
||||
vmbackup-pure \
|
||||
vmrestore-pure \
|
||||
@@ -71,6 +76,7 @@ vmutils-pure: \
|
||||
vmutils-linux-amd64: \
|
||||
vmagent-linux-amd64 \
|
||||
vmalert-linux-amd64 \
|
||||
vmalert-tool-linux-amd64 \
|
||||
vmauth-linux-amd64 \
|
||||
vmbackup-linux-amd64 \
|
||||
vmrestore-linux-amd64 \
|
||||
@@ -79,6 +85,7 @@ vmutils-linux-amd64: \
|
||||
vmutils-linux-arm64: \
|
||||
vmagent-linux-arm64 \
|
||||
vmalert-linux-arm64 \
|
||||
vmalert-tool-linux-arm64 \
|
||||
vmauth-linux-arm64 \
|
||||
vmbackup-linux-arm64 \
|
||||
vmrestore-linux-arm64 \
|
||||
@@ -87,6 +94,7 @@ vmutils-linux-arm64: \
|
||||
vmutils-linux-arm: \
|
||||
vmagent-linux-arm \
|
||||
vmalert-linux-arm \
|
||||
vmalert-tool-linux-arm \
|
||||
vmauth-linux-arm \
|
||||
vmbackup-linux-arm \
|
||||
vmrestore-linux-arm \
|
||||
@@ -95,6 +103,7 @@ vmutils-linux-arm: \
|
||||
vmutils-linux-386: \
|
||||
vmagent-linux-386 \
|
||||
vmalert-linux-386 \
|
||||
vmalert-tool-linux-386 \
|
||||
vmauth-linux-386 \
|
||||
vmbackup-linux-386 \
|
||||
vmrestore-linux-386 \
|
||||
@@ -103,6 +112,7 @@ vmutils-linux-386: \
|
||||
vmutils-linux-ppc64le: \
|
||||
vmagent-linux-ppc64le \
|
||||
vmalert-linux-ppc64le \
|
||||
vmalert-tool-linux-ppc64le \
|
||||
vmauth-linux-ppc64le \
|
||||
vmbackup-linux-ppc64le \
|
||||
vmrestore-linux-ppc64le \
|
||||
@@ -111,6 +121,7 @@ vmutils-linux-ppc64le: \
|
||||
vmutils-darwin-amd64: \
|
||||
vmagent-darwin-amd64 \
|
||||
vmalert-darwin-amd64 \
|
||||
vmalert-tool-darwin-amd64 \
|
||||
vmauth-darwin-amd64 \
|
||||
vmbackup-darwin-amd64 \
|
||||
vmrestore-darwin-amd64 \
|
||||
@@ -119,6 +130,7 @@ vmutils-darwin-amd64: \
|
||||
vmutils-darwin-arm64: \
|
||||
vmagent-darwin-arm64 \
|
||||
vmalert-darwin-arm64 \
|
||||
vmalert-tool-darwin-arm64 \
|
||||
vmauth-darwin-arm64 \
|
||||
vmbackup-darwin-arm64 \
|
||||
vmrestore-darwin-arm64 \
|
||||
@@ -127,6 +139,7 @@ vmutils-darwin-arm64: \
|
||||
vmutils-freebsd-amd64: \
|
||||
vmagent-freebsd-amd64 \
|
||||
vmalert-freebsd-amd64 \
|
||||
vmalert-tool-freebsd-amd64 \
|
||||
vmauth-freebsd-amd64 \
|
||||
vmbackup-freebsd-amd64 \
|
||||
vmrestore-freebsd-amd64 \
|
||||
@@ -135,6 +148,7 @@ vmutils-freebsd-amd64: \
|
||||
vmutils-openbsd-amd64: \
|
||||
vmagent-openbsd-amd64 \
|
||||
vmalert-openbsd-amd64 \
|
||||
vmalert-tool-openbsd-amd64 \
|
||||
vmauth-openbsd-amd64 \
|
||||
vmbackup-openbsd-amd64 \
|
||||
vmrestore-openbsd-amd64 \
|
||||
@@ -143,6 +157,7 @@ vmutils-openbsd-amd64: \
|
||||
vmutils-windows-amd64: \
|
||||
vmagent-windows-amd64 \
|
||||
vmalert-windows-amd64 \
|
||||
vmalert-tool-windows-amd64 \
|
||||
vmauth-windows-amd64 \
|
||||
vmbackup-windows-amd64 \
|
||||
vmrestore-windows-amd64 \
|
||||
@@ -174,6 +189,7 @@ vmutils-crossbuild: \
|
||||
vmutils-windows-amd64
|
||||
|
||||
publish-release:
|
||||
rm -rf bin/*
|
||||
git checkout $(TAG) && LATEST_TAG=stable $(MAKE) release publish && \
|
||||
git checkout $(TAG)-cluster && LATEST_TAG=cluster-stable $(MAKE) release publish && \
|
||||
git checkout $(TAG)-enterprise && LATEST_TAG=enterprise-stable $(MAKE) release publish && \
|
||||
@@ -181,7 +197,6 @@ publish-release:
|
||||
|
||||
release: \
|
||||
release-victoria-metrics \
|
||||
release-victoria-logs \
|
||||
release-vmutils
|
||||
|
||||
release-victoria-metrics: \
|
||||
@@ -339,6 +354,7 @@ release-vmutils-windows-amd64:
|
||||
release-vmutils-goos-goarch: \
|
||||
vmagent-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
|
||||
vmauth-$(GOOS)-$(GOARCH)-prod \
|
||||
vmbackup-$(GOOS)-$(GOARCH)-prod \
|
||||
vmrestore-$(GOOS)-$(GOARCH)-prod \
|
||||
@@ -347,6 +363,7 @@ release-vmutils-goos-goarch: \
|
||||
tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
vmagent-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
|
||||
vmauth-$(GOOS)-$(GOARCH)-prod \
|
||||
vmbackup-$(GOOS)-$(GOARCH)-prod \
|
||||
vmrestore-$(GOOS)-$(GOARCH)-prod \
|
||||
@@ -354,6 +371,7 @@ release-vmutils-goos-goarch: \
|
||||
&& sha256sum vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
vmagent-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
|
||||
vmauth-$(GOOS)-$(GOARCH)-prod \
|
||||
vmbackup-$(GOOS)-$(GOARCH)-prod \
|
||||
vmrestore-$(GOOS)-$(GOARCH)-prod \
|
||||
@@ -362,6 +380,7 @@ release-vmutils-goos-goarch: \
|
||||
cd bin && rm -rf \
|
||||
vmagent-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
|
||||
vmauth-$(GOOS)-$(GOARCH)-prod \
|
||||
vmbackup-$(GOOS)-$(GOARCH)-prod \
|
||||
vmrestore-$(GOOS)-$(GOARCH)-prod \
|
||||
@@ -370,6 +389,7 @@ release-vmutils-goos-goarch: \
|
||||
release-vmutils-windows-goarch: \
|
||||
vmagent-windows-$(GOARCH)-prod \
|
||||
vmalert-windows-$(GOARCH)-prod \
|
||||
vmalert-tool-windows-$(GOARCH)-prod \
|
||||
vmauth-windows-$(GOARCH)-prod \
|
||||
vmbackup-windows-$(GOARCH)-prod \
|
||||
vmrestore-windows-$(GOARCH)-prod \
|
||||
@@ -378,6 +398,7 @@ release-vmutils-windows-goarch: \
|
||||
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
|
||||
vmagent-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-tool-windows-$(GOARCH)-prod.exe \
|
||||
vmauth-windows-$(GOARCH)-prod.exe \
|
||||
vmbackup-windows-$(GOARCH)-prod.exe \
|
||||
vmrestore-windows-$(GOARCH)-prod.exe \
|
||||
@@ -385,6 +406,7 @@ release-vmutils-windows-goarch: \
|
||||
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
|
||||
vmagent-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-tool-windows-$(GOARCH)-prod.exe \
|
||||
vmauth-windows-$(GOARCH)-prod.exe \
|
||||
vmbackup-windows-$(GOARCH)-prod.exe \
|
||||
vmrestore-windows-$(GOARCH)-prod.exe \
|
||||
@@ -393,6 +415,7 @@ release-vmutils-windows-goarch: \
|
||||
cd bin && rm -rf \
|
||||
vmagent-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-tool-windows-$(GOARCH)-prod.exe \
|
||||
vmauth-windows-$(GOARCH)-prod.exe \
|
||||
vmbackup-windows-$(GOARCH)-prod.exe \
|
||||
vmrestore-windows-$(GOARCH)-prod.exe \
|
||||
@@ -437,7 +460,7 @@ benchmark-pure:
|
||||
vendor-update:
|
||||
go get -u -d ./lib/...
|
||||
go get -u -d ./app/...
|
||||
go mod tidy -compat=1.19
|
||||
go mod tidy -compat=1.20
|
||||
go mod vendor
|
||||
|
||||
app-local:
|
||||
@@ -463,7 +486,7 @@ golangci-lint: install-golangci-lint
|
||||
golangci-lint run
|
||||
|
||||
install-golangci-lint:
|
||||
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.51.2
|
||||
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.55.1
|
||||
|
||||
govulncheck: install-govulncheck
|
||||
govulncheck ./...
|
||||
@@ -514,3 +537,4 @@ docs-sync:
|
||||
SRC=app/vmctl/README.md DST=docs/vmctl.md OLD_URL='/vmctl.html' ORDER=8 TITLE=vmctl $(MAKE) copy-docs
|
||||
SRC=app/vmgateway/README.md DST=docs/vmgateway.md OLD_URL='/vmgateway.html' ORDER=9 TITLE=vmgateway $(MAKE) copy-docs
|
||||
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md OLD_URL='/vmbackupmanager.html' ORDER=10 TITLE=vmbackupmanager $(MAKE) copy-docs
|
||||
SRC=app/vmalert-tool/README.md DST=docs/vmalert-tool.md OLD_URL='' ORDER=12 TITLE=vmalert-tool $(MAKE) copy-docs
|
||||
|
||||
467
README.md
467
README.md
@@ -13,7 +13,7 @@
|
||||
|
||||
VictoriaMetrics is a fast, cost-effective and scalable monitoring solution and time series database.
|
||||
|
||||
VictoriaMetrics is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
VictoriaMetrics is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest),
|
||||
[Docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/), [Snap packages](https://snapcraft.io/victoriametrics)
|
||||
and [source code](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
|
||||
@@ -29,7 +29,8 @@ If you have questions about VictoriaMetrics, then feel free asking them at [Vict
|
||||
[Contact us](mailto:info@victoriametrics.com) if you need enterprise support for VictoriaMetrics.
|
||||
See [features available in enterprise package](https://docs.victoriametrics.com/enterprise.html).
|
||||
Enterprise binaries can be downloaded and evaluated for free
|
||||
from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
|
||||
from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
|
||||
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
|
||||
|
||||
VictoriaMetrics is developed at a fast pace, so it is recommended periodically checking the [CHANGELOG](https://docs.victoriametrics.com/CHANGELOG.html) and performing [regular upgrades](#how-to-upgrade-victoriametrics).
|
||||
|
||||
@@ -86,6 +87,7 @@ VictoriaMetrics has the following prominent features:
|
||||
* [Arbitrary CSV data](#how-to-import-csv-data).
|
||||
* [Native binary format](#how-to-import-data-in-native-format).
|
||||
* [DataDog agent or DogStatsD](#how-to-send-data-from-datadog-agent).
|
||||
* [NewRelic infrastructure agent](#how-to-send-data-from-newrelic-agent).
|
||||
* [OpenTelemetry metrics format](#sending-data-via-opentelemetry).
|
||||
* It supports powerful [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html), which can be used as a [statsd](https://github.com/statsd/statsd) alternative.
|
||||
* It supports metrics [relabeling](#relabeling).
|
||||
@@ -110,6 +112,7 @@ Case studies:
|
||||
* [Brandwatch](https://docs.victoriametrics.com/CaseStudies.html#brandwatch)
|
||||
* [CERN](https://docs.victoriametrics.com/CaseStudies.html#cern)
|
||||
* [COLOPL](https://docs.victoriametrics.com/CaseStudies.html#colopl)
|
||||
* [Criteo](https://docs.victoriametrics.com/CaseStudies.html#criteo)
|
||||
* [Dig Security](https://docs.victoriametrics.com/CaseStudies.html#dig-security)
|
||||
* [Fly.io](https://docs.victoriametrics.com/CaseStudies.html#flyio)
|
||||
* [German Research Center for Artificial Intelligence](https://docs.victoriametrics.com/CaseStudies.html#german-research-center-for-artificial-intelligence)
|
||||
@@ -135,7 +138,8 @@ See also [articles and slides about VictoriaMetrics from our users](https://docs
|
||||
|
||||
### Install
|
||||
|
||||
To quickly try VictoriaMetrics, just download [VictoriaMetrics executable](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or [Docker image](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and start it with the desired command-line flags.
|
||||
To quickly try VictoriaMetrics, just download [VictoriaMetrics executable](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
|
||||
or [Docker image](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and start it with the desired command-line flags.
|
||||
See also [QuickStart guide](https://docs.victoriametrics.com/Quick-Start.html) for additional information.
|
||||
|
||||
VictoriaMetrics can also be installed via these installation methods:
|
||||
@@ -152,7 +156,7 @@ VictoriaMetrics can also be installed via these installation methods:
|
||||
The following command-line flags are used the most:
|
||||
|
||||
* `-storageDataPath` - VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in the current working directory.
|
||||
* `-retentionPeriod` - retention for stored data. Older data is automatically deleted. Default retention is 1 month. The minimum retention period is 24h or 1d. See [the Retention section](#retention) for more details.
|
||||
* `-retentionPeriod` - retention for stored data. Older data is automatically deleted. Default retention is 1 month (31 days). The minimum retention period is 24h or 1d. See [these docs](#retention) for more details.
|
||||
|
||||
Other flags have good enough default values, so set them only if you really need this. Pass `-help` to see [all the available flags with description and default values](#list-of-command-line-flags).
|
||||
|
||||
@@ -173,7 +177,8 @@ VictoriaMetrics is developed at a fast pace, so it is recommended periodically c
|
||||
|
||||
### Environment variables
|
||||
|
||||
All the VictoriaMetrics components allow referring environment variables in command-line flags via `%{ENV_VAR}` syntax.
|
||||
All the VictoriaMetrics components allow referring environment variables in `yaml` configuration files (such as `-promscrape.config`)
|
||||
and in command-line flags via `%{ENV_VAR}` syntax.
|
||||
For example, `-metricsAuthKey=%{METRICS_AUTH_KEY}` is automatically expanded to `-metricsAuthKey=top-secret`
|
||||
if `METRICS_AUTH_KEY=top-secret` environment variable exists at VictoriaMetrics startup.
|
||||
This expansion is performed by VictoriaMetrics itself.
|
||||
@@ -209,6 +214,61 @@ vi $SNAP_DATA/var/snap/victoriametrics/current/etc/victoriametrics-scrape-config
|
||||
|
||||
After changes were made, trigger config re-read with the command `curl 127.0.0.1:8428/-/reload`.
|
||||
|
||||
### Running as Windows service
|
||||
|
||||
In order to run VictoriaMetrics as a Windows service it is required to create a service configuration for [WinSW](https://github.com/winsw/winsw)
|
||||
and then install it as a service according to the following guide:
|
||||
|
||||
1. Create a service configuration:
|
||||
|
||||
```xml
|
||||
<service>
|
||||
<id>VictoriaMetrics</id>
|
||||
<name>VictoriaMetrics</name>
|
||||
<description>VictoriaMetrics</description>
|
||||
<executable>%BASE%\victoria-metrics-windows-amd64-prod.exe"</executable>
|
||||
|
||||
<onfailure action="restart" delay="10 sec"/>
|
||||
<onfailure action="restart" delay="20 sec"/>
|
||||
|
||||
<resetfailure>1 hour</resetfailure>
|
||||
|
||||
<arguments>-envflag.enable</arguments>
|
||||
|
||||
<priority>Normal</priority>
|
||||
|
||||
<stoptimeout>15 sec</stoptimeout>
|
||||
|
||||
<stopparentprocessfirst>true</stopparentprocessfirst>
|
||||
<startmode>Automatic</startmode>
|
||||
<waithint>15 sec</waithint>
|
||||
<sleeptime>1 sec</sleeptime>
|
||||
|
||||
<logpath>%BASE%\logs</logpath>
|
||||
<log mode="roll">
|
||||
<sizeThreshold>10240</sizeThreshold>
|
||||
<keepFiles>8</keepFiles>
|
||||
</log>
|
||||
|
||||
<env name="loggerFormat" value="json" />
|
||||
<env name="loggerOutput" value="stderr" />
|
||||
<env name="promscrape_config" value="C:\Program Files\victoria-metrics\promscrape.yml" />
|
||||
|
||||
</service>
|
||||
```
|
||||
|
||||
1. Install WinSW by following this [documentation](https://github.com/winsw/winsw#download).
|
||||
|
||||
1. Install VictoriaMetrics as a service by running the following from elevated PowerShell:
|
||||
|
||||
```console
|
||||
winsw install VictoriaMetrics.xml
|
||||
Get-Service VictoriaMetrics | Start-Service
|
||||
```
|
||||
|
||||
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3781) for more details.
|
||||
|
||||
|
||||
## Prometheus setup
|
||||
|
||||
Add the following lines to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`) in order to send data to VictoriaMetrics:
|
||||
@@ -270,7 +330,7 @@ too high memory consumption of Prometheus, then try to lower `max_samples_per_se
|
||||
Keep in mind that these two params are tightly connected.
|
||||
Read more about tuning remote write for Prometheus [here](https://prometheus.io/docs/practices/remote_write).
|
||||
|
||||
It is recommended upgrading Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases) or newer,
|
||||
It is recommended upgrading Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases/latest) or newer,
|
||||
since previous versions may have issues with `remote_write`.
|
||||
|
||||
Take a look also at [vmagent](https://docs.victoriametrics.com/vmagent.html)
|
||||
@@ -279,7 +339,8 @@ which can be used as faster and less resource-hungry alternative to Prometheus.
|
||||
|
||||
## Grafana setup
|
||||
|
||||
Create [Prometheus datasource](http://docs.grafana.org/features/datasources/prometheus/) in Grafana with the following url:
|
||||
Create [Prometheus datasource](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/)
|
||||
in Grafana with the following url:
|
||||
|
||||
```url
|
||||
http://<victoriametrics-addr>:8428
|
||||
@@ -293,13 +354,18 @@ or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html).
|
||||
Alternatively, use VictoriaMetrics [datasource plugin](https://github.com/VictoriaMetrics/grafana-datasource) with support of extra features.
|
||||
See more in [description](https://github.com/VictoriaMetrics/grafana-datasource#victoriametrics-data-source-for-grafana).
|
||||
|
||||
Creating a datasource may require [specific permissions](https://grafana.com/docs/grafana/latest/administration/data-source-management/).
|
||||
If you don't see an option to create a data source - try contacting system administrator.
|
||||
|
||||
## How to upgrade VictoriaMetrics
|
||||
|
||||
VictoriaMetrics is developed at a fast pace, so it is recommended periodically checking [the CHANGELOG page](https://docs.victoriametrics.com/CHANGELOG.html) and performing regular upgrades.
|
||||
|
||||
It is safe upgrading VictoriaMetrics to new versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise. It is safe skipping multiple versions during the upgrade unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise. It is recommended performing regular upgrades to the latest version, since it may contain important bug fixes, performance optimizations or new features.
|
||||
It is safe upgrading VictoriaMetrics to new versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
|
||||
It is safe skipping multiple versions during the upgrade unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
|
||||
It is recommended performing regular upgrades to the latest version, since it may contain important bug fixes, performance optimizations or new features.
|
||||
|
||||
It is also safe downgrading to older versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) say otherwise.
|
||||
It is also safe downgrading to older versions unless [release notes](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) say otherwise.
|
||||
|
||||
The following steps must be performed during the upgrade / downgrade procedure:
|
||||
|
||||
@@ -363,6 +429,8 @@ See the [example VMUI at VictoriaMetrics playground](https://play.victoriametric
|
||||
* queries with the biggest average execution duration;
|
||||
* queries that took the most summary time for execution.
|
||||
|
||||
This information is obtained from the `/api/v1/status/top_queries` HTTP endpoint.
|
||||
|
||||
## Active queries
|
||||
|
||||
[VMUI](#vmui) provides `active queries` tab, which shows currently execute queries.
|
||||
@@ -372,12 +440,14 @@ It provides the following information per each query:
|
||||
- The duration of the query execution.
|
||||
- The client address, who initiated the query execution.
|
||||
|
||||
This information is obtained from the `/api/v1/status/active_queries` HTTP endpoint.
|
||||
|
||||
## Metrics explorer
|
||||
|
||||
[VMUI](#vmui) provides an ability to explore metrics exported by a particular `job` / `instance` in the following way:
|
||||
|
||||
1. Open the `vmui` at `http://victoriametrics:8428/vmui/`.
|
||||
1. Click the `Explore metrics` tab.
|
||||
1. Click the `Explore Prometheus metrics` tab.
|
||||
1. Select the `job` you want to explore.
|
||||
1. Optionally select the `instance` for the selected job to explore.
|
||||
1. Select metrics you want to explore and compare.
|
||||
@@ -403,14 +473,16 @@ matching the specified [series selector](https://prometheus.io/docs/prometheus/l
|
||||
|
||||
Cardinality explorer is built on top of [/api/v1/status/tsdb](#tsdb-stats).
|
||||
|
||||
See [cardinality explorer playground](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/#/cardinality).
|
||||
See the example of using the cardinality explorer [here](https://victoriametrics.com/blog/cardinality-explorer/).
|
||||
|
||||
## Cardinality explorer statistic inaccuracy
|
||||
|
||||
In [cluster version of VictoriaMetrics](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) each vmstorage tracks the stored time series individually.
|
||||
vmselect requests stats via [/api/v1/status/tsdb](#tsdb-stats) API from each vmstorage node and merges the results by summing per-series stats.
|
||||
This may lead to inflated values when samples for the same time series are spread across multiple vmstorage nodes
|
||||
due to [replication](#replication) or [rerouting](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html?highlight=re-routes#cluster-availability).
|
||||
|
||||
See [cardinality explorer playground](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/#/cardinality).
|
||||
See the example of using the cardinality explorer [here](https://victoriametrics.com/blog/cardinality-explorer/).
|
||||
|
||||
## How to apply new config to VictoriaMetrics
|
||||
|
||||
VictoriaMetrics is configured via command-line flags, so it must be restarted when new command-line flags should be applied:
|
||||
@@ -472,7 +544,7 @@ dd_url: http://victoriametrics:8428/datadog
|
||||
</div>
|
||||
|
||||
vmagent also can accept Datadog metrics format. Depending on where vmagent will forward data,
|
||||
pick [single-node or cluster URL]((https://docs.victoriametrics.com/url-examples.html#datadog)) formats.
|
||||
pick [single-node or cluster URL](https://docs.victoriametrics.com/url-examples.html#datadog) formats.
|
||||
|
||||
### Sending metrics to Datadog and VictoriaMetrics
|
||||
|
||||
@@ -615,6 +687,28 @@ Some plugins for Telegraf such as [fluentd](https://github.com/fangli/fluent-plu
|
||||
or [Juniper/jitmon](https://github.com/Juniper/jtimon) send `SHOW DATABASES` query to `/query` and expect a particular database name in the response.
|
||||
Comma-separated list of expected databases can be passed to VictoriaMetrics via `-influx.databaseNames` command-line flag.
|
||||
|
||||
### How to send data in InfluxDB v2 format
|
||||
|
||||
VictoriaMetrics exposes endpoint for InfluxDB v2 HTTP API at `/influx/api/v2/write` and `/api/v2/write`.
|
||||
|
||||
|
||||
In order to write data with InfluxDB line protocol to local VictoriaMetrics using `curl`:
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```console
|
||||
curl -d 'measurement,tag1=value1,tag2=value2 field1=123,field2=1.23' -X POST 'http://localhost:8428/api/v2/write'
|
||||
```
|
||||
|
||||
</div>
|
||||
|
||||
The `/api/v1/export` endpoint should return the following response:
|
||||
|
||||
```json
|
||||
{"metric":{"__name__":"measurement_field1","tag1":"value1","tag2":"value2"},"values":[123],"timestamps":[1695902762311]}
|
||||
{"metric":{"__name__":"measurement_field2","tag1":"value1","tag2":"value2"},"values":[1.23],"timestamps":[1695902762311]}
|
||||
```
|
||||
|
||||
## How to send data from Graphite-compatible agents such as [StatsD](https://github.com/etsy/statsd)
|
||||
|
||||
Enable Graphite receiver in VictoriaMetrics by setting `-graphiteListenAddr` command line flag. For instance,
|
||||
@@ -765,6 +859,79 @@ The `/api/v1/export` endpoint should return the following response:
|
||||
Extra labels may be added to all the imported time series by passing `extra_label=name=value` query args.
|
||||
For example, `/api/put?extra_label=foo=bar` would add `{foo="bar"}` label to all the ingested metrics.
|
||||
|
||||
## How to send data from NewRelic agent
|
||||
|
||||
VictoriaMetrics accepts data from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent)
|
||||
at `/newrelic/infra/v2/metrics/events/bulk` HTTP path.
|
||||
VictoriaMetrics receives [Events](https://docs.newrelic.com/docs/infrastructure/manage-your-data/data-instrumentation/default-infrastructure-monitoring-data/#infrastructure-events)
|
||||
from NewRelic agent at the given path, transforms them to [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples)
|
||||
according to [these docs](#newrelic-agent-data-mapping) before storing the raw samples to the database.
|
||||
|
||||
You need passing `COLLECTOR_URL` and `NRIA_LICENSE_KEY` environment variables to NewRelic infrastructure agent in order to send the collected metrics to VictoriaMetrics.
|
||||
The `COLLECTOR_URL` must point to `/newrelic` HTTP endpoint at VictoriaMetrics, while the `NRIA_LICENSE_KEY` must contain NewRelic license key,
|
||||
which can be obtained [here](https://newrelic.com/signup).
|
||||
For example, if VictoriaMetrics runs at `localhost:8428`, then the following command can be used for running NewRelic infrastructure agent:
|
||||
|
||||
```console
|
||||
COLLECTOR_URL="http://localhost:8428/newrelic" NRIA_LICENSE_KEY="NEWRELIC_LICENSE_KEY" ./newrelic-infra
|
||||
```
|
||||
|
||||
### NewRelic agent data mapping
|
||||
|
||||
VictoriaMetrics maps [NewRelic Events](https://docs.newrelic.com/docs/infrastructure/manage-your-data/data-instrumentation/default-infrastructure-monitoring-data/#infrastructure-events)
|
||||
to [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) in the following way:
|
||||
|
||||
1. Every numeric field is converted into a raw sample with the corresponding name.
|
||||
1. The `eventType` and all the other fields with `string` value type are attached to every raw sample as [metric labels](https://docs.victoriametrics.com/keyConcepts.html#labels).
|
||||
1. The `timestamp` field is used as timestamp for the ingested [raw sample](https://docs.victoriametrics.com/keyConcepts.html#raw-samples).
|
||||
The `timestamp` field may be specified either in seconds or in milliseconds since the [Unix Epoch](https://en.wikipedia.org/wiki/Unix_time).
|
||||
If the `timestamp` field is missing, then the raw sample is stored with the current timestamp.
|
||||
|
||||
For example, let's import the following NewRelic Events request to VictoriaMetrics:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"Events":[
|
||||
{
|
||||
"eventType":"SystemSample",
|
||||
"entityKey":"macbook-pro.local",
|
||||
"cpuPercent":25.056660790748904,
|
||||
"cpuUserPercent":8.687987912389374,
|
||||
"cpuSystemPercent":16.36867287835953,
|
||||
"cpuIOWaitPercent":0,
|
||||
"cpuIdlePercent":74.94333920925109,
|
||||
"cpuStealPercent":0,
|
||||
"loadAverageOneMinute":5.42333984375,
|
||||
"loadAverageFiveMinute":4.099609375,
|
||||
"loadAverageFifteenMinute":3.58203125
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Save this JSON into `newrelic.json` file and then use the following command in order to import it into VictoriaMetrics:
|
||||
|
||||
```console
|
||||
curl -X POST -H 'Content-Type: application/json' --data-binary @newrelic.json http://localhost:8428/newrelic/infra/v2/metrics/events/bulk
|
||||
```
|
||||
|
||||
Let's fetch the ingested data via [data export API](#how-to-export-data-in-json-line-format):
|
||||
|
||||
```console
|
||||
curl http://localhost:8428/api/v1/export -d 'match={eventType="SystemSample"}'
|
||||
{"metric":{"__name__":"cpuStealPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[0],"timestamps":[1697407970000]}
|
||||
{"metric":{"__name__":"loadAverageFiveMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[4.099609375],"timestamps":[1697407970000]}
|
||||
{"metric":{"__name__":"cpuIOWaitPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[0],"timestamps":[1697407970000]}
|
||||
{"metric":{"__name__":"cpuSystemPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[16.368672878359],"timestamps":[1697407970000]}
|
||||
{"metric":{"__name__":"loadAverageOneMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[5.42333984375],"timestamps":[1697407970000]}
|
||||
{"metric":{"__name__":"cpuUserPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[8.687987912389],"timestamps":[1697407970000]}
|
||||
{"metric":{"__name__":"cpuIdlePercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[74.9433392092],"timestamps":[1697407970000]}
|
||||
{"metric":{"__name__":"loadAverageFifteenMinute","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[3.58203125],"timestamps":[1697407970000]}
|
||||
{"metric":{"__name__":"cpuPercent","entityKey":"macbook-pro.local","eventType":"SystemSample"},"values":[25.056660790748],"timestamps":[1697407970000]}
|
||||
```
|
||||
|
||||
## Prometheus querying API usage
|
||||
|
||||
VictoriaMetrics supports the following handlers from [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/):
|
||||
@@ -829,7 +996,7 @@ Additionally, VictoriaMetrics provides the following handlers:
|
||||
* `/api/v1/series/count` - returns the total number of time series in the database. Some notes:
|
||||
* the handler scans all the inverted index, so it can be slow if the database contains tens of millions of time series;
|
||||
* the handler may count [deleted time series](#how-to-delete-time-series) additionally to normal time series due to internal implementation restrictions;
|
||||
* `/api/v1/status/active_queries` - returns a list of currently running queries.
|
||||
* `/api/v1/status/active_queries` - returns the list of currently running queries. This list is also available at [`active queries` page at VMUI](#active-queries).
|
||||
* `/api/v1/status/top_queries` - returns the following query lists:
|
||||
* the most frequently executed queries - `topByCount`
|
||||
* queries with the biggest average execution duration - `topByAvgDuration`
|
||||
@@ -839,6 +1006,8 @@ Additionally, VictoriaMetrics provides the following handlers:
|
||||
For example, request to `/api/v1/status/top_queries?topN=5&maxLifetime=30s` would return up to 5 queries per list, which were executed during the last 30 seconds.
|
||||
VictoriaMetrics tracks the last `-search.queryStats.lastQueriesCount` queries with durations at least `-search.queryStats.minQueryDuration`.
|
||||
|
||||
See also [`top queries` page at VMUI](#top-queries).
|
||||
|
||||
### Timestamp formats
|
||||
|
||||
VictoriaMetrics accepts the following formats for `time`, `start` and `end` query args
|
||||
@@ -907,14 +1076,14 @@ VictoriaMetrics supports the following handlers from [Graphite Tags API](https:/
|
||||
|
||||
## How to build from sources
|
||||
|
||||
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or
|
||||
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) or
|
||||
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) instead of building VictoriaMetrics
|
||||
from sources. Building from sources is reasonable when developing additional features specific
|
||||
to your needs or when testing bugfixes.
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
|
||||
1. Run `make victoria-metrics` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `victoria-metrics` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -930,7 +1099,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
|
||||
1. Run `make victoria-metrics-linux-arm` or `make victoria-metrics-linux-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `victoria-metrics-linux-arm` or `victoria-metrics-linux-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
@@ -944,7 +1113,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
`Pure Go` mode builds only Go code without [cgo](https://golang.org/cmd/cgo/) dependencies.
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
|
||||
1. Run `make victoria-metrics-pure` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `victoria-metrics-pure` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -962,6 +1131,18 @@ For example, the following command builds the image on top of [scratch](https://
|
||||
ROOT_IMAGE=scratch make package-victoria-metrics
|
||||
```
|
||||
|
||||
#### Building VictoriaMetrics with Podman
|
||||
|
||||
VictoriaMetrics can be built with Podman in either rootful or rootless mode.
|
||||
|
||||
When building via rootlful Podman, simply add `DOCKER=podman` to the relevant `make` commandline. To build
|
||||
via rootless Podman, add `DOCKER=podman DOCKER_RUN="podman run --userns=keep-id"` to the `make`
|
||||
commandline.
|
||||
|
||||
For example: `make victoria-metrics-pure DOCKER=podman DOCKER_RUN="podman run --userns=keep-id"`
|
||||
|
||||
Note that `production` builds are not supported via Podman becuase Podman does not support `buildx`.
|
||||
|
||||
## Start with docker-compose
|
||||
|
||||
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
|
||||
@@ -988,6 +1169,15 @@ Snapshots are created under `<-storageDataPath>/snapshots` directory, where `<-s
|
||||
is the command-line flag value. Snapshots can be archived to backup storage at any time
|
||||
with [vmbackup](https://docs.victoriametrics.com/vmbackup.html).
|
||||
|
||||
Snapshots consist of a mix of hard-links and soft-links to various files and directories inside `-storageDataPath`.
|
||||
See [this article](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282)
|
||||
for more details. This adds some restrictions on what can be done with the contents of `<-storageDataPath>/snapshots` directory:
|
||||
|
||||
- Do not delete subdirectories inside `<-storageDataPath>/snapshots` with `rm` or similar commands, since this will leave some snapshot data undeleted.
|
||||
Prefer using the `/snapshot/delete` API for deleting snapshot. See below for more details about this API.
|
||||
- Do not copy subdirectories inside `<-storageDataPath>/snapshot` with `cp`, `rsync` or similar commands, since there are high chances
|
||||
that these commands won't copy some data stored in the snapshot. Prefer using [vmbackup](https://docs.victoriametrics.com/vmbackup.html) for making copies of snapshot data.
|
||||
|
||||
The `http://<victoriametrics-addr>:8428/snapshot/list` page contains the list of available snapshots.
|
||||
|
||||
Navigate to `http://<victoriametrics-addr>:8428/snapshot/delete?snapshot=<snapshot-name>` in order
|
||||
@@ -1065,7 +1255,9 @@ VictoriaMetrics provides the following handlers for exporting data:
|
||||
Send a request to `http://<victoriametrics-addr>:8428/api/v1/export?match[]=<timeseries_selector_for_export>`,
|
||||
where `<timeseries_selector_for_export>` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)
|
||||
for metrics to export. Use `{__name__!=""}` selector for fetching all the time series.
|
||||
The response would contain all the data for the selected time series in [JSON streaming format](http://ndjson.org/).
|
||||
|
||||
The response would contain all the data for the selected time series in JSON line format - see [these docs](#json-line-format) for details on this format.
|
||||
|
||||
Each JSON line contains samples for a single time series. An example output:
|
||||
|
||||
```json
|
||||
@@ -1195,6 +1387,8 @@ check for changes in `vm_rows_invalid_total` (exported by server side) metric.
|
||||
|
||||
### How to import data in JSON line format
|
||||
|
||||
VictoriaMetrics accepts metrics data in JSON line format at `/api/v1/import` endpoint. See [these docs](#json-line-format) for details on this format.
|
||||
|
||||
Example for importing data obtained via [/api/v1/export](#how-to-export-data-in-json-line-format):
|
||||
|
||||
```console
|
||||
@@ -1359,13 +1553,54 @@ Note that it could be required to flush response cache after importing historica
|
||||
|
||||
VictoriaMetrics also may scrape Prometheus targets - see [these docs](#how-to-scrape-prometheus-exporters-such-as-node-exporter).
|
||||
|
||||
## Sending data via OpenTelemetry
|
||||
### Sending data via OpenTelemetry
|
||||
|
||||
VictoriaMetrics supports data ingestion via [OpenTelemetry protocol for metrics](https://github.com/open-telemetry/opentelemetry-specification/blob/ffddc289462dfe0c2041e3ca42a7b1df805706de/specification/metrics/data-model.md) at `/opentelemetry/api/v1/push` path.
|
||||
|
||||
VictoriaMetrics expects `protobuf`-encoded requests at `/opentelemetry/api/v1/push`.
|
||||
Set HTTP request header `Content-Encoding: gzip` when sending gzip-compressed data to `/opentelemetry/api/v1/push`.
|
||||
|
||||
## JSON line format
|
||||
|
||||
VictoriaMetrics accepts data in JSON line format at [/api/v1/import](#how-to-import-data-in-json-line-format)
|
||||
and exports data in this format at [/api/v1/export](#how-to-export-data-in-json-line-format).
|
||||
|
||||
The format follows [JSON streaming concept](http://ndjson.org/), e.g. each line contains JSON object with metrics data in the following format:
|
||||
|
||||
```
|
||||
{
|
||||
// metric contans metric name plus labels for a particular time series
|
||||
"metric":{
|
||||
"__name__": "metric_name", // <- this is metric name
|
||||
|
||||
// Other labels for the time series
|
||||
|
||||
"label1": "value1",
|
||||
"label2": "value2",
|
||||
...
|
||||
"labelN": "valueN"
|
||||
},
|
||||
|
||||
// values contains raw sample values for the given time series
|
||||
"values": [1, 2.345, -678],
|
||||
|
||||
// timestamps contains raw sample UNIX timestamps in milliseconds for the given time series
|
||||
// every timestamp is associated with the value at the corresponding position
|
||||
"timestamps": [1549891472010,1549891487724,1549891503438]
|
||||
}
|
||||
```
|
||||
|
||||
Note that every JSON object must be written in a single line, e.g. all the newline chars must be removed from it.
|
||||
Every line length is limited by the value passed to `-import.maxLineLen` command-line flag (by default this is 100MB).
|
||||
|
||||
It is recommended passing 1K-10K samples per line for achieving the maximum data ingestion performance at [/api/v1/import](#how-to-import-data-in-json-line-format).
|
||||
Too long JSON lines may increase RAM usage at VictoriaMetrics side.
|
||||
|
||||
It is OK to split [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples)
|
||||
for the same [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series) across multiple lines.
|
||||
|
||||
The number of lines in JSON line document can be arbitrary.
|
||||
|
||||
## Relabeling
|
||||
|
||||
VictoriaMetrics supports Prometheus-compatible relabeling for all the ingested metrics if `-relabelConfig` command-line flag points
|
||||
@@ -1465,43 +1700,44 @@ See also [cardinality limiter](#cardinality-limiter) and [capacity planning docs
|
||||
|
||||
## High availability
|
||||
|
||||
* Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
|
||||
* Pass addresses of these instances to [vmagent](https://docs.victoriametrics.com/vmagent.html) via `-remoteWrite.url` command-line flag:
|
||||
The general approach for achieving high availability is the following:
|
||||
|
||||
- to run two identically configured VictoriaMetrics instances in distinct datacenters (availability zones)
|
||||
- to store the collected data simultaneously into these instances via [vmagent](https://docs.victoriametrics.com/vmagent.html) or Prometheus
|
||||
- to query the first VictoriaMetrics instance and to fail over to the second instance when the first instance becomes temporarily unavailable.
|
||||
|
||||
Such a setup guarantees that the collected data isn't lost when one of VictoriaMetrics instance becomes unavailable.
|
||||
The collected data continues to be written to the available VictoriaMetrics instance, so it should be available for querying.
|
||||
Both [vmagent](https://docs.victoriametrics.com/vmagent.html) and Prometheus buffer the collected data locally if they cannot send it
|
||||
to the configured remote storage. So the collected data will be written to the temporarily unavailable VictoriaMetrics instance
|
||||
after it becomes available.
|
||||
|
||||
If you use [vmagent](https://docs.victoriametrics.com/vmagent.html) for storing the data into VictoriaMetrics,
|
||||
then it can be configured with multiple `-remoteWrite.url` command-line flags, where every flag points to the VictoriaMetrics
|
||||
instance in a particular availability zone, in order to replicate the collected data to all the VictoriaMetrics instances.
|
||||
For example, the following command instructs `vmagent` to replicate data to `vm-az1` and `vm-az2` instances of VictoriaMetrics:
|
||||
|
||||
```console
|
||||
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
|
||||
/path/to/vmagent \
|
||||
-remoteWrite.url=http://<vm-az1>:8428/api/v1/write \
|
||||
-remoteWrite.url=http://<vm-az2>:8428/api/v1/write
|
||||
```
|
||||
|
||||
Alternatively these addresses may be passed to `remote_write` section in Prometheus config:
|
||||
If you use Prometheus for collecting and writing the data to VictoriaMetrics,
|
||||
then the following [`remote_write`](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section
|
||||
in Prometheus config can be used for replicating the collected data to `vm-az1` and `vm-az2` VictoriaMetrics instances:
|
||||
|
||||
```yml
|
||||
remote_write:
|
||||
- url: http://<victoriametrics-addr-1>:8428/api/v1/write
|
||||
queue_config:
|
||||
max_samples_per_send: 10000
|
||||
# ...
|
||||
- url: http://<victoriametrics-addr-N>:8428/api/v1/write
|
||||
queue_config:
|
||||
max_samples_per_send: 10000
|
||||
- url: http://<vm-az1>:8428/api/v1/write
|
||||
- url: http://<vm-az2>:8428/api/v1/write
|
||||
```
|
||||
|
||||
* Apply the updated config:
|
||||
It is recommended to use [vmagent](https://docs.victoriametrics.com/vmagent.html) instead of Prometheus for highly loaded setups,
|
||||
since it uses lower amounts of RAM, CPU and network bandwidth than Prometheus.
|
||||
|
||||
```console
|
||||
kill -HUP `pidof prometheus`
|
||||
```
|
||||
|
||||
It is recommended to use [vmagent](https://docs.victoriametrics.com/vmagent.html) instead of Prometheus for highly loaded setups.
|
||||
|
||||
* Now Prometheus should write data into all the configured `remote_write` urls in parallel.
|
||||
* Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
|
||||
* Set up Prometheus datasource in Grafana that points to Promxy.
|
||||
|
||||
If you have Prometheus HA pairs with replicas `r1` and `r2` in each pair, then configure each `r1`
|
||||
to write data to `victoriametrics-addr-1`, while each `r2` should write data to `victoriametrics-addr-2`.
|
||||
|
||||
Another option is to write data simultaneously from Prometheus HA pair to a pair of VictoriaMetrics instances
|
||||
with the enabled de-duplication. See [this section](#deduplication) for details.
|
||||
If you use identically configured [vmagent](https://docs.victoriametrics.com/vmagent.html) instances for collecting the same data
|
||||
and sending it to VictoriaMetrics, then do not forget enabling [deduplication](#deduplication) at VictoriaMetrics side.
|
||||
|
||||
## Deduplication
|
||||
|
||||
@@ -1609,8 +1845,8 @@ See also [how to work with snapshots](#how-to-work-with-snapshots).
|
||||
## Retention
|
||||
|
||||
Retention is configured with the `-retentionPeriod` command-line flag, which takes a number followed by a time unit
|
||||
character - `h(ours)`, `d(ays)`, `w(eeks)`, `y(ears)`. If the time unit is not specified, a month is assumed.
|
||||
For instance, `-retentionPeriod=3` means that the data will be stored for 3 months and then deleted.
|
||||
character - `h(ours)`, `d(ays)`, `w(eeks)`, `y(ears)`. If the time unit is not specified, a month (31 days) is assumed.
|
||||
For instance, `-retentionPeriod=3` means that the data will be stored for 3 months (93 days) and then deleted.
|
||||
The default retention period is one month. The **minimum retention** period is 24h or 1d.
|
||||
|
||||
Data is split in per-month partitions inside `<-storageDataPath>/data/{small,big}` folders.
|
||||
@@ -1653,9 +1889,10 @@ See [these docs](https://docs.victoriametrics.com/guides/guide-vmcluster-multipl
|
||||
which allow configuring multiple retentions for distinct sets of time series matching the configured [series filters](https://docs.victoriametrics.com/keyConcepts.html#filtering)
|
||||
via `-retentionFilter` command-line flag. This flag accepts `filter:duration` options, where `filter` must be
|
||||
a valid [series filter](https://docs.victoriametrics.com/keyConcepts.html#filtering), while the `duration`
|
||||
must contain valid [retention](#retention) for time series matching the given `filter`. If series doesn't match
|
||||
any configured `-retentionFilter`, then the retention configured via [-retentionPeriod](#retention) command-line flag is applied to it.
|
||||
If series matches multiple configured retention filters, then the smallest retention is applied.
|
||||
must contain valid [retention](#retention) for time series matching the given `filter`.
|
||||
The `duration` of the `-retentionFilter` must be lower or equal to [-retentionPeriod](#retention) flag value.
|
||||
If series doesn't match any configured `-retentionFilter`, then the retention configured via [-retentionPeriod](#retention)
|
||||
command-line flag is applied to it. If series matches multiple configured retention filters, then the smallest retention is applied.
|
||||
|
||||
For example, the following config sets 3 days retention for time series with `team="juniors"` label,
|
||||
30 days retention for time series with `env="dev"` or `env="staging"` label and 1 year retention for the remaining time series:
|
||||
@@ -1676,7 +1913,8 @@ to historical data.
|
||||
|
||||
See [how to configure multiple retentions in VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#retention-filters).
|
||||
|
||||
Retention filters can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
|
||||
Retention filters can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
|
||||
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
|
||||
|
||||
## Downsampling
|
||||
|
||||
@@ -1686,14 +1924,34 @@ Retention filters can be evaluated for free by downloading and using enterprise
|
||||
|
||||
* `-downsampling.period=30d:5m,180d:1h` instructs VictoriaMetrics to deduplicate samples older than 30 days with 5 minutes interval and to deduplicate samples older than 180 days with 1 hour interval.
|
||||
|
||||
Downsampling is applied independently per each time series. It can reduce disk space usage and improve query performance if it is applied to time series with big number of samples per each series. The downsampling doesn't improve query performance if the database contains big number of time series with small number of samples per each series (aka [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate)), since downsampling doesn't reduce the number of time series. So the majority of time is spent on searching for the matching time series.
|
||||
It is possible to use [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html) in vmagent or recording rules in [vmalert](https://docs.victoriametrics.com/vmalert.html) in order to [reduce the number of time series](https://docs.victoriametrics.com/vmalert.html#downsampling-and-aggregation-via-vmalert).
|
||||
Downsampling is applied independently per each time series and leaves a single [raw sample](https://docs.victoriametrics.com/keyConcepts.html#raw-samples)
|
||||
with the biggest [timestamp](https://en.wikipedia.org/wiki/Unix_time) on the configured interval, in the same way as [deduplication](#deduplication) does.
|
||||
It works the best for [counters](https://docs.victoriametrics.com/keyConcepts.html#counter) and [histograms](https://docs.victoriametrics.com/keyConcepts.html#histogram),
|
||||
as their values are always increasing. But downsampling [gauges](https://docs.victoriametrics.com/keyConcepts.html#gauge)
|
||||
and [summaries](https://docs.victoriametrics.com/keyConcepts.html#summary)
|
||||
would mean losing the changes within the downsampling interval. Please note, you can use [recording rules](https://docs.victoriametrics.com/vmalert.html#rules)
|
||||
or [steaming aggregation](https://docs.victoriametrics.com/stream-aggregation.html)
|
||||
to apply custom aggregation functions, like min/max/avg etc., in order to make gauges more resilient to downsampling.
|
||||
|
||||
Downsampling can reduce disk space usage and improve query performance if it is applied to time series with big number
|
||||
of samples per each series. The downsampling doesn't improve query performance if the database contains big number
|
||||
of time series with small number of samples per each series (aka [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate)),
|
||||
since downsampling doesn't reduce the number of time series. In this case the majority of query time is spent on searching for the matching time series
|
||||
instead of processing the found samples.
|
||||
It is possible to use [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html) in [vmagent](https://docs.victoriametrics.com/vmagent.html)
|
||||
or recording rules in [vmalert](https://docs.victoriametrics.com/vmalert.html) in order to
|
||||
[reduce the number of time series](https://docs.victoriametrics.com/vmalert.html#downsampling-and-aggregation-via-vmalert).
|
||||
|
||||
Downsampling happens during [background merges](https://docs.victoriametrics.com/#storage)
|
||||
and can't be performed if there is not enough of free disk space or if vmstorage
|
||||
is in [read-only mode](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#readonly-mode).
|
||||
|
||||
The downsampling can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
|
||||
Please, note that intervals of `-downsampling.period` must be multiples of each other.
|
||||
In case [deduplication](https://docs.victoriametrics.com/#deduplication) is enabled value of `-dedup.minScrapeInterval` must also be multiple of `-downsampling.period` intervals.
|
||||
This is required to ensure consistency of deduplication and downsampling results.
|
||||
|
||||
The downsampling can be evaluated for free by downloading and using enterprise binaries from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
|
||||
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
|
||||
|
||||
## Multi-tenancy
|
||||
|
||||
@@ -1740,6 +1998,8 @@ VictoriaMetrics provides the following security-related command-line flags:
|
||||
* `-flagsAuthKey` for protecting `/flags` endpoint.
|
||||
* `-pprofAuthKey` for protecting `/debug/pprof/*` endpoints, which can be used for [profiling](#profiling).
|
||||
* `-denyQueryTracing` for disallowing [query tracing](#query-tracing).
|
||||
* `-http.header.hsts`, `-http.header.csp`, and `-http.header.frameOptions` for serving `Strict-Transport-Security`, `Content-Security-Policy`
|
||||
and `X-Frame-Options` HTTP response headers.
|
||||
|
||||
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
|
||||
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`. This protects from unexpected requests from untrusted network interfaces.
|
||||
@@ -1756,7 +2016,7 @@ and [the general security page at VictoriaMetrics website](https://victoriametri
|
||||
The only option is increasing the limit on [the number of open files in the OS](https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a).
|
||||
The recommendation is not specific for VictoriaMetrics only but also for any service which handles many HTTP connections and stores data on disk.
|
||||
* VictoriaMetrics is a write-heavy application and its performance depends on disk performance. So be careful with other
|
||||
applications or utilities (like [fstrim](http://manpages.ubuntu.com/manpages/bionic/man8/fstrim.8.html))
|
||||
applications or utilities (like [fstrim](https://manpages.ubuntu.com/manpages/lunar/en/man8/fstrim.8.html))
|
||||
which could [exhaust disk resources](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1521).
|
||||
* The recommended filesystem is `ext4`, the recommended persistent storage is [persistent HDD-based disk on GCP](https://cloud.google.com/compute/docs/disks/#pdspecs),
|
||||
since it is protected from hardware failures via internal replication and it can be [resized on the fly](https://cloud.google.com/compute/docs/disks/add-persistent-disk#resize_pd).
|
||||
@@ -1785,9 +2045,9 @@ Graphs on the dashboards contain useful hints - hover the `i` icon in the top le
|
||||
We recommend setting up [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts)
|
||||
via [vmalert](https://docs.victoriametrics.com/vmalert.html) or via Prometheus.
|
||||
|
||||
VictoriaMetrics exposes currently running queries and their execution times at `/api/v1/status/active_queries` page.
|
||||
VictoriaMetrics exposes currently running queries and their execution times at [`active queries` page](#active-queries).
|
||||
|
||||
VictoriaMetrics exposes queries, which take the most time to execute, at `/api/v1/status/top_queries` page.
|
||||
VictoriaMetrics exposes queries, which take the most time to execute, at [`top queries` page](#top-queries).
|
||||
|
||||
See also [VictoriaMetrics Monitoring](https://victoriametrics.com/blog/victoriametrics-monitoring/)
|
||||
and [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting.html).
|
||||
@@ -1922,7 +2182,7 @@ and [cardinality explorer docs](#cardinality-explorer).
|
||||
|
||||
* It is recommended inspecting logs during troubleshooting, since they may contain useful information.
|
||||
|
||||
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest),
|
||||
since the encountered issue could be already fixed there.
|
||||
|
||||
* It is recommended to have at least 50% of spare resources for CPU, disk IO and RAM, so VictoriaMetrics could handle short spikes in the workload without performance issues.
|
||||
@@ -1932,9 +2192,11 @@ and [cardinality explorer docs](#cardinality-explorer).
|
||||
has at least 20% of free space. The remaining amount of free space
|
||||
can be [monitored](#monitoring) via `vm_free_disk_space_bytes` metric. The total size of data
|
||||
stored on the disk can be monitored via sum of `vm_data_size_bytes` metrics.
|
||||
See also `vm_merge_need_free_disk_space` metrics, which are set to values higher than 0
|
||||
if background merge cannot be initiated due to free disk space shortage. The value shows the number of per-month partitions,
|
||||
which would start background merge if they had more free disk space.
|
||||
|
||||
* If you run VictoriaMetrics on a host with 16 or more CPU cores, then it may be needed to tune the `-search.maxWorkersPerQuery` command-line flag
|
||||
in order to improve query performance. If VictoriaMetrics serves big number of concurrent `select` queries, then try reducing the value for this flag.
|
||||
If VcitoriaMetrics serves heavy queries, which select `>10K` of [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series) and/or process `>100M`
|
||||
of [raw samples](https://docs.victoriametrics.com/keyConcepts.html#raw-samples) per query, then try setting the value for this flag to the number of available CPU cores.
|
||||
|
||||
* VictoriaMetrics buffers incoming data in memory for up to a few seconds before flushing it to persistent storage.
|
||||
This may lead to the following "issues":
|
||||
@@ -1972,15 +2234,19 @@ and [cardinality explorer docs](#cardinality-explorer).
|
||||
This suppresses default gap filling algorithm used by VictoriaMetrics - by default it assumes
|
||||
each time series is continuous instead of discrete, so it fills gaps between real samples with regular intervals.
|
||||
|
||||
* Metrics and labels leading to [high cardinality](https://docs.victoriametrics.com/FAQ.html#what-is-high-cardinality) or [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate) can be determined via [cardinality explorer](#cardinality-explorer) and via [/api/v1/status/tsdb](#tsdb-stats) endpoint.
|
||||
* Metrics and labels leading to [high cardinality](https://docs.victoriametrics.com/FAQ.html#what-is-high-cardinality)
|
||||
or [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate) can be determined
|
||||
via [cardinality explorer](#cardinality-explorer) and via [/api/v1/status/tsdb](#tsdb-stats) endpoint.
|
||||
|
||||
* New time series can be logged if `-logNewSeries` command-line flag is passed to VictoriaMetrics.
|
||||
|
||||
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
|
||||
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
|
||||
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag
|
||||
and drops superflouos labels. This prevents from ingesting metrics with too many labels.
|
||||
It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
|
||||
metric in order to determine whether `-maxLabelsPerTimeseries` must be adjusted for your workload.
|
||||
|
||||
* If you store Graphite metrics like `foo.bar.baz` in VictoriaMetrics, then `{__graphite__="foo.*.baz"}` filter can be used for selecting such metrics. See [these docs](#selecting-graphite-metrics) for details.
|
||||
* If you store Graphite metrics like `foo.bar.baz` in VictoriaMetrics, then `{__graphite__="foo.*.baz"}` filter can be used for selecting such metrics.
|
||||
See [these docs](#selecting-graphite-metrics) for details. You can also query Graphite metrics with [Graphite querying API](#graphite-render-api-usage).
|
||||
|
||||
* VictoriaMetrics ignores `NaN` values during data ingestion.
|
||||
|
||||
@@ -2122,7 +2388,8 @@ See also [high availability docs](#high-availability) and [backup docs](#backups
|
||||
VictoriaMetrics supports backups via [vmbackup](https://docs.victoriametrics.com/vmbackup.html)
|
||||
and [vmrestore](https://docs.victoriametrics.com/vmrestore.html) tools.
|
||||
We also provide [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html) tool for enterprise subscribers.
|
||||
Enterprise binaries can be downloaded and evaluated for free from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
|
||||
Enterprise binaries can be downloaded and evaluated for free from [the releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest).
|
||||
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
|
||||
|
||||
## vmalert
|
||||
|
||||
@@ -2197,12 +2464,14 @@ Contact us with any questions regarding VictoriaMetrics at [info@victoriametrics
|
||||
|
||||
Feel free asking any questions regarding VictoriaMetrics:
|
||||
|
||||
* [slack](https://slack.victoriametrics.com/)
|
||||
* [linkedin](https://www.linkedin.com/company/victoriametrics/)
|
||||
* [reddit](https://www.reddit.com/r/VictoriaMetrics/)
|
||||
* [telegram-en](https://t.me/VictoriaMetrics_en)
|
||||
* [telegram-ru](https://t.me/VictoriaMetrics_ru1)
|
||||
* [google groups](https://groups.google.com/forum/#!forum/victorametrics-users)
|
||||
* [Slack](https://slack.victoriametrics.com/)
|
||||
* [Twitter](https://twitter.com/VictoriaMetrics/)
|
||||
* [Linkedin](https://www.linkedin.com/company/victoriametrics/)
|
||||
* [Reddit](https://www.reddit.com/r/VictoriaMetrics/)
|
||||
* [Telegram-en](https://t.me/VictoriaMetrics_en)
|
||||
* [Telegram-ru](https://t.me/VictoriaMetrics_ru1)
|
||||
* [Google groups](https://groups.google.com/forum/#!forum/victorametrics-users)
|
||||
* [Mastodon](https://mastodon.social/@victoriametrics/)
|
||||
|
||||
If you like VictoriaMetrics and want to contribute, then we need the following:
|
||||
|
||||
@@ -2262,6 +2531,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
```
|
||||
-bigMergeConcurrency int
|
||||
Deprecated: this flag does nothing. Please use -smallMergeConcurrency for controlling the concurrency of background merges. See https://docs.victoriametrics.com/#storage
|
||||
-blockcache.missesBeforeCaching int
|
||||
The number of cache misses before putting the block into cache. Higher values may reduce indexdb/dataBlocks cache size at the cost of higher CPU and disk read usage (default 2)
|
||||
-cacheExpireDuration duration
|
||||
Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s)
|
||||
-configAuthKey string
|
||||
@@ -2293,7 +2564,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-filestream.disableFadvise
|
||||
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
|
||||
-finalMergeDelay duration
|
||||
The delay before starting final merge for per-month partition after no new data is ingested into it. Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. Zero value disables final merge
|
||||
-flagsAuthKey string
|
||||
@@ -2314,6 +2587,12 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
|
||||
-http.header.csp string
|
||||
Value for 'Content-Security-Policy' header
|
||||
-http.header.frameOptions string
|
||||
Value for 'X-Frame-Options' header
|
||||
-http.header.hsts string
|
||||
Value for 'Strict-Transport-Security' header
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
@@ -2363,6 +2642,12 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
Whether to disable caches for interned strings. This may reduce memory usage at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringCacheExpireDuration and -internStringMaxLen
|
||||
-internStringMaxLen int
|
||||
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
|
||||
-license string
|
||||
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
|
||||
-license.forceOffline
|
||||
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
|
||||
-licenseFile string
|
||||
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
|
||||
-logNewSeries
|
||||
Whether to log new series. This option is for debug purposes only. It can lead to performance issues when big number of new series are ingested into VictoriaMetrics
|
||||
-loggerDisableTimestamps
|
||||
@@ -2375,6 +2660,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
Allows renaming fields in JSON formatted logs. Example: "ts:timestamp,msg:message" renames "ts" to "timestamp" and "msg" to "message". Supported fields: ts, level, caller, msg
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerMaxArgLen int
|
||||
The maximum length of a single logged argument. Longer arguments are replaced with 'arg_start..arg_end', where 'arg_start' and 'arg_end' is prefix and suffix of the arg with the length not exceeding -loggerMaxArgLen / 2 (default 500)
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
@@ -2382,7 +2669,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxConcurrentInserts int
|
||||
The maximum number of concurrent insert requests. The default value should work for most cases, since it minimizes memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
|
||||
The maximum number of concurrent insert requests. Default value should work for most cases, since it minimizes the memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
|
||||
-maxInsertRequestSize size
|
||||
The maximum size in bytes of a single Prometheus remote_write API request
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432)
|
||||
@@ -2397,6 +2684,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from the OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-newrelic.maxInsertRequestSize size
|
||||
The maximum size in bytes of a single NewRelic request to /newrelic/infra/v2/metrics/events/bulk
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 67108864)
|
||||
-opentsdbHTTPListenAddr string
|
||||
TCP address to listen for OpenTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty. See also -opentsdbHTTPListenAddr.useProxyProtocol
|
||||
-opentsdbHTTPListenAddr.useProxyProtocol
|
||||
@@ -2420,14 +2710,16 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. See also -cacheExpireDuration (default 0.1)
|
||||
-promscrape.azureSDCheckInterval duration
|
||||
Interval for checking for changes in Azure. This works only if azure_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#azure_sd_configs for details (default 1m0s)
|
||||
-promscrape.cluster.memberLabel string
|
||||
If non-empty, then the label with this name and the -promscrape.cluster.memberNum value is added to all the scraped metrics. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
|
||||
-promscrape.cluster.memberNum string
|
||||
The number of number in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
|
||||
The number of vmagent instance in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name. See also -promscrape.cluster.memberLabel . See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default "0")
|
||||
-promscrape.cluster.membersCount int
|
||||
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets
|
||||
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
|
||||
-promscrape.cluster.name string
|
||||
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
|
||||
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
|
||||
-promscrape.cluster.replicationFactor int
|
||||
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
|
||||
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
|
||||
-promscrape.config string
|
||||
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. The path can point to local file and to http url. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
|
||||
-promscrape.config.dryRun
|
||||
@@ -2435,7 +2727,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
-promscrape.config.strictParse
|
||||
Whether to deny unsupported fields in -promscrape.config . Set to false in order to silently skip unsupported fields (default true)
|
||||
-promscrape.configCheckInterval duration
|
||||
Interval for checking for changes in '-promscrape.config' file. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes
|
||||
Interval for checking for changes in -promscrape.config file. By default, the checking is disabled. See how to reload -promscrape.config file at https://docs.victoriametrics.com/vmagent.html#configuration-update
|
||||
-promscrape.consul.waitTime duration
|
||||
Wait time used by Consul service discovery. Default value is used if not set
|
||||
-promscrape.consulSDCheckInterval duration
|
||||
@@ -2522,7 +2814,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-retentionPeriod value
|
||||
Data with timestamps outside the retentionPeriod is automatically deleted. The minimum retentionPeriod is 24h or 1d. See also -retentionFilter
|
||||
The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 1)
|
||||
The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 1)
|
||||
-retentionTimezoneOffset duration
|
||||
The offset for performing indexdb rotation. If set to 0, then the indexdb rotation is performed at 4am UTC time per each -retentionPeriod. If set to 2h, then the indexdb rotation is performed at 4am EET time (the timezone with +2h offset)
|
||||
-search.cacheTimestampOffset duration
|
||||
@@ -2538,7 +2830,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
-search.latencyOffset duration
|
||||
The time when data points become visible in query results after the collection. It can be overridden on per-query basis via latency_offset arg. Too small value can result in incomplete last points for query results (default 30s)
|
||||
-search.logQueryMemoryUsage size
|
||||
Log queries, which require more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
|
||||
Log query and increment vm_memory_intensive_queries_total metric each time the query requires more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 0)
|
||||
-search.logSlowQueryDuration duration
|
||||
Log queries with execution time exceeding this value. Zero disables slow query logging. See also -search.logQueryMemoryUsage (default 5s)
|
||||
@@ -2596,8 +2888,13 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
The maximum number of tag values returned from /api/v1/label/<label_name>/values (default 100000)
|
||||
-search.maxUniqueTimeseries int
|
||||
The maximum number of unique time series, which can be selected during /api/v1/query and /api/v1/query_range queries. This option allows limiting memory usage (default 300000)
|
||||
-search.maxWorkersPerQuery int
|
||||
The maximum number of CPU cores a single query can use. The default value should work good for most cases. The flag can be set to lower values for improving performance of big number of concurrently executed queries. The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). There is no sense in setting this flag to values bigger than the number of CPU cores available on the system (default 4)
|
||||
-search.minStalenessInterval duration
|
||||
The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval'
|
||||
-search.minWindowForInstantRollupOptimization value
|
||||
Enable cache-based optimization for repeated queries to /api/v1/query (aka instant queries), which contain rollup functions with lookbehind window exceeding the given value
|
||||
The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 6h)
|
||||
-search.noStaleMarkers
|
||||
Set this flag to true if the database doesn't contain Prometheus stale markers, so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets
|
||||
-search.queryStats.lastQueriesCount int
|
||||
@@ -2624,7 +2921,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
|
||||
The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period
|
||||
-snapshotsMaxAge value
|
||||
Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted
|
||||
The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
|
||||
The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
|
||||
-sortLabels
|
||||
Whether to sort labels for incoming samples before writing them to storage. This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit
|
||||
-storage.cacheSizeIndexDBDataBlocks size
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
| Version | Supported |
|
||||
|---------|--------------------|
|
||||
| [latest release](https://docs.victoriametrics.com/CHANGELOG.html) | :white_check_mark: |
|
||||
| v1.93.x LTS release | :white_check_mark: |
|
||||
| v1.87.x LTS release | :white_check_mark: |
|
||||
| v1.79.x LTS release | :white_check_mark: |
|
||||
| other releases | :x: |
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
@@ -3,6 +3,7 @@ package elasticsearch
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
@@ -11,6 +12,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert/insertutils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bufferedwriter"
|
||||
@@ -21,7 +24,10 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
elasticsearchVersion = flag.String("elasticsearch.version", "8.9.0", "Elasticsearch version to report to client")
|
||||
)
|
||||
|
||||
// RequestHandler processes Elasticsearch insert requests
|
||||
@@ -60,9 +66,9 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
|
||||
// See the latest available version for Elasticsearch at https://github.com/elastic/elasticsearch/releases
|
||||
fmt.Fprintf(w, `{
|
||||
"version": {
|
||||
"number": "8.8.0"
|
||||
"number": %q
|
||||
}
|
||||
}`)
|
||||
}`, *elasticsearchVersion)
|
||||
case http.MethodHead:
|
||||
// Return empty response for Logstash ping request.
|
||||
}
|
||||
@@ -88,22 +94,32 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
if err := vlstorage.CanWriteData(); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
lr := logstorage.GetLogRows(cp.StreamFields, cp.IgnoreFields)
|
||||
processLogMessage := cp.GetProcessLogMessageFunc(lr)
|
||||
isGzip := r.Header.Get("Content-Encoding") == "gzip"
|
||||
n, err := readBulkRequest(r.Body, isGzip, cp.TimeField, cp.MsgField, processLogMessage)
|
||||
vlstorage.MustAddRows(lr)
|
||||
logstorage.PutLogRows(lr)
|
||||
if err != nil {
|
||||
logger.Warnf("cannot decode log message #%d in /_bulk request: %s", n, err)
|
||||
return true
|
||||
}
|
||||
vlstorage.MustAddRows(lr)
|
||||
logstorage.PutLogRows(lr)
|
||||
|
||||
tookMs := time.Since(startTime).Milliseconds()
|
||||
bw := bufferedwriter.Get(w)
|
||||
defer bufferedwriter.Put(bw)
|
||||
WriteBulkResponse(bw, n, tookMs)
|
||||
_ = bw.Flush()
|
||||
|
||||
// update bulkRequestDuration only for successfully parsed requests
|
||||
// There is no need in updating bulkRequestDuration for request errors,
|
||||
// since their timings are usually much smaller than the timing for successful request parsing.
|
||||
bulkRequestDuration.UpdateDuration(startTime)
|
||||
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
@@ -111,7 +127,9 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
|
||||
var (
|
||||
bulkRequestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/elasticsearch/_bulk"}`)
|
||||
bulkRequestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/elasticsearch/_bulk"}`)
|
||||
rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="elasticsearch_bulk"}`)
|
||||
bulkRequestDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/elasticsearch/_bulk"}`)
|
||||
)
|
||||
|
||||
func readBulkRequest(r io.Reader, isGzip bool, timeField, msgField string,
|
||||
@@ -157,8 +175,6 @@ func readBulkRequest(r io.Reader, isGzip bool, timeField, msgField string,
|
||||
|
||||
var lineBufferPool bytesutil.ByteBufferPool
|
||||
|
||||
var rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="elasticsearch_bulk"}`)
|
||||
|
||||
func readBulkLine(sc *bufio.Scanner, timeField, msgField string,
|
||||
processLogMessage func(timestamp int64, fields []logstorage.Field),
|
||||
) (bool, error) {
|
||||
@@ -209,6 +225,7 @@ func readBulkLine(sc *bufio.Scanner, timeField, msgField string,
|
||||
p.RenameField(msgField, "_msg")
|
||||
processLogMessage(ts, p.Fields)
|
||||
logjson.PutParser(p)
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -120,10 +120,10 @@ func compressData(s string) string {
|
||||
var bb bytes.Buffer
|
||||
zw := gzip.NewWriter(&bb)
|
||||
if _, err := zw.Write([]byte(s)); err != nil {
|
||||
panic(fmt.Errorf("unexpected error when compressing data: %s", err))
|
||||
panic(fmt.Errorf("unexpected error when compressing data: %w", err))
|
||||
}
|
||||
if err := zw.Close(); err != nil {
|
||||
panic(fmt.Errorf("unexpected error when closing gzip writer: %s", err))
|
||||
panic(fmt.Errorf("unexpected error when closing gzip writer: %w", err))
|
||||
}
|
||||
return bb.String()
|
||||
}
|
||||
|
||||
@@ -43,7 +43,7 @@ func benchmarkReadBulkRequest(b *testing.B, isGzip bool) {
|
||||
r.Reset(dataBytes)
|
||||
_, err := readBulkRequest(r, isGzip, timeField, msgField, processLogMessage)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("unexpected error: %s", err))
|
||||
panic(fmt.Errorf("unexpected error: %w", err))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
@@ -3,12 +3,13 @@ package insertutils
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
// CommonParams contains common HTTP parameters used by log ingestion APIs.
|
||||
@@ -73,12 +74,19 @@ func GetCommonParams(r *http.Request) (*CommonParams, error) {
|
||||
// GetProcessLogMessageFunc returns a function, which adds parsed log messages to lr.
|
||||
func (cp *CommonParams) GetProcessLogMessageFunc(lr *logstorage.LogRows) func(timestamp int64, fields []logstorage.Field) {
|
||||
return func(timestamp int64, fields []logstorage.Field) {
|
||||
if len(fields) > *MaxFieldsPerLine {
|
||||
rf := logstorage.RowFormatter(fields)
|
||||
logger.Warnf("dropping log line with %d fields; it exceeds -insert.maxFieldsPerLine=%d; %s", len(fields), *MaxFieldsPerLine, rf)
|
||||
rowsDroppedTotalTooManyFields.Inc()
|
||||
return
|
||||
}
|
||||
|
||||
lr.MustAdd(cp.TenantID, timestamp, fields)
|
||||
if cp.Debug {
|
||||
s := lr.GetRowString(0)
|
||||
lr.ResetKeepSettings()
|
||||
logger.Infof("remoteAddr=%s; requestURI=%s; ignoring log entry because of `debug` query arg: %s", cp.DebugRemoteAddr, cp.DebugRequestURI, s)
|
||||
rowsDroppedTotal.Inc()
|
||||
rowsDroppedTotalDebug.Inc()
|
||||
return
|
||||
}
|
||||
if lr.NeedFlush() {
|
||||
@@ -88,4 +96,5 @@ func (cp *CommonParams) GetProcessLogMessageFunc(lr *logstorage.LogRows) func(ti
|
||||
}
|
||||
}
|
||||
|
||||
var rowsDroppedTotal = metrics.NewCounter(`vl_rows_dropped_total{reason="debug"}`)
|
||||
var rowsDroppedTotalDebug = metrics.NewCounter(`vl_rows_dropped_total{reason="debug"}`)
|
||||
var rowsDroppedTotalTooManyFields = metrics.NewCounter(`vl_rows_dropped_total{reason="too_many_fields"}`)
|
||||
|
||||
@@ -1,10 +1,15 @@
|
||||
package insertutils
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
)
|
||||
|
||||
var (
|
||||
// MaxLineSizeBytes is the maximum length of a single line for /insert/* handlers
|
||||
MaxLineSizeBytes = flagutil.NewBytes("insert.maxLineSizeBytes", 256*1024, "The maximum size of a single line, which can be read by /insert/* handlers")
|
||||
|
||||
// MaxFieldsPerLine is the maximum number of fields per line for /insert/* handlers
|
||||
MaxFieldsPerLine = flag.Int("insert.maxFieldsPerLine", 1000, "The maximum number of log fields per line, which can be read by /insert/* handlers")
|
||||
)
|
||||
|
||||
@@ -21,6 +21,7 @@ import (
|
||||
|
||||
// RequestHandler processes jsonline insert requests
|
||||
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
startTime := time.Now()
|
||||
w.Header().Add("Content-Type", "application/json")
|
||||
|
||||
if r.Method != "POST" {
|
||||
@@ -35,6 +36,10 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
if err := vlstorage.CanWriteData(); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
lr := logstorage.GetLogRows(cp.StreamFields, cp.IgnoreFields)
|
||||
processLogMessage := cp.GetProcessLogMessageFunc(lr)
|
||||
|
||||
@@ -77,6 +82,11 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
vlstorage.MustAddRows(lr)
|
||||
logstorage.PutLogRows(lr)
|
||||
|
||||
// update jsonlineRequestDuration only for successfully parsed requests.
|
||||
// There is no need in updating jsonlineRequestDuration for request errors,
|
||||
// since their timings are usually much smaller than the timing for successful request parsing.
|
||||
jsonlineRequestDuration.UpdateDuration(startTime)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -109,6 +119,7 @@ func readLine(sc *bufio.Scanner, timeField, msgField string, processLogMessage f
|
||||
p.RenameField(msgField, "_msg")
|
||||
processLogMessage(ts, p.Fields)
|
||||
logjson.PutParser(p)
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
@@ -144,6 +155,7 @@ func parseISO8601Timestamp(s string) (int64, error) {
|
||||
var lineBufferPool bytesutil.ByteBufferPool
|
||||
|
||||
var (
|
||||
requestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/jsonline"}`)
|
||||
rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="jsonline"}`)
|
||||
requestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/jsonline"}`)
|
||||
rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="jsonline"}`)
|
||||
jsonlineRequestDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/jsonline"}`)
|
||||
)
|
||||
|
||||
@@ -5,29 +5,31 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert/insertutils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
lokiRequestsJSONTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="json"}`)
|
||||
lokiRequestsProtobufTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="protobuf"}`)
|
||||
)
|
||||
|
||||
// RequestHandler processes Loki insert requests
|
||||
//
|
||||
// See https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
|
||||
func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
|
||||
if path != "/api/v1/push" {
|
||||
switch path {
|
||||
case "/api/v1/push":
|
||||
return handleInsert(r, w)
|
||||
case "/ready":
|
||||
// See https://grafana.com/docs/loki/latest/api/#identify-ready-loki-instance
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte("ready"))
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// See https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
|
||||
func handleInsert(r *http.Request, w http.ResponseWriter) bool {
|
||||
contentType := r.Header.Get("Content-Type")
|
||||
switch contentType {
|
||||
case "application/json":
|
||||
lokiRequestsJSONTotal.Inc()
|
||||
return handleJSON(r, w)
|
||||
default:
|
||||
// Protobuf request body should be handled by default accoring to https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
|
||||
lokiRequestsProtobufTotal.Inc()
|
||||
// Protobuf request body should be handled by default according to https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
|
||||
return handleProtobuf(r, w)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,12 +18,11 @@ import (
|
||||
"github.com/valyala/fastjson"
|
||||
)
|
||||
|
||||
var (
|
||||
rowsIngestedJSONTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="json"}`)
|
||||
parserPool fastjson.ParserPool
|
||||
)
|
||||
var parserPool fastjson.ParserPool
|
||||
|
||||
func handleJSON(r *http.Request, w http.ResponseWriter) bool {
|
||||
startTime := time.Now()
|
||||
lokiRequestsJSONTotal.Inc()
|
||||
reader := r.Body
|
||||
if r.Header.Get("Content-Encoding") == "gzip" {
|
||||
zr, err := common.GetGzipReader(reader)
|
||||
@@ -48,19 +47,36 @@ func handleJSON(r *http.Request, w http.ResponseWriter) bool {
|
||||
httpserver.Errorf(w, r, "cannot parse common params from request: %s", err)
|
||||
return true
|
||||
}
|
||||
if err := vlstorage.CanWriteData(); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
lr := logstorage.GetLogRows(cp.StreamFields, cp.IgnoreFields)
|
||||
processLogMessage := cp.GetProcessLogMessageFunc(lr)
|
||||
n, err := parseJSONRequest(data, processLogMessage)
|
||||
vlstorage.MustAddRows(lr)
|
||||
logstorage.PutLogRows(lr)
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "cannot parse Loki request: %s", err)
|
||||
httpserver.Errorf(w, r, "cannot parse Loki json request: %s", err)
|
||||
return true
|
||||
}
|
||||
|
||||
rowsIngestedJSONTotal.Add(n)
|
||||
|
||||
// update lokiRequestJSONDuration only for successfully parsed requests
|
||||
// There is no need in updating lokiRequestJSONDuration for request errors,
|
||||
// since their timings are usually much smaller than the timing for successful request parsing.
|
||||
lokiRequestJSONDuration.UpdateDuration(startTime)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
var (
|
||||
lokiRequestsJSONTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="json"}`)
|
||||
rowsIngestedJSONTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="json"}`)
|
||||
lokiRequestJSONDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/loki/api/v1/push",format="json"}`)
|
||||
)
|
||||
|
||||
func parseJSONRequest(data []byte, processLogMessage func(timestamp int64, fields []logstorage.Field)) (int, error) {
|
||||
p := parserPool.Get()
|
||||
defer parserPool.Put(p)
|
||||
@@ -155,7 +171,6 @@ func parseJSONRequest(data []byte, processLogMessage func(timestamp int64, field
|
||||
Value: bytesutil.ToUnsafeString(msg),
|
||||
})
|
||||
processLogMessage(ts, fields)
|
||||
|
||||
}
|
||||
rowsIngested += len(lines)
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ func benchmarkParseJSONRequest(b *testing.B, streams, rows, labels int) {
|
||||
for pb.Next() {
|
||||
_, err := parseJSONRequest(data, func(timestamp int64, fields []logstorage.Field) {})
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("unexpected error: %s", err))
|
||||
panic(fmt.Errorf("unexpected error: %w", err))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
@@ -19,12 +19,13 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
rowsIngestedProtobufTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="protobuf"}`)
|
||||
bytesBufPool bytesutil.ByteBufferPool
|
||||
pushReqsPool sync.Pool
|
||||
bytesBufPool bytesutil.ByteBufferPool
|
||||
pushReqsPool sync.Pool
|
||||
)
|
||||
|
||||
func handleProtobuf(r *http.Request, w http.ResponseWriter) bool {
|
||||
startTime := time.Now()
|
||||
lokiRequestsProtobufTotal.Inc()
|
||||
wcr := writeconcurrencylimiter.GetReader(r.Body)
|
||||
data, err := io.ReadAll(wcr)
|
||||
writeconcurrencylimiter.PutReader(wcr)
|
||||
@@ -38,19 +39,36 @@ func handleProtobuf(r *http.Request, w http.ResponseWriter) bool {
|
||||
httpserver.Errorf(w, r, "cannot parse common params from request: %s", err)
|
||||
return true
|
||||
}
|
||||
if err := vlstorage.CanWriteData(); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
lr := logstorage.GetLogRows(cp.StreamFields, cp.IgnoreFields)
|
||||
processLogMessage := cp.GetProcessLogMessageFunc(lr)
|
||||
n, err := parseProtobufRequest(data, processLogMessage)
|
||||
vlstorage.MustAddRows(lr)
|
||||
logstorage.PutLogRows(lr)
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "cannot parse loki request: %s", err)
|
||||
httpserver.Errorf(w, r, "cannot parse Loki protobuf request: %s", err)
|
||||
return true
|
||||
}
|
||||
|
||||
rowsIngestedProtobufTotal.Add(n)
|
||||
|
||||
// update lokiRequestProtobufDuration only for successfully parsed requests
|
||||
// There is no need in updating lokiRequestProtobufDuration for request errors,
|
||||
// since their timings are usually much smaller than the timing for successful request parsing.
|
||||
lokiRequestProtobufDuration.UpdateDuration(startTime)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
var (
|
||||
lokiRequestsProtobufTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="protobuf"}`)
|
||||
rowsIngestedProtobufTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="protobuf"}`)
|
||||
lokiRequestProtobufDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/loki/api/v1/push",format="protobuf"}`)
|
||||
)
|
||||
|
||||
func parseProtobufRequest(data []byte, processLogMessage func(timestamp int64, fields []logstorage.Field)) (int, error) {
|
||||
bb := bytesBufPool.Get()
|
||||
defer bytesBufPool.Put(bb)
|
||||
@@ -66,7 +84,7 @@ func parseProtobufRequest(data []byte, processLogMessage func(timestamp int64, f
|
||||
|
||||
err = req.Unmarshal(bb.B)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("cannot parse request body: %s", err)
|
||||
return 0, fmt.Errorf("cannot parse request body: %w", err)
|
||||
}
|
||||
|
||||
var commonFields []logstorage.Field
|
||||
@@ -79,7 +97,7 @@ func parseProtobufRequest(data []byte, processLogMessage func(timestamp int64, f
|
||||
// Labels are same for all entries in the stream.
|
||||
commonFields, err = parsePromLabels(commonFields[:0], stream.Labels)
|
||||
if err != nil {
|
||||
return rowsIngested, fmt.Errorf("cannot parse stream labels %q: %s", stream.Labels, err)
|
||||
return rowsIngested, fmt.Errorf("cannot parse stream labels %q: %w", stream.Labels, err)
|
||||
}
|
||||
fields := commonFields
|
||||
|
||||
|
||||
@@ -6,8 +6,9 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
|
||||
)
|
||||
|
||||
func BenchmarkParseProtobufRequest(b *testing.B) {
|
||||
@@ -30,7 +31,7 @@ func benchmarkParseProtobufRequest(b *testing.B, streams, rows, labels int) {
|
||||
for pb.Next() {
|
||||
_, err := parseProtobufRequest(body, func(timestamp int64, fields []logstorage.Field) {})
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("unexpected error: %s", err))
|
||||
panic(fmt.Errorf("unexpected error: %w", err))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
@@ -88,6 +88,12 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(path, "/vmui/") {
|
||||
if strings.HasPrefix(path, "/vmui/static/") {
|
||||
// Allow clients caching static contents for long period of time, since it shouldn't change over time.
|
||||
// Path to static contents (such as js and css) must be changed whenever its contents is changed.
|
||||
// See https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/
|
||||
w.Header().Set("Cache-Control", "max-age=31536000")
|
||||
}
|
||||
r.URL.Path = path
|
||||
vmuiFileServer.ServeHTTP(w, r)
|
||||
return true
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
{
|
||||
"files": {
|
||||
"main.css": "./static/css/main.5f461a27.css",
|
||||
"main.js": "./static/js/main.7566144a.js",
|
||||
"static/js/522.b5ae4365.chunk.js": "./static/js/522.b5ae4365.chunk.js",
|
||||
"static/media/Lato-Regular.ttf": "./static/media/Lato-Regular.d714fec1633b69a9c2e9.ttf",
|
||||
"static/media/Lato-Bold.ttf": "./static/media/Lato-Bold.32360ba4b57802daa4d6.ttf",
|
||||
"main.css": "./static/css/main.d1313636.css",
|
||||
"main.js": "./static/js/main.1919fefe.js",
|
||||
"static/js/522.da77e7b3.chunk.js": "./static/js/522.da77e7b3.chunk.js",
|
||||
"static/media/MetricsQL.md": "./static/media/MetricsQL.8644fd7c964802dd34a9.md",
|
||||
"index.html": "./index.html"
|
||||
},
|
||||
"entrypoints": [
|
||||
"static/css/main.5f461a27.css",
|
||||
"static/js/main.7566144a.js"
|
||||
"static/css/main.d1313636.css",
|
||||
"static/js/main.1919fefe.js"
|
||||
]
|
||||
}
|
||||
@@ -1 +1 @@
|
||||
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no"/><meta name="theme-color" content="#000000"/><meta name="description" content="UI for VictoriaMetrics"/><link rel="apple-touch-icon" href="./apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="./favicon-32x32.png"><link rel="manifest" href="./manifest.json"/><title>VM UI</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary_large_image"><meta name="twitter:image" content="./preview.jpg"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:site" content="@VictoriaMetrics"><meta property="og:title" content="Metric explorer for VictoriaMetrics"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta property="og:image" content="./preview.jpg"><meta property="og:type" content="website"><script defer="defer" src="./static/js/main.7566144a.js"></script><link href="./static/css/main.5f461a27.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
|
||||
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5"/><meta name="theme-color" content="#000000"/><meta name="description" content="UI for VictoriaMetrics"/><link rel="apple-touch-icon" href="./apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="./favicon-32x32.png"><link rel="manifest" href="./manifest.json"/><title>VM UI</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary_large_image"><meta name="twitter:image" content="./preview.jpg"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:site" content="@VictoriaMetrics"><meta property="og:title" content="Metric explorer for VictoriaMetrics"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta property="og:image" content="./preview.jpg"><meta property="og:type" content="website"><script defer="defer" src="./static/js/main.1919fefe.js"></script><link href="./static/css/main.d1313636.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
|
||||
File diff suppressed because one or more lines are too long
1
app/vlselect/vmui/static/css/main.d1313636.css
Normal file
1
app/vlselect/vmui/static/css/main.d1313636.css
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
2
app/vlselect/vmui/static/js/main.1919fefe.js
Normal file
2
app/vlselect/vmui/static/js/main.1919fefe.js
Normal file
File diff suppressed because one or more lines are too long
@@ -7,7 +7,7 @@
|
||||
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
|
||||
|
||||
/**
|
||||
* @remix-run/router v1.7.2
|
||||
* @remix-run/router v1.10.0
|
||||
*
|
||||
* Copyright (c) Remix Software Inc.
|
||||
*
|
||||
@@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
/**
|
||||
* React Router DOM v6.14.2
|
||||
* React Router DOM v6.17.0
|
||||
*
|
||||
* Copyright (c) Remix Software Inc.
|
||||
*
|
||||
@@ -29,7 +29,7 @@
|
||||
*/
|
||||
|
||||
/**
|
||||
* React Router v6.14.2
|
||||
* React Router v6.17.0
|
||||
*
|
||||
* Copyright (c) Remix Software Inc.
|
||||
*
|
||||
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
2058
app/vlselect/vmui/static/media/MetricsQL.8644fd7c964802dd34a9.md
Normal file
2058
app/vlselect/vmui/static/media/MetricsQL.8644fd7c964802dd34a9.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -3,14 +3,17 @@ package vlstorage
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -29,6 +32,8 @@ var (
|
||||
"see https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields ; see also -logIngestedRows")
|
||||
logIngestedRows = flag.Bool("logIngestedRows", false, "Whether to log all the ingested log entries; this can be useful for debugging of data ingestion; "+
|
||||
"see https://docs.victoriametrics.com/VictoriaLogs/data-ingestion/ ; see also -logNewStreams")
|
||||
minFreeDiskSpaceBytes = flagutil.NewBytes("storage.minFreeDiskSpaceBytes", 10e6, "The minimum free disk space at -storageDataPath after which "+
|
||||
"the storage stops accepting new data")
|
||||
)
|
||||
|
||||
// Init initializes vlstorage.
|
||||
@@ -39,15 +44,16 @@ func Init() {
|
||||
logger.Panicf("BUG: Init() has been already called")
|
||||
}
|
||||
|
||||
if retentionPeriod.Msecs < 24*3600*1000 {
|
||||
if retentionPeriod.Duration() < 24*time.Hour {
|
||||
logger.Fatalf("-retentionPeriod cannot be smaller than a day; got %s", retentionPeriod)
|
||||
}
|
||||
cfg := &logstorage.StorageConfig{
|
||||
Retention: time.Millisecond * time.Duration(retentionPeriod.Msecs),
|
||||
FlushInterval: *inmemoryDataFlushInterval,
|
||||
FutureRetention: time.Millisecond * time.Duration(futureRetention.Msecs),
|
||||
LogNewStreams: *logNewStreams,
|
||||
LogIngestedRows: *logIngestedRows,
|
||||
Retention: retentionPeriod.Duration(),
|
||||
FlushInterval: *inmemoryDataFlushInterval,
|
||||
FutureRetention: futureRetention.Duration(),
|
||||
LogNewStreams: *logNewStreams,
|
||||
LogIngestedRows: *logIngestedRows,
|
||||
MinFreeDiskSpaceBytes: minFreeDiskSpaceBytes.N,
|
||||
}
|
||||
logger.Infof("opening storage at -storageDataPath=%s", *storageDataPath)
|
||||
startTime := time.Now()
|
||||
@@ -74,7 +80,21 @@ func Stop() {
|
||||
var strg *logstorage.Storage
|
||||
var storageMetrics *metrics.Set
|
||||
|
||||
// CanWriteData returns non-nil error if it cannot write data to vlstorage.
|
||||
func CanWriteData() error {
|
||||
if strg.IsReadOnly() {
|
||||
return &httpserver.ErrorWithStatusCode{
|
||||
Err: fmt.Errorf("cannot add rows into storage in read-only mode; the storage can be in read-only mode "+
|
||||
"because of lack of free disk space at -storageDataPath=%s", *storageDataPath),
|
||||
StatusCode: http.StatusTooManyRequests,
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// MustAddRows adds lr to vlstorage
|
||||
//
|
||||
// It is advised to call CanWriteData() before calling MustAddRows()
|
||||
func MustAddRows(lr *logstorage.LogRows) {
|
||||
strg.MustAddRows(lr)
|
||||
}
|
||||
@@ -107,6 +127,12 @@ func initStorageMetrics(strg *logstorage.Storage) *metrics.Set {
|
||||
ms.NewGauge(fmt.Sprintf(`vl_free_disk_space_bytes{path=%q}`, *storageDataPath), func() float64 {
|
||||
return float64(fs.MustGetFreeSpace(*storageDataPath))
|
||||
})
|
||||
ms.NewGauge(fmt.Sprintf(`vl_storage_is_read_only{path=%q}`, *storageDataPath), func() float64 {
|
||||
if m().IsReadOnly {
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
})
|
||||
|
||||
ms.NewGauge(`vl_active_merges{type="inmemory"}`, func() float64 {
|
||||
return float64(m().InmemoryActiveMerges)
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
`vmagent` is a tiny agent which helps you collect metrics from various sources,
|
||||
[relabel and filter the collected metrics](#relabeling)
|
||||
and store them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other storage systems via Prometheus `remote_write` protocol.
|
||||
or any other storage systems via Prometheus `remote_write` protocol
|
||||
or via [VictoriaMetrics `remote_write` protocol](#victoriametrics-remote-write-protocol).
|
||||
|
||||
See [Quick Start](#quick-start) for details.
|
||||
|
||||
@@ -45,7 +46,7 @@ additionally to [discovering Prometheus-compatible targets and scraping metrics
|
||||
|
||||
## Quick Start
|
||||
|
||||
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) (
|
||||
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) (
|
||||
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags)),
|
||||
unpack it and pass the following flags to the `vmagent` binary in order to start scraping Prometheus-compatible targets
|
||||
and sending the data to the Prometheus-compatible remote storage:
|
||||
@@ -59,7 +60,7 @@ and sending the data to the Prometheus-compatible remote storage:
|
||||
Example command for writing the data received via [supported push-based protocols](#how-to-push-data-to-vmagent)
|
||||
to [single-node VictoriaMetrics](https://docs.victoriametrics.com/) located at `victoria-metrics-host:8428`:
|
||||
|
||||
```console
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
```
|
||||
|
||||
@@ -68,7 +69,7 @@ the data to [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-V
|
||||
|
||||
Example command for scraping Prometheus targets and writing the data to single-node VictoriaMetrics:
|
||||
|
||||
```console
|
||||
```bash
|
||||
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
```
|
||||
|
||||
@@ -94,6 +95,7 @@ additionally to pull-based Prometheus-compatible targets' scraping:
|
||||
* InfluxDB line protocol via `http://<vmagent>:8429/write`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
|
||||
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
|
||||
* OpenTelemetry http API. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#sending-data-via-opentelemetry).
|
||||
* NewRelic API. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-newrelic-agent).
|
||||
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-opentsdb-compatible-agents).
|
||||
* Prometheus remote write protocol via `http://<vmagent>:8429/api/v1/write`.
|
||||
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-json-line-format).
|
||||
@@ -109,7 +111,7 @@ additionally to pull-based Prometheus-compatible targets' scraping:
|
||||
|
||||
* Sending `SIGHUP` signal to `vmagent` process:
|
||||
|
||||
```console
|
||||
```bash
|
||||
kill -SIGHUP `pidof vmagent`
|
||||
```
|
||||
|
||||
@@ -172,6 +174,13 @@ by routing outgoing samples for the same time series of [counter](https://docs.v
|
||||
and [histogram](https://docs.victoriametrics.com/keyConcepts.html#histogram) types from top-level `vmagent` instances
|
||||
to the same second-level `vmagent` instance, so they are aggregated properly.
|
||||
|
||||
If `-remoteWrite.shardByURL` command-line flag is set, then all the metric labels are used for even sharding
|
||||
among remote storage systems specified in `-remoteWrite.url`. Sometimes it may be needed to use only a particular
|
||||
set of labels for sharding. For example, it may be needed to route all the metrics with the same `instance` label
|
||||
to the same `-remoteWrite.url`. In this case you can specify comma-separated list of these labels in the `-remoteWrite.shardByURL.labels`
|
||||
command-line flag. For example, `-remoteWrite.shardByURL.labels=instance,__name__` would shard metrics with the same name and `instance`
|
||||
label to the same `-remoteWrite.url`.
|
||||
|
||||
See also [how to scrape big number of targets](#scraping-big-number-of-targets).
|
||||
|
||||
### Relabeling and filtering
|
||||
@@ -317,7 +326,7 @@ in the `scrape_config_files` section of `-promscrape.config` file. For example,
|
||||
loading scrape configs from all the `*.yml` files under `configs` directory, from `single_scrape_config.yml` local file
|
||||
and from `https://config-server/scrape_config.yml` url:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
scrape_config_files:
|
||||
- configs/*.yml
|
||||
- single_scrape_config.yml
|
||||
@@ -327,7 +336,7 @@ scrape_config_files:
|
||||
Every referred file can contain arbitrary number of [supported scrape configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs).
|
||||
There is no need in specifying top-level `scrape_configs` section in these files. For example:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
- job_name: foo
|
||||
static_configs:
|
||||
- targets: ["vmagent:8429"]
|
||||
@@ -367,7 +376,7 @@ Extra labels can be added to metrics collected by `vmagent` via the following me
|
||||
For example, the following command starts `vmagent`, which adds `{datacenter="foobar"}` label to all the metrics pushed
|
||||
to all the configured remote storage systems (all the `-remoteWrite.url` flag values):
|
||||
|
||||
```
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.label=datacenter=foobar ...
|
||||
```
|
||||
|
||||
@@ -495,13 +504,16 @@ with [additional enhancements](#relabeling-enhancements). The relabeling can be
|
||||
This relabeling can be debugged via `http://vmagent:8429/metric-relabel-debug` page. See [these docs](#relabel-debug) for details.
|
||||
|
||||
* At the `-remoteWrite.urlRelabelConfig` files. This relabeling is used for modifying labels for metrics
|
||||
and for dropping unneeded metrics before sending them to a particular `-remoteWrite.url`.
|
||||
and for dropping unneeded metrics before sending them to the particular `-remoteWrite.url`.
|
||||
|
||||
This relabeling can be debugged via `http://vmagent:8429/metric-relabel-debug` page. See [these docs](#relabel-debug) for details.
|
||||
|
||||
All the files with relabeling configs can contain special placeholders in the form `%{ENV_VAR}`,
|
||||
which are replaced by the corresponding environment variable values.
|
||||
|
||||
[Streaming aggregation](https://docs.victoriametrics.com/stream-aggregation.html), if configured,
|
||||
is pefrormed after applying all the relabeling stages mentioned above.
|
||||
|
||||
The following articles contain useful information about Prometheus relabeling:
|
||||
|
||||
* [Cookbook for common relabeling tasks](https://docs.victoriametrics.com/relabeling.html)
|
||||
@@ -732,7 +744,7 @@ stream parsing mode can be explicitly enabled in the following places:
|
||||
|
||||
Examples:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'big-federate'
|
||||
stream_parse: true
|
||||
@@ -754,24 +766,24 @@ as soon as it is parsed in stream parsing mode.
|
||||
|
||||
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
|
||||
In this case scrape targets can be split among multiple `vmagent` instances (aka `vmagent` horizontal scaling, sharding and clustering).
|
||||
Each `vmagent` instance in the cluster must use identical `-promscrape.config` files with distinct `-promscrape.cluster.memberNum` values.
|
||||
The flag value must be in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster.
|
||||
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag. For example, the following commands
|
||||
spread scrape targets among a cluster of two `vmagent` instances:
|
||||
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag.
|
||||
Each `vmagent` instance in the cluster must use identical `-promscrape.config` files with distinct `-promscrape.cluster.memberNum` values
|
||||
in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster specified via `-promscrape.cluster.membersCount`.
|
||||
For example, the following commands spread scrape targets among a cluster of two `vmagent` instances:
|
||||
|
||||
```
|
||||
```text
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
|
||||
```
|
||||
|
||||
The `-promscrape.cluster.memberNum` can be set to a StatefulSet pod name when `vmagent` runs in Kubernetes.
|
||||
The pod name must end with a number in the range `0 ... promscrape.cluster.memberNum-1`. For example, `-promscrape.cluster.memberNum=vmagent-0`.
|
||||
The pod name must end with a number in the range `0 ... promscrape.cluster.membersCount-1`. For example, `-promscrape.cluster.memberNum=vmagent-0`.
|
||||
|
||||
By default, each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances,
|
||||
then `-promscrape.cluster.replicationFactor` command-line flag must be set to the desired number of replicas. For example, the following commands
|
||||
start a cluster of three `vmagent` instances, where each target is scraped by two `vmagent` instances:
|
||||
|
||||
```
|
||||
```text
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=2 -promscrape.config=/path/to/config.yml ...
|
||||
@@ -781,6 +793,14 @@ If each target is scraped by multiple `vmagent` instances, then data deduplicati
|
||||
The `-dedup.minScrapeInterval` must be set to the `scrape_interval` configured at `-promscrape.config`.
|
||||
See [these docs](https://docs.victoriametrics.com/#deduplication) for details.
|
||||
|
||||
The `-promscrape.cluster.memberLabel` command-line flag allows specifying a name for `member num` label to add to all the scraped metrics.
|
||||
The value of the `member num` label is set to `-promscrape.cluster.memberNum`. For example, the following config instructs adding `vmagent_instance="0"` label
|
||||
to all the metrics scraped by the given `vmagent` instance:
|
||||
|
||||
```text
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0 -promscrape.cluster.memberLabel=vmagent_instance
|
||||
```
|
||||
|
||||
See also [how to shard data among multiple remote storage systems](#sharding-among-remote-storages).
|
||||
|
||||
|
||||
@@ -804,7 +824,7 @@ See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679)
|
||||
`vmagent` supports scraping targets via http, https and socks5 proxies. Proxy address must be specified in `proxy_url` option. For example, the following scrape config instructs
|
||||
target scraping via https proxy at `https://proxy-addr:1234`:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: foo
|
||||
proxy_url: https://proxy-addr:1234
|
||||
@@ -821,7 +841,7 @@ Proxy can be configured with the following optional settings:
|
||||
|
||||
For example:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: foo
|
||||
proxy_url: https://proxy-addr:1234
|
||||
@@ -971,7 +991,7 @@ If you have suggestions for improvements or have found a bug - please open an is
|
||||
* By default `vmagent` evenly spreads scrape load in time. If a particular scrape target must be scraped at the beginning of some interval,
|
||||
then `scrape_align_interval` option must be used. For example, the following config aligns hourly scrapes to the beginning of hour:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: foo
|
||||
scrape_interval: 1h
|
||||
@@ -981,7 +1001,7 @@ If you have suggestions for improvements or have found a bug - please open an is
|
||||
* By default `vmagent` evenly spreads scrape load in time. If a particular scrape target must be scraped at specific offset, then `scrape_offset` option must be used.
|
||||
For example, the following config instructs `vmagent` to scrape the target at 10 seconds of every minute:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: foo
|
||||
scrape_interval: 1m
|
||||
@@ -994,14 +1014,14 @@ If you have suggestions for improvements or have found a bug - please open an is
|
||||
|
||||
The following relabeling rule may be added to `relabel_configs` section in order to filter out pods with unneeded ports:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
- action: keep_if_equal
|
||||
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
|
||||
```
|
||||
|
||||
The following relabeling rule may be added to `relabel_configs` section in order to filter out init container pods:
|
||||
|
||||
```yml
|
||||
```yaml
|
||||
- action: drop
|
||||
source_labels: [__meta_kubernetes_pod_container_init]
|
||||
regex: true
|
||||
@@ -1016,8 +1036,9 @@ See also [troubleshooting docs](https://docs.victoriametrics.com/Troubleshooting
|
||||
* [Reading metrics from Kafka](#reading-metrics-from-kafka)
|
||||
* [Writing metrics to Kafka](#writing-metrics-to-kafka)
|
||||
|
||||
The enterprise version of vmagent is available for evaluation at [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page
|
||||
The enterprise version of vmagent is available for evaluation at [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) page
|
||||
in `vmutils-...-enterprise.tar.gz` archives and in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
|
||||
See how to request a free trial license [here](https://victoriametrics.com/products/enterprise/trial/).
|
||||
|
||||
### Reading metrics from Kafka
|
||||
|
||||
@@ -1044,7 +1065,7 @@ For example, `-kafka.consumer.topic.brokers=host1:9092;host2:9092`.
|
||||
The following command starts `vmagent`, which reads metrics in InfluxDB line protocol format from Kafka broker at `localhost:9092`
|
||||
from the topic `metrics-by-telegraf` and sends them to remote storage at `http://localhost:8428/api/v1/write`:
|
||||
|
||||
```console
|
||||
```bash
|
||||
./bin/vmagent -remoteWrite.url=http://localhost:8428/api/v1/write \
|
||||
-kafka.consumer.topic.brokers=localhost:9092 \
|
||||
-kafka.consumer.topic.format=influx \
|
||||
@@ -1064,10 +1085,10 @@ data_format = "influx"
|
||||
#### Command-line flags for Kafka consumer
|
||||
|
||||
These command-line flags are available only in [enterprise](https://docs.victoriametrics.com/enterprise.html) version of `vmagent`,
|
||||
which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) page
|
||||
which can be downloaded for evaluation from [releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) page
|
||||
(see `vmutils-...-enterprise.tar.gz` archives) and from [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags) with tags containing `enterprise` suffix.
|
||||
|
||||
```
|
||||
```text
|
||||
-kafka.consumer.topic array
|
||||
Kafka topic names for data consumption.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
@@ -1112,25 +1133,25 @@ Two types of auth are supported:
|
||||
|
||||
* sasl with username and password:
|
||||
|
||||
```console
|
||||
```bash
|
||||
./bin/vmagent -remoteWrite.url=kafka://localhost:9092/?topic=prom-rw&security.protocol=SASL_SSL&sasl.mechanisms=PLAIN -remoteWrite.basicAuth.username=user -remoteWrite.basicAuth.password=password
|
||||
```
|
||||
|
||||
* tls certificates:
|
||||
|
||||
```console
|
||||
```bash
|
||||
./bin/vmagent -remoteWrite.url=kafka://localhost:9092/?topic=prom-rw&security.protocol=SSL -remoteWrite.tlsCAFile=/opt/ca.pem -remoteWrite.tlsCertFile=/opt/cert.pem -remoteWrite.tlsKeyFile=/opt/key.pem
|
||||
```
|
||||
|
||||
## How to build from sources
|
||||
|
||||
We recommend using [official binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in the `vmutils-...` archives.
|
||||
We recommend using [official binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) - `vmagent` is located in the `vmutils-...` archives.
|
||||
|
||||
It may be needed to build `vmagent` from source code when developing or testing new feature or bugfix.
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
|
||||
1. Run `make vmagent` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds the `vmagent` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -1149,7 +1170,7 @@ The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
|
||||
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
|
||||
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
|
||||
|
||||
```console
|
||||
```bash
|
||||
ROOT_IMAGE=scratch make package-vmagent
|
||||
```
|
||||
|
||||
@@ -1159,7 +1180,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
|
||||
1. Run `make vmagent-linux-arm` or `make vmagent-linux-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
It builds `vmagent-linux-arm` or `vmagent-linux-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
@@ -1177,7 +1198,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```console
|
||||
```bash
|
||||
curl http://0.0.0.0:8429/debug/pprof/heap > mem.pprof
|
||||
```
|
||||
|
||||
@@ -1187,7 +1208,7 @@ curl http://0.0.0.0:8429/debug/pprof/heap > mem.pprof
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```console
|
||||
```bash
|
||||
curl http://0.0.0.0:8429/debug/pprof/profile > cpu.pprof
|
||||
```
|
||||
|
||||
@@ -1203,7 +1224,7 @@ It is safe sharing the collected profiles from security point of view, since the
|
||||
|
||||
`vmagent` can be fine-tuned with various command-line flags. Run `./vmagent -help` in order to see the full list of these flags with their descriptions and default values:
|
||||
|
||||
```
|
||||
```text
|
||||
./vmagent -help
|
||||
|
||||
vmagent collects metrics data via popular data ingestion protocols and routes them to VictoriaMetrics.
|
||||
@@ -1212,10 +1233,6 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
|
||||
-cacheExpireDuration duration
|
||||
Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s)
|
||||
-clients.docker
|
||||
Decides whether a docker container be brought up automatically
|
||||
-clients.semaphore
|
||||
Tells if the job is running on Semaphore
|
||||
-configAuthKey string
|
||||
Authorization key for accessing /config page. It must be passed via authKey query arg
|
||||
-csvTrimTimestamp duration
|
||||
@@ -1236,7 +1253,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-filestream.disableFadvise
|
||||
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
|
||||
-flagsAuthKey string
|
||||
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-fs.disableMmap
|
||||
@@ -1251,6 +1270,12 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
|
||||
-http.header.csp string
|
||||
Value for 'Content-Security-Policy' header
|
||||
-http.header.frameOptions string
|
||||
Value for 'X-Frame-Options' header
|
||||
-http.header.hsts string
|
||||
Value for 'Strict-Transport-Security' header
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
@@ -1299,34 +1324,40 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
-internStringMaxLen int
|
||||
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
|
||||
-kafka.consumer.topic array
|
||||
Kafka topic names for data consumption. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Kafka topic names for data consumption. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.basicAuth.password array
|
||||
Optional basic auth password for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Optional basic auth password for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.basicAuth.username array
|
||||
Optional basic auth username for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Optional basic auth username for -kafka.consumer.topic. Must be used in conjunction with any supported auth methods for kafka client, specified by flag -kafka.consumer.topic.options='security.protocol=SASL_SSL;sasl.mechanisms=PLAIN' . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.brokers array
|
||||
List of brokers to connect for given topic, e.g. -kafka.consumer.topic.broker=host-1:9092;host-2:9092 . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
List of brokers to connect for given topic, e.g. -kafka.consumer.topic.broker=host-1:9092;host-2:9092 . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.concurrency array
|
||||
Configures consumer concurrency for topic specified via -kafka.consumer.topic flag.This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Configures consumer concurrency for topic specified via -kafka.consumer.topic flag.This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default 1)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.defaultFormat string
|
||||
Expected data format in the topic if -kafka.consumer.topic.format is skipped. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default "promremotewrite")
|
||||
Expected data format in the topic if -kafka.consumer.topic.format is skipped. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default "promremotewrite")
|
||||
-kafka.consumer.topic.format array
|
||||
data format for corresponding kafka topic. Valid formats: influx, prometheus, promremotewrite, graphite, jsonline . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
data format for corresponding kafka topic. Valid formats: influx, prometheus, promremotewrite, graphite, jsonline . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.groupID array
|
||||
Defines group.id for topic. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Defines group.id for topic. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.isGzipped array
|
||||
Enables gzip setting for topic messages payload. Only prometheus, jsonline and influx formats accept gzipped messages.This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Enables gzip setting for topic messages payload. Only prometheus, jsonline and influx formats accept gzipped messages.This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-kafka.consumer.topic.options array
|
||||
Optional key=value;key1=value2 settings for topic consumer. See full configuration options at https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Optional key=value;key1=value2 settings for topic consumer. See full configuration options at https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-license string
|
||||
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
|
||||
-license.forceOffline
|
||||
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
|
||||
-licenseFile string
|
||||
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
@@ -1344,7 +1375,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxConcurrentInserts int
|
||||
The maximum number of concurrent insert requests. The default value should work for most cases, since it minimizes memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
|
||||
The maximum number of concurrent insert requests. Default value should work for most cases, since it minimizes the memory usage. The default value can be increased when clients send data over slow networks. See also -insert.maxQueueDuration (default 8)
|
||||
-maxInsertRequestSize size
|
||||
The maximum size in bytes of a single Prometheus remote_write API request
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432)
|
||||
@@ -1355,6 +1386,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from the OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-newrelic.maxInsertRequestSize size
|
||||
The maximum size in bytes of a single NewRelic request to /newrelic/infra/v2/metrics/events/bulk
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 67108864)
|
||||
-opentsdbHTTPListenAddr string
|
||||
TCP address to listen for OpenTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty. See also -opentsdbHTTPListenAddr.useProxyProtocol
|
||||
-opentsdbHTTPListenAddr.useProxyProtocol
|
||||
@@ -1376,14 +1410,16 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. See also -cacheExpireDuration (default 0.1)
|
||||
-promscrape.azureSDCheckInterval duration
|
||||
Interval for checking for changes in Azure. This works only if azure_sd_configs is configured in '-promscrape.config' file. See https://docs.victoriametrics.com/sd_configs.html#azure_sd_configs for details (default 1m0s)
|
||||
-promscrape.cluster.memberLabel string
|
||||
If non-empty, then the label with this name and the -promscrape.cluster.memberNum value is added to all the scraped metrics. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
|
||||
-promscrape.cluster.memberNum string
|
||||
The number of number in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name (default "0")
|
||||
The number of vmagent instance in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster. Can be specified as pod name of Kubernetes StatefulSet - pod-name-Num, where Num is a numeric part of pod name. See also -promscrape.cluster.memberLabel . See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default "0")
|
||||
-promscrape.cluster.membersCount int
|
||||
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets
|
||||
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default, cluster scraping is disabled, i.e. a single scraper scrapes all the targets. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
|
||||
-promscrape.cluster.name string
|
||||
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
|
||||
Optional name of the cluster. If multiple vmagent clusters scrape the same targets, then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info
|
||||
-promscrape.cluster.replicationFactor int
|
||||
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
|
||||
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets for more info (default 1)
|
||||
-promscrape.config string
|
||||
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. The path can point to local file and to http url. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
|
||||
-promscrape.config.dryRun
|
||||
@@ -1391,7 +1427,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
-promscrape.config.strictParse
|
||||
Whether to deny unsupported fields in -promscrape.config . Set to false in order to silently skip unsupported fields (default true)
|
||||
-promscrape.configCheckInterval duration
|
||||
Interval for checking for changes in '-promscrape.config' file. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes
|
||||
Interval for checking for changes in -promscrape.config file. By default, the checking is disabled. See how to reload -promscrape.config file at https://docs.victoriametrics.com/vmagent.html#configuration-update
|
||||
-promscrape.consul.waitTime duration
|
||||
Wait time used by Consul service discovery. Default value is used if not set
|
||||
-promscrape.consulSDCheckInterval duration
|
||||
@@ -1533,7 +1569,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
|
||||
-remoteWrite.maxDiskUsagePerURL array
|
||||
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. Disk usage is unlimited if the value is set to 0
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB.
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB. (default 0)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.maxHourlySeries int
|
||||
The maximum number of unique series vmagent can send to remote storage systems during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/vmagent.html#cardinality-limiter
|
||||
@@ -1563,28 +1599,31 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
-remoteWrite.queues int
|
||||
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs (default 8)
|
||||
-remoteWrite.rateLimit array
|
||||
Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage
|
||||
Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage (default 0)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.relabelConfig string
|
||||
Optional path to file with relabeling configs, which are applied to all the metrics before sending them to -remoteWrite.url. See also -remoteWrite.urlRelabelConfig. The path can point either to local file or to http url. See https://docs.victoriametrics.com/vmagent.html#relabeling
|
||||
-remoteWrite.roundDigits array
|
||||
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
|
||||
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics (default 100)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.sendTimeout array
|
||||
Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m)
|
||||
Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m0s)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.shardByURL
|
||||
Whether to shard outgoing series across all the remote storage systems enumerated via -remoteWrite.url . By default the data is replicated across all the -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages
|
||||
-remoteWrite.shardByURL.labels array
|
||||
Optional list of labels, which must be used for sharding outgoing samples among remote storage systems if -remoteWrite.shardByURL command-line flag is set. By default all the labels are used for sharding in order to gain even distribution of series over the specified -remoteWrite.url systems
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.showURL
|
||||
Whether to show -remoteWrite.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
|
||||
-remoteWrite.significantFigures array
|
||||
The number of significant figures to leave in metric values before writing them to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits
|
||||
The number of significant figures to leave in metric values before writing them to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits (default 0)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.streamAggr.config array
|
||||
Optional path to file with stream aggregation config. See https://docs.victoriametrics.com/stream-aggregation.html . See also -remoteWrite.streamAggr.keepInput, -remoteWrite.streamAggr.dropInput and -remoteWrite.streamAggr.dedupInterval
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.streamAggr.dedupInterval array
|
||||
Input samples are de-duplicated with this interval before being aggregated. Only the last sample per each time series per each interval is aggregated if the interval is greater than zero
|
||||
Input samples are de-duplicated with this interval before being aggregated. Only the last sample per each time series per each interval is aggregated if the interval is greater than zero (default 0s)
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.streamAggr.dropInput array
|
||||
Whether to drop all the input samples after the aggregation with -remoteWrite.streamAggr.config. By default, only aggregates samples are dropped, while the remaining samples are written to the corresponding -remoteWrite.url . See also -remoteWrite.streamAggr.keepInput and https://docs.victoriametrics.com/stream-aggregation.html
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
|
||||
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/datadog/stream"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -29,12 +29,12 @@ func InsertHandlerForHTTP(at *auth.Token, req *http.Request) error {
|
||||
return err
|
||||
}
|
||||
ce := req.Header.Get("Content-Encoding")
|
||||
return stream.Parse(req.Body, ce, func(series []parser.Series) error {
|
||||
return stream.Parse(req.Body, ce, func(series []datadog.Series) error {
|
||||
return insertRows(at, series, extraLabels)
|
||||
})
|
||||
}
|
||||
|
||||
func insertRows(at *auth.Token, series []parser.Series, extraLabels []prompbmarshal.Label) error {
|
||||
func insertRows(at *auth.Token, series []datadog.Series, extraLabels []prompbmarshal.Label) error {
|
||||
ctx := common.GetPushCtx()
|
||||
defer common.PutPushCtx(ctx)
|
||||
|
||||
@@ -63,7 +63,7 @@ func insertRows(at *auth.Token, series []parser.Series, extraLabels []prompbmars
|
||||
})
|
||||
}
|
||||
for _, tag := range ss.Tags {
|
||||
name, value := parser.SplitTag(tag)
|
||||
name, value := datadog.SplitTag(tag)
|
||||
if name == "host" {
|
||||
name = "exported_host"
|
||||
}
|
||||
|
||||
@@ -11,11 +11,14 @@ import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/csvimport"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/datadog"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/graphite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/influx"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/native"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/newrelic"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentelemetry"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdbhttp"
|
||||
@@ -39,7 +42,6 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -208,7 +210,7 @@ func getAuthTokenFromPath(path string) (*auth.Token, error) {
|
||||
if p.Suffix != "opentsdb/api/put" {
|
||||
return nil, fmt.Errorf("unsupported path requested: %q; expecting 'opentsdb/api/put'", p.Suffix)
|
||||
}
|
||||
return auth.NewToken(p.AuthToken)
|
||||
return auth.NewTokenPossibleMultitenant(p.AuthToken)
|
||||
}
|
||||
|
||||
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
@@ -251,7 +253,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
w.WriteHeader(statusCode)
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(path, "datadog/") {
|
||||
if strings.HasPrefix(path, "/datadog/") {
|
||||
// Trim suffix from paths starting from /datadog/ in order to support legacy DataDog agent.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2670
|
||||
path = strings.TrimSuffix(path, "/")
|
||||
@@ -318,6 +320,29 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return true
|
||||
case "/newrelic":
|
||||
newrelicCheckRequest.Inc()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(202)
|
||||
fmt.Fprintf(w, `{"status":"ok"}`)
|
||||
return true
|
||||
case "/newrelic/inventory/deltas":
|
||||
newrelicInventoryRequests.Inc()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(202)
|
||||
fmt.Fprintf(w, `{"payload":{"version": 1, "state": {}, "reset": "false"}}`)
|
||||
return true
|
||||
case "/newrelic/infra/v2/metrics/events/bulk":
|
||||
newrelicWriteRequests.Inc()
|
||||
if err := newrelic.InsertHandlerForHTTP(nil, r); err != nil {
|
||||
newrelicWriteErrors.Inc()
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(202)
|
||||
fmt.Fprintf(w, `{"status":"ok"}`)
|
||||
return true
|
||||
case "/datadog/api/v1/series":
|
||||
datadogWriteRequests.Inc()
|
||||
if err := datadog.InsertHandlerForHTTP(nil, r); err != nil {
|
||||
@@ -518,6 +543,29 @@ func processMultitenantRequest(w http.ResponseWriter, r *http.Request, path stri
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return true
|
||||
case "newrelic":
|
||||
newrelicCheckRequest.Inc()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(202)
|
||||
fmt.Fprintf(w, `{"status":"ok"}`)
|
||||
return true
|
||||
case "newrelic/inventory/deltas":
|
||||
newrelicInventoryRequests.Inc()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(202)
|
||||
fmt.Fprintf(w, `{"payload":{"version": 1, "state": {}, "reset": "false"}}`)
|
||||
return true
|
||||
case "newrelic/infra/v2/metrics/events/bulk":
|
||||
newrelicWriteRequests.Inc()
|
||||
if err := newrelic.InsertHandlerForHTTP(at, r); err != nil {
|
||||
newrelicWriteErrors.Inc()
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(202)
|
||||
fmt.Fprintf(w, `{"status":"ok"}`)
|
||||
return true
|
||||
case "datadog/api/v1/series":
|
||||
datadogWriteRequests.Inc()
|
||||
if err := datadog.InsertHandlerForHTTP(at, r); err != nil {
|
||||
@@ -590,6 +638,12 @@ var (
|
||||
opentelemetryPushRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/opentelemetry/api/v1/push", protocol="opentelemetry"}`)
|
||||
opentelemetryPushErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/opentelemetry/api/v1/push", protocol="opentelemetry"}`)
|
||||
|
||||
newrelicWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
|
||||
newrelicWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/newrelic/infra/v2/metrics/events/bulk", protocol="newrelic"}`)
|
||||
|
||||
newrelicInventoryRequests = metrics.NewCounter(`vm_http_requests_total{path="/newrelic/inventory/deltas", protocol="newrelic"}`)
|
||||
newrelicCheckRequest = metrics.NewCounter(`vm_http_requests_total{path="/newrelic", protocol="newrelic"}`)
|
||||
|
||||
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
|
||||
promscrapeServiceDiscoveryRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/service-discovery"}`)
|
||||
|
||||
|
||||
86
app/vmagent/newrelic/request_handler.go
Normal file
86
app/vmagent/newrelic/request_handler.go
Normal file
@@ -0,0 +1,86 @@
|
||||
package newrelic
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/newrelic"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/newrelic/stream"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
|
||||
)
|
||||
|
||||
var (
|
||||
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="newrelic"}`)
|
||||
rowsTenantInserted = tenantmetrics.NewCounterMap(`vmagent_tenant_inserted_rows_total{type="newrelic"}`)
|
||||
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="newrelic"}`)
|
||||
)
|
||||
|
||||
// InsertHandlerForHTTP processes remote write for NewRelic POST /infra/v2/metrics/events/bulk request.
|
||||
func InsertHandlerForHTTP(at *auth.Token, req *http.Request) error {
|
||||
extraLabels, err := parserCommon.GetExtraLabels(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
ce := req.Header.Get("Content-Encoding")
|
||||
isGzip := ce == "gzip"
|
||||
return stream.Parse(req.Body, isGzip, func(rows []newrelic.Row) error {
|
||||
return insertRows(at, rows, extraLabels)
|
||||
})
|
||||
}
|
||||
|
||||
func insertRows(at *auth.Token, rows []newrelic.Row, extraLabels []prompbmarshal.Label) error {
|
||||
ctx := common.GetPushCtx()
|
||||
defer common.PutPushCtx(ctx)
|
||||
|
||||
samplesCount := 0
|
||||
tssDst := ctx.WriteRequest.Timeseries[:0]
|
||||
labels := ctx.Labels[:0]
|
||||
samples := ctx.Samples[:0]
|
||||
for i := range rows {
|
||||
r := &rows[i]
|
||||
tags := r.Tags
|
||||
srcSamples := r.Samples
|
||||
for j := range srcSamples {
|
||||
s := &srcSamples[j]
|
||||
labelsLen := len(labels)
|
||||
labels = append(labels, prompbmarshal.Label{
|
||||
Name: "__name__",
|
||||
Value: bytesutil.ToUnsafeString(s.Name),
|
||||
})
|
||||
for k := range tags {
|
||||
t := &tags[k]
|
||||
labels = append(labels, prompbmarshal.Label{
|
||||
Name: bytesutil.ToUnsafeString(t.Key),
|
||||
Value: bytesutil.ToUnsafeString(t.Value),
|
||||
})
|
||||
}
|
||||
samples = append(samples, prompbmarshal.Sample{
|
||||
Value: s.Value,
|
||||
Timestamp: r.Timestamp,
|
||||
})
|
||||
tssDst = append(tssDst, prompbmarshal.TimeSeries{
|
||||
Labels: labels[labelsLen:],
|
||||
Samples: samples[len(samples)-1:],
|
||||
})
|
||||
labels = append(labels, extraLabels...)
|
||||
}
|
||||
samplesCount += len(srcSamples)
|
||||
}
|
||||
ctx.WriteRequest.Timeseries = tssDst
|
||||
ctx.Labels = labels
|
||||
ctx.Samples = samples
|
||||
remotewrite.Push(at, &ctx.WriteRequest)
|
||||
rowsInserted.Add(len(rows))
|
||||
if at != nil {
|
||||
rowsTenantInserted.Get(at).Add(samplesCount)
|
||||
}
|
||||
rowsPerInsert.Update(float64(samplesCount))
|
||||
return nil
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
package opentelemetry
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
|
||||
@@ -26,6 +27,9 @@ func InsertHandler(at *auth.Token, req *http.Request) error {
|
||||
return err
|
||||
}
|
||||
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
|
||||
if req.Header.Get("Content-Type") == "application/json" {
|
||||
return fmt.Errorf("json encoding isn't supported for opentelemetry format. Use protobuf encoding")
|
||||
}
|
||||
return stream.ParseStream(req.Body, isGzipped, func(tss []prompbmarshal.TimeSeries) error {
|
||||
return insertRows(at, tss, extraLabels)
|
||||
})
|
||||
|
||||
@@ -32,7 +32,7 @@ func InsertHandler(at *auth.Token, req *http.Request) error {
|
||||
return err
|
||||
}
|
||||
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
|
||||
return stream.Parse(req.Body, defaultTimestamp, isGzipped, func(rows []parser.Row) error {
|
||||
return stream.Parse(req.Body, defaultTimestamp, isGzipped, true, func(rows []parser.Row) error {
|
||||
return insertRows(at, rows, extraLabels)
|
||||
}, func(s string) {
|
||||
httpserver.LogError(req, s)
|
||||
|
||||
@@ -2,6 +2,7 @@ package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
@@ -26,10 +27,10 @@ var (
|
||||
forceVMProto = flagutil.NewArrayBool("remoteWrite.forceVMProto", "Whether to force VictoriaMetrics remote write protocol for sending data "+
|
||||
"to the corresponding -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#victoriametrics-remote-write-protocol")
|
||||
|
||||
rateLimit = flagutil.NewArrayInt("remoteWrite.rateLimit", "Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. "+
|
||||
rateLimit = flagutil.NewArrayInt("remoteWrite.rateLimit", 0, "Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. "+
|
||||
"By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data "+
|
||||
"is sent after temporary unavailability of the remote storage")
|
||||
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", "Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m)")
|
||||
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", time.Minute, "Timeout for sending a single block of data to the corresponding -remoteWrite.url")
|
||||
proxyURL = flagutil.NewArrayString("remoteWrite.proxyURL", "Optional proxy URL for writing data to the corresponding -remoteWrite.url. "+
|
||||
"Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234")
|
||||
|
||||
@@ -105,12 +106,15 @@ type client struct {
|
||||
func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqueue.FastQueue, concurrency int) *client {
|
||||
authCfg, err := getAuthConfig(argIdx)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot initialize auth config for remoteWrite.url=%q: %s", remoteWriteURL, err)
|
||||
logger.Fatalf("cannot initialize auth config for -remoteWrite.url=%q: %s", remoteWriteURL, err)
|
||||
}
|
||||
tlsCfg, err := authCfg.NewTLSConfig()
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot initialize tls config for -remoteWrite.url=%q: %s", remoteWriteURL, err)
|
||||
}
|
||||
tlsCfg := authCfg.NewTLSConfig()
|
||||
awsCfg, err := getAWSAPIConfig(argIdx)
|
||||
if err != nil {
|
||||
logger.Fatalf("FATAL: cannot initialize AWS Config for remoteWrite.url=%q: %s", remoteWriteURL, err)
|
||||
logger.Fatalf("cannot initialize AWS Config for -remoteWrite.url=%q: %s", remoteWriteURL, err)
|
||||
}
|
||||
tr := &http.Transport{
|
||||
DialContext: statDial,
|
||||
@@ -134,7 +138,7 @@ func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persiste
|
||||
}
|
||||
hc := &http.Client{
|
||||
Transport: tr,
|
||||
Timeout: sendTimeout.GetOptionalArgOrDefault(argIdx, time.Minute),
|
||||
Timeout: sendTimeout.GetOptionalArg(argIdx),
|
||||
}
|
||||
c := &client{
|
||||
sanitizedURL: sanitizedURL,
|
||||
@@ -169,7 +173,7 @@ func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persiste
|
||||
}
|
||||
|
||||
func (c *client) init(argIdx, concurrency int, sanitizedURL string) {
|
||||
if bytesPerSec := rateLimit.GetOptionalArgOrDefault(argIdx, 0); bytesPerSec > 0 {
|
||||
if bytesPerSec := rateLimit.GetOptionalArg(argIdx); bytesPerSec > 0 {
|
||||
logger.Infof("applying %d bytes per second rate limit for -remoteWrite.url=%q", bytesPerSec, sanitizedURL)
|
||||
c.rl.perSecondLimit = int64(bytesPerSec)
|
||||
}
|
||||
@@ -178,7 +182,7 @@ func (c *client) init(argIdx, concurrency int, sanitizedURL string) {
|
||||
c.bytesSent = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_bytes_sent_total{url=%q}`, c.sanitizedURL))
|
||||
c.blocksSent = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_blocks_sent_total{url=%q}`, c.sanitizedURL))
|
||||
c.rateLimit = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_rate_limit{url=%q}`, c.sanitizedURL), func() float64 {
|
||||
return float64(rateLimit.GetOptionalArgOrDefault(argIdx, 0))
|
||||
return float64(rateLimit.GetOptionalArg(argIdx))
|
||||
})
|
||||
c.requestDuration = metrics.GetOrCreateHistogram(fmt.Sprintf(`vmagent_remotewrite_duration_seconds{url=%q}`, c.sanitizedURL))
|
||||
c.requestsOKCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.sanitizedURL))
|
||||
@@ -322,12 +326,42 @@ func (c *client) runWorker() {
|
||||
}
|
||||
|
||||
func (c *client) doRequest(url string, body []byte) (*http.Response, error) {
|
||||
req, err := c.newRequest(url, body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resp, err := c.hc.Do(req)
|
||||
if err == nil {
|
||||
return resp, nil
|
||||
}
|
||||
if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
return nil, err
|
||||
}
|
||||
// It is likely connection became stale or timed out during the first request.
|
||||
// Make another attempt in hope request will succeed.
|
||||
// If not, the error should be handled by the caller as usual.
|
||||
// This should help with https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4139
|
||||
req, err = c.newRequest(url, body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("second attempt: %w", err)
|
||||
}
|
||||
resp, err = c.hc.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("second attempt: %w", err)
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func (c *client) newRequest(url string, body []byte) (*http.Request, error) {
|
||||
reqBody := bytes.NewBuffer(body)
|
||||
req, err := http.NewRequest(http.MethodPost, url, reqBody)
|
||||
if err != nil {
|
||||
logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", url, err)
|
||||
}
|
||||
c.authCfg.SetHeaders(req, true)
|
||||
err = c.authCfg.SetHeaders(req, true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
h := req.Header
|
||||
h.Set("User-Agent", "vmagent")
|
||||
h.Set("Content-Type", "application/x-protobuf")
|
||||
@@ -341,11 +375,10 @@ func (c *client) doRequest(url string, body []byte) (*http.Response, error) {
|
||||
if c.awsCfg != nil {
|
||||
sigv4Hash := awsapi.HashHex(body)
|
||||
if err := c.awsCfg.SignRequest(req, sigv4Hash); err != nil {
|
||||
// there is no need in retry, request will be rejected by client.Do and retried by code below
|
||||
logger.Warnf("cannot sign remoteWrite request with AWS sigv4: %s", err)
|
||||
return nil, fmt.Errorf("cannot sign remoteWrite request with AWS sigv4: %w", err)
|
||||
}
|
||||
}
|
||||
return c.hc.Do(req)
|
||||
return req, nil
|
||||
}
|
||||
|
||||
// sendBlockHTTP sends the given block to c.remoteWriteURL.
|
||||
|
||||
@@ -87,8 +87,8 @@ func initLabelsGlobal() {
|
||||
}
|
||||
}
|
||||
|
||||
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label, pcs *promrelabel.ParsedConfigs) []prompbmarshal.TimeSeries {
|
||||
if len(extraLabels) == 0 && pcs.Len() == 0 && !*usePromCompatibleNaming {
|
||||
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, pcs *promrelabel.ParsedConfigs) []prompbmarshal.TimeSeries {
|
||||
if pcs.Len() == 0 && !*usePromCompatibleNaming {
|
||||
// Nothing to change.
|
||||
return tss
|
||||
}
|
||||
@@ -98,34 +98,15 @@ func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLab
|
||||
ts := &tss[i]
|
||||
labelsLen := len(labels)
|
||||
labels = append(labels, ts.Labels...)
|
||||
// extraLabels must be added before applying relabeling according to https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
|
||||
for j := range extraLabels {
|
||||
extraLabel := &extraLabels[j]
|
||||
tmp := promrelabel.GetLabelByName(labels[labelsLen:], extraLabel.Name)
|
||||
if tmp != nil {
|
||||
tmp.Value = extraLabel.Value
|
||||
} else {
|
||||
labels = append(labels, *extraLabel)
|
||||
}
|
||||
}
|
||||
if *usePromCompatibleNaming {
|
||||
// Replace unsupported Prometheus chars in label names and metric names with underscores.
|
||||
tmpLabels := labels[labelsLen:]
|
||||
for j := range tmpLabels {
|
||||
label := &tmpLabels[j]
|
||||
if label.Name == "__name__" {
|
||||
label.Value = promrelabel.SanitizeName(label.Value)
|
||||
} else {
|
||||
label.Name = promrelabel.SanitizeName(label.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
labels = pcs.Apply(labels, labelsLen)
|
||||
labels = promrelabel.FinalizeLabels(labels[:labelsLen], labels[labelsLen:])
|
||||
if len(labels) == labelsLen {
|
||||
// Drop the current time series, since relabeling removed all the labels.
|
||||
continue
|
||||
}
|
||||
if *usePromCompatibleNaming {
|
||||
fixPromCompatibleNaming(labels[labelsLen:])
|
||||
}
|
||||
tssDst = append(tssDst, prompbmarshal.TimeSeries{
|
||||
Labels: labels[labelsLen:],
|
||||
Samples: ts.Samples,
|
||||
@@ -135,6 +116,29 @@ func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLab
|
||||
return tssDst
|
||||
}
|
||||
|
||||
func (rctx *relabelCtx) appendExtraLabels(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label) {
|
||||
if len(extraLabels) == 0 {
|
||||
return
|
||||
}
|
||||
labels := rctx.labels[:0]
|
||||
for i := range tss {
|
||||
ts := &tss[i]
|
||||
labelsLen := len(labels)
|
||||
labels = append(labels, ts.Labels...)
|
||||
for j := range extraLabels {
|
||||
extraLabel := extraLabels[j]
|
||||
tmp := promrelabel.GetLabelByName(labels[labelsLen:], extraLabel.Name)
|
||||
if tmp != nil {
|
||||
tmp.Value = extraLabel.Value
|
||||
} else {
|
||||
labels = append(labels, extraLabel)
|
||||
}
|
||||
}
|
||||
ts.Labels = labels[labelsLen:]
|
||||
}
|
||||
rctx.labels = labels
|
||||
}
|
||||
|
||||
type relabelCtx struct {
|
||||
// pool for labels, which are used during the relabeling.
|
||||
labels []prompbmarshal.Label
|
||||
@@ -159,3 +163,15 @@ func putRelabelCtx(rctx *relabelCtx) {
|
||||
rctx.labels = rctx.labels[:0]
|
||||
relabelCtxPool.Put(rctx)
|
||||
}
|
||||
|
||||
func fixPromCompatibleNaming(labels []prompbmarshal.Label) {
|
||||
// Replace unsupported Prometheus chars in label names and metric names with underscores.
|
||||
for i := range labels {
|
||||
label := &labels[i]
|
||||
if label.Name == "__name__" {
|
||||
label.Value = promrelabel.SanitizeMetricName(label.Value)
|
||||
} else {
|
||||
label.Name = promrelabel.SanitizeLabelName(label.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,18 +10,16 @@ import (
|
||||
)
|
||||
|
||||
func TestApplyRelabeling(t *testing.T) {
|
||||
f := func(extraLabels []prompbmarshal.Label, pcs *promrelabel.ParsedConfigs, sTss, sExpTss string) {
|
||||
f := func(pcs *promrelabel.ParsedConfigs, sTss, sExpTss string) {
|
||||
rctx := &relabelCtx{}
|
||||
tss, expTss := parseSeries(sTss), parseSeries(sExpTss)
|
||||
gotTss := rctx.applyRelabeling(tss, extraLabels, pcs)
|
||||
gotTss := rctx.applyRelabeling(tss, pcs)
|
||||
if !reflect.DeepEqual(gotTss, expTss) {
|
||||
t.Fatalf("expected to have: \n%v;\ngot: \n%v", expTss, gotTss)
|
||||
}
|
||||
}
|
||||
|
||||
f(nil, nil, "up", "up")
|
||||
f([]prompbmarshal.Label{{Name: "foo", Value: "bar"}}, nil, "up", `up{foo="bar"}`)
|
||||
f([]prompbmarshal.Label{{Name: "foo", Value: "bar"}}, nil, `up{foo="baz"}`, `up{foo="bar"}`)
|
||||
f(nil, "up", "up")
|
||||
|
||||
pcs, err := promrelabel.ParseRelabelConfigsData([]byte(`
|
||||
- target_label: "foo"
|
||||
@@ -32,11 +30,33 @@ func TestApplyRelabeling(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
f(nil, pcs, `up{foo="baz", env="prod"}`, `up{foo="aaa"}`)
|
||||
f(pcs, `up{foo="baz", env="prod"}`, `up{foo="aaa"}`)
|
||||
|
||||
oldVal := *usePromCompatibleNaming
|
||||
*usePromCompatibleNaming = true
|
||||
f(nil, nil, `foo.bar`, `foo_bar`)
|
||||
f(nil, `foo.bar`, `foo_bar`)
|
||||
*usePromCompatibleNaming = oldVal
|
||||
}
|
||||
|
||||
func TestAppendExtraLabels(t *testing.T) {
|
||||
f := func(extraLabels []prompbmarshal.Label, sTss, sExpTss string) {
|
||||
t.Helper()
|
||||
rctx := &relabelCtx{}
|
||||
tss, expTss := parseSeries(sTss), parseSeries(sExpTss)
|
||||
rctx.appendExtraLabels(tss, extraLabels)
|
||||
if !reflect.DeepEqual(tss, expTss) {
|
||||
t.Fatalf("expected to have: \n%v;\ngot: \n%v", expTss, tss)
|
||||
}
|
||||
}
|
||||
|
||||
f(nil, "up", "up")
|
||||
f([]prompbmarshal.Label{{Name: "foo", Value: "bar"}}, "up", `up{foo="bar"}`)
|
||||
f([]prompbmarshal.Label{{Name: "foo", Value: "bar"}}, `up{foo="baz"}`, `up{foo="bar"}`)
|
||||
f([]prompbmarshal.Label{{Name: "baz", Value: "qux"}}, `up{foo="baz"}`, `up{foo="baz",baz="qux"}`)
|
||||
|
||||
oldVal := *usePromCompatibleNaming
|
||||
*usePromCompatibleNaming = true
|
||||
f([]prompbmarshal.Label{{Name: "foo.bar", Value: "baz"}}, "up", `up{foo.bar="baz"}`)
|
||||
*usePromCompatibleNaming = oldVal
|
||||
}
|
||||
|
||||
|
||||
@@ -10,8 +10,6 @@ import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/cespare/xxhash/v2"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
@@ -25,9 +23,11 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/streamaggr"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
"github.com/cespare/xxhash/v2"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -41,6 +41,9 @@ var (
|
||||
"Pass multiple -remoteWrite.multitenantURL flags in order to replicate data to multiple remote storage systems. See also -remoteWrite.url")
|
||||
shardByURL = flag.Bool("remoteWrite.shardByURL", false, "Whether to shard outgoing series across all the remote storage systems enumerated via -remoteWrite.url . "+
|
||||
"By default the data is replicated across all the -remoteWrite.url . See https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages")
|
||||
shardByURLLabels = flagutil.NewArrayString("remoteWrite.shardByURL.labels", "Optional list of labels, which must be used for sharding outgoing samples "+
|
||||
"among remote storage systems if -remoteWrite.shardByURL command-line flag is set. By default all the labels are used for sharding in order to gain "+
|
||||
"even distribution of series over the specified -remoteWrite.url systems")
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored. "+
|
||||
"See also -remoteWrite.maxDiskUsagePerURL")
|
||||
keepDanglingQueues = flag.Bool("remoteWrite.keepDanglingQueues", false, "Keep persistent queues contents at -remoteWrite.tmpDataPath in case there are no matching -remoteWrite.url. "+
|
||||
@@ -49,14 +52,15 @@ var (
|
||||
"isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs")
|
||||
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
|
||||
"It is hidden by default, since it can contain sensitive info such as auth key")
|
||||
maxPendingBytesPerURL = flagutil.NewArrayBytes("remoteWrite.maxDiskUsagePerURL", "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
|
||||
maxPendingBytesPerURL = flagutil.NewArrayBytes("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
|
||||
"for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. "+
|
||||
"Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. "+
|
||||
"Disk usage is unlimited if the value is set to 0")
|
||||
significantFigures = flagutil.NewArrayInt("remoteWrite.significantFigures", "The number of significant figures to leave in metric values before writing them "+
|
||||
significantFigures = flagutil.NewArrayInt("remoteWrite.significantFigures", 0, "The number of significant figures to leave in metric values before writing them "+
|
||||
"to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. "+
|
||||
"This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits")
|
||||
roundDigits = flagutil.NewArrayInt("remoteWrite.roundDigits", "Round metric values to this number of decimal digits after the point before writing them to remote storage. "+
|
||||
roundDigits = flagutil.NewArrayInt("remoteWrite.roundDigits", 100, "Round metric values to this number of decimal digits after the point before "+
|
||||
"writing them to remote storage. "+
|
||||
"Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. "+
|
||||
"By default, digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. "+
|
||||
"This option may be used for improving data compression for the stored metrics")
|
||||
@@ -78,7 +82,7 @@ var (
|
||||
streamAggrDropInput = flagutil.NewArrayBool("remoteWrite.streamAggr.dropInput", "Whether to drop all the input samples after the aggregation "+
|
||||
"with -remoteWrite.streamAggr.config. By default, only aggregates samples are dropped, while the remaining samples "+
|
||||
"are written to the corresponding -remoteWrite.url . See also -remoteWrite.streamAggr.keepInput and https://docs.victoriametrics.com/stream-aggregation.html")
|
||||
streamAggrDedupInterval = flagutil.NewArrayDuration("remoteWrite.streamAggr.dedupInterval", "Input samples are de-duplicated with this interval before being aggregated. "+
|
||||
streamAggrDedupInterval = flagutil.NewArrayDuration("remoteWrite.streamAggr.dedupInterval", 0, "Input samples are de-duplicated with this interval before being aggregated. "+
|
||||
"Only the last sample per each time series per each interval is aggregated if the interval is greater than zero")
|
||||
)
|
||||
|
||||
@@ -116,6 +120,8 @@ func InitSecretFlags() {
|
||||
}
|
||||
}
|
||||
|
||||
var shardByURLLabelsMap map[string]struct{}
|
||||
|
||||
// Init initializes remotewrite.
|
||||
//
|
||||
// It must be called after flag.Parse().
|
||||
@@ -152,6 +158,13 @@ func Init() {
|
||||
if *queues <= 0 {
|
||||
*queues = 1
|
||||
}
|
||||
if len(*shardByURLLabels) > 0 {
|
||||
m := make(map[string]struct{}, len(*shardByURLLabels))
|
||||
for _, label := range *shardByURLLabels {
|
||||
m[label] = struct{}{}
|
||||
}
|
||||
shardByURLLabelsMap = m
|
||||
}
|
||||
initLabelsGlobal()
|
||||
|
||||
// Register SIGHUP handler for config reload before loadRelabelConfigs.
|
||||
@@ -258,7 +271,7 @@ func newRemoteWriteCtxs(at *auth.Token, urls []string) []*remoteWriteCtx {
|
||||
if *showRemoteWriteURL {
|
||||
sanitizedURL = fmt.Sprintf("%d:%s", i+1, remoteWriteURL)
|
||||
}
|
||||
rwctxs[i] = newRemoteWriteCtx(i, at, remoteWriteURL, maxInmemoryBlocks, sanitizedURL)
|
||||
rwctxs[i] = newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, sanitizedURL)
|
||||
}
|
||||
|
||||
if !*keepDanglingQueues {
|
||||
@@ -355,7 +368,7 @@ func Push(at *auth.Token, wr *prompbmarshal.WriteRequest) {
|
||||
var rctx *relabelCtx
|
||||
rcs := allRelabelConfigs.Load()
|
||||
pcsGlobal := rcs.global
|
||||
if pcsGlobal.Len() > 0 || len(labelsGlobal) > 0 {
|
||||
if pcsGlobal.Len() > 0 {
|
||||
rctx = getRelabelCtx()
|
||||
}
|
||||
tss := wr.Timeseries
|
||||
@@ -386,7 +399,7 @@ func Push(at *auth.Token, wr *prompbmarshal.WriteRequest) {
|
||||
}
|
||||
if rctx != nil {
|
||||
rowsCountBeforeRelabel := getRowsCount(tssBlock)
|
||||
tssBlock = rctx.applyRelabeling(tssBlock, labelsGlobal, pcsGlobal)
|
||||
tssBlock = rctx.applyRelabeling(tssBlock, pcsGlobal)
|
||||
rowsCountAfterRelabel := getRowsCount(tssBlock)
|
||||
rowsDroppedByGlobalRelabel.Add(rowsCountBeforeRelabel - rowsCountAfterRelabel)
|
||||
}
|
||||
@@ -418,11 +431,23 @@ func pushBlockToRemoteStorages(rwctxs []*remoteWriteCtx, tssBlock []prompbmarsha
|
||||
if *shardByURL {
|
||||
// Shard the data among rwctxs
|
||||
tssByURL := make([][]prompbmarshal.TimeSeries, len(rwctxs))
|
||||
tmpLabels := promutils.GetLabels()
|
||||
for _, ts := range tssBlock {
|
||||
h := getLabelsHash(ts.Labels)
|
||||
hashLabels := ts.Labels
|
||||
if len(shardByURLLabelsMap) > 0 {
|
||||
hashLabels = tmpLabels.Labels[:0]
|
||||
for _, label := range ts.Labels {
|
||||
if _, ok := shardByURLLabelsMap[label.Name]; ok {
|
||||
hashLabels = append(hashLabels, label)
|
||||
}
|
||||
}
|
||||
}
|
||||
h := getLabelsHash(hashLabels)
|
||||
idx := h % uint64(len(tssByURL))
|
||||
tssByURL[idx] = append(tssByURL[idx], ts)
|
||||
}
|
||||
promutils.PutLabels(tmpLabels)
|
||||
|
||||
// Push sharded data to remote storages in parallel in order to reduce
|
||||
// the time needed for sending the data to multiple remote storage systems.
|
||||
var wg sync.WaitGroup
|
||||
@@ -559,14 +584,14 @@ type remoteWriteCtx struct {
|
||||
rowsDroppedByRelabel *metrics.Counter
|
||||
}
|
||||
|
||||
func newRemoteWriteCtx(argIdx int, at *auth.Token, remoteWriteURL *url.URL, maxInmemoryBlocks int, sanitizedURL string) *remoteWriteCtx {
|
||||
func newRemoteWriteCtx(argIdx int, remoteWriteURL *url.URL, maxInmemoryBlocks int, sanitizedURL string) *remoteWriteCtx {
|
||||
// strip query params, otherwise changing params resets pq
|
||||
pqURL := *remoteWriteURL
|
||||
pqURL.RawQuery = ""
|
||||
pqURL.Fragment = ""
|
||||
h := xxhash.Sum64([]byte(pqURL.String()))
|
||||
queuePath := filepath.Join(*tmpDataPath, persistentQueueDirname, fmt.Sprintf("%d_%016X", argIdx+1, h))
|
||||
maxPendingBytes := maxPendingBytesPerURL.GetOptionalArgOrDefault(argIdx, 0)
|
||||
maxPendingBytes := maxPendingBytesPerURL.GetOptionalArg(argIdx)
|
||||
if maxPendingBytes != 0 && maxPendingBytes < persistentqueue.DefaultChunkFileSize {
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4195
|
||||
logger.Warnf("rounding the -remoteWrite.maxDiskUsagePerURL=%d to the minimum supported value: %d", maxPendingBytes, persistentqueue.DefaultChunkFileSize)
|
||||
@@ -590,8 +615,8 @@ func newRemoteWriteCtx(argIdx int, at *auth.Token, remoteWriteURL *url.URL, maxI
|
||||
c.init(argIdx, *queues, sanitizedURL)
|
||||
|
||||
// Initialize pss
|
||||
sf := significantFigures.GetOptionalArgOrDefault(argIdx, 0)
|
||||
rd := roundDigits.GetOptionalArgOrDefault(argIdx, 100)
|
||||
sf := significantFigures.GetOptionalArg(argIdx)
|
||||
rd := roundDigits.GetOptionalArg(argIdx)
|
||||
pssLen := *queues
|
||||
if n := cgroup.AvailableCPUs(); pssLen > n {
|
||||
// There is no sense in running more than availableCPUs concurrent pendingSeries,
|
||||
@@ -616,7 +641,7 @@ func newRemoteWriteCtx(argIdx int, at *auth.Token, remoteWriteURL *url.URL, maxI
|
||||
// Initialize sas
|
||||
sasFile := streamAggrConfig.GetOptionalArg(argIdx)
|
||||
if sasFile != "" {
|
||||
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(argIdx, 0)
|
||||
dedupInterval := streamAggrDedupInterval.GetOptionalArg(argIdx)
|
||||
sas, err := streamaggr.LoadFromFile(sasFile, rwctx.pushInternal, dedupInterval)
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot initialize stream aggregators from -remoteWrite.streamAggr.config=%q: %s", sasFile, err)
|
||||
@@ -668,7 +693,7 @@ func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
|
||||
v = tssPool.Get().(*[]prompbmarshal.TimeSeries)
|
||||
tss = append(*v, tss...)
|
||||
rowsCountBeforeRelabel := getRowsCount(tss)
|
||||
tss = rctx.applyRelabeling(tss, nil, pcs)
|
||||
tss = rctx.applyRelabeling(tss, pcs)
|
||||
rowsCountAfterRelabel := getRowsCount(tss)
|
||||
rwctx.rowsDroppedByRelabel.Add(rowsCountBeforeRelabel - rowsCountAfterRelabel)
|
||||
}
|
||||
@@ -705,11 +730,13 @@ var matchIdxsPool bytesutil.ByteBufferPool
|
||||
|
||||
func dropAggregatedSeries(src []prompbmarshal.TimeSeries, matchIdxs []byte, dropInput bool) []prompbmarshal.TimeSeries {
|
||||
dst := src[:0]
|
||||
for i, match := range matchIdxs {
|
||||
if match == 0 {
|
||||
continue
|
||||
if !dropInput {
|
||||
for i, match := range matchIdxs {
|
||||
if match == 1 {
|
||||
continue
|
||||
}
|
||||
dst = append(dst, src[i])
|
||||
}
|
||||
dst = append(dst, src[i])
|
||||
}
|
||||
tail := src[len(dst):]
|
||||
_ = prompbmarshal.ResetTimeSeries(tail)
|
||||
@@ -717,22 +744,38 @@ func dropAggregatedSeries(src []prompbmarshal.TimeSeries, matchIdxs []byte, drop
|
||||
}
|
||||
|
||||
func (rwctx *remoteWriteCtx) pushInternal(tss []prompbmarshal.TimeSeries) {
|
||||
var rctx *relabelCtx
|
||||
var v *[]prompbmarshal.TimeSeries
|
||||
if len(labelsGlobal) > 0 {
|
||||
// Make a copy of tss before adding extra labels in order to prevent
|
||||
// from affecting time series for other remoteWrite.url configs.
|
||||
rctx = getRelabelCtx()
|
||||
v = tssPool.Get().(*[]prompbmarshal.TimeSeries)
|
||||
tss = append(*v, tss...)
|
||||
rctx.appendExtraLabels(tss, labelsGlobal)
|
||||
}
|
||||
|
||||
pss := rwctx.pss
|
||||
idx := atomic.AddUint64(&rwctx.pssNextIdx, 1) % uint64(len(pss))
|
||||
pss[idx].Push(tss)
|
||||
|
||||
if rctx != nil {
|
||||
*v = prompbmarshal.ResetTimeSeries(tss)
|
||||
tssPool.Put(v)
|
||||
putRelabelCtx(rctx)
|
||||
}
|
||||
}
|
||||
|
||||
func (rwctx *remoteWriteCtx) reinitStreamAggr() {
|
||||
sas := rwctx.sas.Load()
|
||||
if sas == nil {
|
||||
sasFile := streamAggrConfig.GetOptionalArg(rwctx.idx)
|
||||
if sasFile == "" {
|
||||
// There is no stream aggregation for rwctx
|
||||
return
|
||||
}
|
||||
|
||||
sasFile := streamAggrConfig.GetOptionalArg(rwctx.idx)
|
||||
logger.Infof("reloading stream aggregation configs pointed by -remoteWrite.streamAggr.config=%q", sasFile)
|
||||
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_total{path=%q}`, sasFile)).Inc()
|
||||
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(rwctx.idx, 0)
|
||||
dedupInterval := streamAggrDedupInterval.GetOptionalArg(rwctx.idx)
|
||||
sasNew, err := streamaggr.LoadFromFile(sasFile, rwctx.pushInternal, dedupInterval)
|
||||
if err != nil {
|
||||
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_errors_total{path=%q}`, sasFile)).Inc()
|
||||
@@ -740,6 +783,7 @@ func (rwctx *remoteWriteCtx) reinitStreamAggr() {
|
||||
logger.Errorf("cannot reload stream aggregation config from -remoteWrite.streamAggr.config=%q; continue using the previously loaded config; error: %s", sasFile, err)
|
||||
return
|
||||
}
|
||||
sas := rwctx.sas.Load()
|
||||
if !sasNew.Equal(sas) {
|
||||
sasOld := rwctx.sas.Swap(sasNew)
|
||||
sasOld.MustStop()
|
||||
@@ -774,7 +818,7 @@ func CheckStreamAggrConfigs() error {
|
||||
if sasFile == "" {
|
||||
continue
|
||||
}
|
||||
dedupInterval := streamAggrDedupInterval.GetOptionalArgOrDefault(idx, 0)
|
||||
dedupInterval := streamAggrDedupInterval.GetOptionalArg(idx)
|
||||
sas, err := streamaggr.LoadFromFile(sasFile, pushNoop, dedupInterval)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot load -remoteWrite.streamAggr.config=%q: %w", sasFile, err)
|
||||
|
||||
@@ -27,7 +27,7 @@ var (
|
||||
stdDialerOnce sync.Once
|
||||
)
|
||||
|
||||
func statDial(ctx context.Context, networkUnused, addr string) (conn net.Conn, err error) {
|
||||
func statDial(ctx context.Context, _, addr string) (conn net.Conn, err error) {
|
||||
network := netutil.GetTCPNetwork()
|
||||
d := getStdDialer()
|
||||
conn, err = d.DialContext(ctx, network, addr)
|
||||
|
||||
103
app/vmalert-tool/Makefile
Normal file
103
app/vmalert-tool/Makefile
Normal file
@@ -0,0 +1,103 @@
|
||||
# All these commands must run from repository root.
|
||||
|
||||
vmalert-tool:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-local
|
||||
|
||||
vmalert-tool-race:
|
||||
APP_NAME=vmalert-tool RACE=-race $(MAKE) app-local
|
||||
|
||||
vmalert-tool-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker
|
||||
|
||||
vmalert-tool-pure-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-pure
|
||||
|
||||
vmalert-tool-linux-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-amd64
|
||||
|
||||
vmalert-tool-linux-arm-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm
|
||||
|
||||
vmalert-tool-linux-arm64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm64
|
||||
|
||||
vmalert-tool-linux-ppc64le-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-ppc64le
|
||||
|
||||
vmalert-tool-linux-386-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmalert-tool-darwin-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
vmalert-tool-darwin-arm64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
vmalert-tool-freebsd-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-freebsd-amd64
|
||||
|
||||
vmalert-tool-openbsd-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-openbsd-amd64
|
||||
|
||||
vmalert-tool-windows-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
package-vmalert-tool:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker
|
||||
|
||||
package-vmalert-tool-pure:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-pure
|
||||
|
||||
package-vmalert-tool-amd64:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-amd64
|
||||
|
||||
package-vmalert-tool-arm:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm
|
||||
|
||||
package-vmalert-tool-arm64:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm64
|
||||
|
||||
package-vmalert-tool-ppc64le:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-ppc64le
|
||||
|
||||
package-vmalert-tool-386:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-386
|
||||
|
||||
publish-vmalert-tool:
|
||||
APP_NAME=vmalert-tool $(MAKE) publish-via-docker
|
||||
|
||||
vmalert-tool-linux-amd64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-arm:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-arm64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-ppc64le:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-s390x:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-386:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-darwin-amd64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-darwin-arm64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-freebsd-amd64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-openbsd-amd64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmalert-tool $(MAKE) app-local-windows-goarch
|
||||
|
||||
vmalert-tool-pure:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-local-pure
|
||||
246
app/vmalert-tool/README.md
Normal file
246
app/vmalert-tool/README.md
Normal file
@@ -0,0 +1,246 @@
|
||||
|
||||
# vmalert-tool
|
||||
|
||||
VMAlert command-line tool
|
||||
|
||||
## Unit testing for rules
|
||||
|
||||
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
|
||||
It will perform the following actions:
|
||||
* sets up an isolated VictoriaMetrics instance;
|
||||
* simulates the periodic ingestion of time series;
|
||||
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
|
||||
* checks whether the firing alerts or resulting recording rules match the expected results.
|
||||
|
||||
See how to run vmalert-tool for unit test below:
|
||||
|
||||
```
|
||||
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
|
||||
./vmalert-tool unittest --files test1.yaml --files test2.yaml
|
||||
```
|
||||
|
||||
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
|
||||
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
|
||||
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
|
||||
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
|
||||
|
||||
### Test file format
|
||||
|
||||
The configuration format for files specified in `--files` cmd-line flag is the following:
|
||||
|
||||
```yaml
|
||||
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
|
||||
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
|
||||
rule_files:
|
||||
[ - <string> ]
|
||||
|
||||
# The evaluation interval for rules specified in `rule_files`
|
||||
[ evaluation_interval: <duration> | default = 1m ]
|
||||
|
||||
# Groups listed below will be evaluated by order.
|
||||
# Not All the groups need not be mentioned, if not, they will be evaluated by define order in rule_files.
|
||||
group_eval_order:
|
||||
[ - <string> ]
|
||||
|
||||
# The list of unit test files to be checked during evaluation.
|
||||
tests:
|
||||
[ - <test_group> ]
|
||||
```
|
||||
|
||||
#### `<test_group>`
|
||||
|
||||
```yaml
|
||||
# Interval between samples for input series
|
||||
interval: <duration>
|
||||
# Time series to persist into the database according to configured <interval> before running tests.
|
||||
input_series:
|
||||
[ - <series> ]
|
||||
|
||||
# Name of the test group, optional
|
||||
[ name: <string> ]
|
||||
|
||||
# Unit tests for alerting rules
|
||||
alert_rule_test:
|
||||
[ - <alert_test_case> ]
|
||||
|
||||
# Unit tests for Metricsql expressions.
|
||||
metricsql_expr_test:
|
||||
[ - <metricsql_expr_test> ]
|
||||
|
||||
# External labels accessible for templating.
|
||||
external_labels:
|
||||
[ <labelname>: <string> ... ]
|
||||
|
||||
```
|
||||
|
||||
#### `<series>`
|
||||
|
||||
```yaml
|
||||
# series in the following format '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
series: <string>
|
||||
|
||||
# values support several special equations:
|
||||
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
|
||||
# Read this as series starts at a, then c further samples incrementing by b.
|
||||
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
|
||||
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
|
||||
# '_' represents a missing sample from scrape
|
||||
# 'stale' indicates a stale sample
|
||||
# Examples:
|
||||
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
|
||||
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
|
||||
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
|
||||
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
|
||||
values: <string>
|
||||
```
|
||||
|
||||
#### `<alert_test_case>`
|
||||
|
||||
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
|
||||
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
|
||||
but no need to add them under `exp_alerts`.
|
||||
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
|
||||
|
||||
```yaml
|
||||
# The time elapsed from time=0s when this alerting rule should be checked.
|
||||
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
|
||||
eval_time: <duration>
|
||||
|
||||
# Name of the group name to be tested.
|
||||
groupname: <string>
|
||||
|
||||
# Name of the alert to be tested.
|
||||
alertname: <string>
|
||||
|
||||
# List of the expected alerts that are firing under the given alertname at
|
||||
# the given evaluation time. If you want to test if an alerting rule should
|
||||
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
|
||||
exp_alerts:
|
||||
[ - <alert> ]
|
||||
```
|
||||
|
||||
#### `<alert>`
|
||||
|
||||
```yaml
|
||||
# These are the expanded labels and annotations of the expected alert.
|
||||
# Note: labels also include the labels of the sample associated with the alert
|
||||
exp_labels:
|
||||
[ <labelname>: <string> ]
|
||||
exp_annotations:
|
||||
[ <labelname>: <string> ]
|
||||
```
|
||||
|
||||
#### `<metricsql_expr_test>`
|
||||
|
||||
```yaml
|
||||
# Expression to evaluate
|
||||
expr: <string>
|
||||
|
||||
# The time elapsed from time=0s when this expression be evaluated.
|
||||
eval_time: <duration>
|
||||
|
||||
# Expected samples at the given evaluation time.
|
||||
exp_samples:
|
||||
[ - <sample> ]
|
||||
```
|
||||
|
||||
#### `<sample>`
|
||||
|
||||
```yaml
|
||||
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
labels: <string>
|
||||
|
||||
# The expected value of the Metricsql expression.
|
||||
value: <number>
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
This is an example input file for unit testing which will pass.
|
||||
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
|
||||
|
||||
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
|
||||
|
||||
#### `test.yaml`
|
||||
|
||||
```yaml
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="prometheus", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: prometheus
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
```
|
||||
|
||||
#### `alerts.yaml`
|
||||
|
||||
```yaml
|
||||
# This is the rules file.
|
||||
|
||||
groups:
|
||||
- name: group1
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- alert: AlwaysFiring
|
||||
expr: 1
|
||||
|
||||
- name: group2
|
||||
rules:
|
||||
- record: job:test:count_over_time1m
|
||||
expr: sum without(instance) (count_over_time(test[1m]))
|
||||
- record: suquery_interval_test
|
||||
expr: count_over_time(up[5m:])
|
||||
```
|
||||
54
app/vmalert-tool/main.go
Normal file
54
app/vmalert-tool/main.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert-tool/unittest"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
)
|
||||
|
||||
func main() {
|
||||
start := time.Now()
|
||||
app := &cli.App{
|
||||
Name: "vmalert-tool",
|
||||
Usage: "VMAlert command-line tool",
|
||||
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html",
|
||||
Version: buildinfo.Version,
|
||||
Commands: []*cli.Command{
|
||||
{
|
||||
Name: "unittest",
|
||||
Usage: "Run unittest for alerting and recording rules.",
|
||||
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules",
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringSliceFlag{
|
||||
Name: "files",
|
||||
Usage: "files to run unittest with. Supports an array of values separated by comma or specified via multiple flags.",
|
||||
Required: true,
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "disableAlertgroupLabel",
|
||||
Usage: "disable adding group's Name as label to generated alerts and time series.",
|
||||
Required: false,
|
||||
},
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
if failed := unittest.UnitTest(c.StringSlice("files"), c.Bool("disableAlertgroupLabel")); failed {
|
||||
return fmt.Errorf("unittest failed")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
err := app.Run(os.Args)
|
||||
if err != nil {
|
||||
log.Fatalln(err)
|
||||
}
|
||||
log.Printf("Total time: %v", time.Since(start))
|
||||
}
|
||||
12
app/vmalert-tool/multiarch/Dockerfile
Normal file
12
app/vmalert-tool/multiarch/Dockerfile
Normal file
@@ -0,0 +1,12 @@
|
||||
# See https://medium.com/on-docker/use-multi-stage-builds-to-inject-ca-certs-ad1e8f01de1b
|
||||
ARG certs_image
|
||||
ARG root_image
|
||||
FROM $certs_image as certs
|
||||
RUN apk update && apk upgrade && apk --update --no-cache add ca-certificates
|
||||
|
||||
FROM $root_image
|
||||
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||
EXPOSE 8429
|
||||
ENTRYPOINT ["/vmalert-tool-prod"]
|
||||
ARG TARGETARCH
|
||||
COPY vmalert-tool-linux-${TARGETARCH}-prod ./vmalert-tool-prod
|
||||
19
app/vmalert-tool/unittest/alerting.go
Normal file
19
app/vmalert-tool/unittest/alerting.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
// alertTestCase holds alert_rule_test cases defined in test file
|
||||
type alertTestCase struct {
|
||||
EvalTime *promutils.Duration `yaml:"eval_time"`
|
||||
GroupName string `yaml:"groupname"`
|
||||
Alertname string `yaml:"alertname"`
|
||||
ExpAlerts []expAlert `yaml:"exp_alerts"`
|
||||
}
|
||||
|
||||
// expAlert holds exp_alerts defined in test file
|
||||
type expAlert struct {
|
||||
ExpLabels map[string]string `yaml:"exp_labels"`
|
||||
ExpAnnotations map[string]string `yaml:"exp_annotations"`
|
||||
}
|
||||
182
app/vmalert-tool/unittest/input.go
Normal file
182
app/vmalert-tool/unittest/input.go
Normal file
@@ -0,0 +1,182 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
testutil "github.com/VictoriaMetrics/VictoriaMetrics/app/victoria-metrics/test"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// series holds input_series defined in the test file
|
||||
type series struct {
|
||||
Series string `yaml:"series"`
|
||||
Values string `yaml:"values"`
|
||||
}
|
||||
|
||||
// sequenceValue is an omittable value in a sequence of time series values.
|
||||
type sequenceValue struct {
|
||||
Value float64
|
||||
Omitted bool
|
||||
}
|
||||
|
||||
func httpWrite(address string, r io.Reader) {
|
||||
resp, err := http.Post(address, "", r)
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to send to storage: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
// writeInputSeries send input series to vmstorage and flush them
|
||||
func writeInputSeries(input []series, interval *promutils.Duration, startStamp time.Time, dst string) error {
|
||||
r := testutil.WriteRequest{}
|
||||
for _, data := range input {
|
||||
expr, err := metricsql.Parse(data.Series)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse series %s: %v", data.Series, err)
|
||||
}
|
||||
promvals, err := parseInputValue(data.Values, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse input series value %s: %v", data.Values, err)
|
||||
}
|
||||
metricExpr, ok := expr.(*metricsql.MetricExpr)
|
||||
if !ok {
|
||||
return fmt.Errorf("failed to parse series %s to metric expr: %v", data.Series, err)
|
||||
}
|
||||
samples := make([]testutil.Sample, 0, len(promvals))
|
||||
ts := startStamp
|
||||
for _, v := range promvals {
|
||||
if !v.Omitted {
|
||||
samples = append(samples, testutil.Sample{
|
||||
Timestamp: ts.UnixMilli(),
|
||||
Value: v.Value,
|
||||
})
|
||||
}
|
||||
ts = ts.Add(interval.Duration())
|
||||
}
|
||||
var ls []testutil.Label
|
||||
for _, filter := range metricExpr.LabelFilterss[0] {
|
||||
ls = append(ls, testutil.Label{Name: filter.Label, Value: filter.Value})
|
||||
}
|
||||
r.Timeseries = append(r.Timeseries, testutil.TimeSeries{Labels: ls, Samples: samples})
|
||||
}
|
||||
|
||||
data, err := testutil.Compress(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compress data: %v", err)
|
||||
}
|
||||
// write input series to vm
|
||||
httpWrite(dst, bytes.NewBuffer(data))
|
||||
vmstorage.Storage.DebugFlush()
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseInputValue support input like "1", "1+1x1 _ -4 3+20x1", see more examples in test.
|
||||
func parseInputValue(input string, origin bool) ([]sequenceValue, error) {
|
||||
var res []sequenceValue
|
||||
items := strings.Split(input, " ")
|
||||
reg := regexp.MustCompile(`\D?\d*\D?`)
|
||||
for _, item := range items {
|
||||
if item == "stale" {
|
||||
res = append(res, sequenceValue{Value: decimal.StaleNaN})
|
||||
continue
|
||||
}
|
||||
vals := reg.FindAllString(item, -1)
|
||||
switch len(vals) {
|
||||
case 1:
|
||||
if vals[0] == "_" {
|
||||
res = append(res, sequenceValue{Omitted: true})
|
||||
continue
|
||||
}
|
||||
v, err := strconv.ParseFloat(vals[0], 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, sequenceValue{Value: v})
|
||||
continue
|
||||
case 2:
|
||||
p1 := vals[0][:len(vals[0])-1]
|
||||
v2, err := strconv.ParseInt(vals[1], 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
option := vals[0][len(vals[0])-1]
|
||||
switch option {
|
||||
case '+':
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, sequenceValue{Value: v1 + float64(v2)})
|
||||
case 'x':
|
||||
for i := int64(0); i <= v2; i++ {
|
||||
if p1 == "_" {
|
||||
if i == 0 {
|
||||
i = 1
|
||||
}
|
||||
res = append(res, sequenceValue{Omitted: true})
|
||||
continue
|
||||
}
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !origin || v1 == 0 {
|
||||
res = append(res, sequenceValue{Value: v1 * float64(i)})
|
||||
continue
|
||||
}
|
||||
newVal := fmt.Sprintf("%s+0x%s", p1, vals[1])
|
||||
newRes, err := parseInputValue(newVal, false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, newRes...)
|
||||
break
|
||||
}
|
||||
|
||||
default:
|
||||
return nil, fmt.Errorf("got invalid operation %b", option)
|
||||
}
|
||||
case 3:
|
||||
r1, err := parseInputValue(fmt.Sprintf("%s%s", vals[1], vals[2]), false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
p1 := vals[0][:len(vals[0])-1]
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
option := vals[0][len(vals[0])-1]
|
||||
var isAdd bool
|
||||
if option == '+' {
|
||||
isAdd = true
|
||||
}
|
||||
for _, r := range r1 {
|
||||
if isAdd {
|
||||
res = append(res, sequenceValue{
|
||||
Value: r.Value + v1,
|
||||
})
|
||||
} else {
|
||||
res = append(res, sequenceValue{
|
||||
Value: v1 - r.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported input %s", input)
|
||||
}
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
93
app/vmalert-tool/unittest/input_test.go
Normal file
93
app/vmalert-tool/unittest/input_test.go
Normal file
@@ -0,0 +1,93 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
)
|
||||
|
||||
func TestParseInputValue(t *testing.T) {
|
||||
testCases := []struct {
|
||||
input string
|
||||
exp []sequenceValue
|
||||
failed bool
|
||||
}{
|
||||
{
|
||||
"",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
{
|
||||
"testfailed",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
// stale doesn't support operations
|
||||
{
|
||||
"stalex3",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
{
|
||||
"-4",
|
||||
[]sequenceValue{{Value: -4}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"_",
|
||||
[]sequenceValue{{Omitted: true}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"stale",
|
||||
[]sequenceValue{{Value: decimal.StaleNaN}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"-4x1",
|
||||
[]sequenceValue{{Value: -4}, {Value: -4}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"_x1",
|
||||
[]sequenceValue{{Omitted: true}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"1+1x4",
|
||||
[]sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 4}, {Value: 5}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"2-1x4",
|
||||
[]sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"1+1x1 _ -4 stale 3+20x1",
|
||||
[]sequenceValue{{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4}, {Value: decimal.StaleNaN}, {Value: 3}, {Value: 23}},
|
||||
false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
output, err := parseInputValue(tc.input, true)
|
||||
if err != nil != tc.failed {
|
||||
t.Fatalf("failed to parse %s, expect %t, got %t", tc.input, tc.failed, err != nil)
|
||||
}
|
||||
if len(tc.exp) != len(output) {
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
for i := 0; i < len(tc.exp); i++ {
|
||||
if tc.exp[i].Omitted != output[i].Omitted {
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
if tc.exp[i].Value != output[i].Value {
|
||||
if decimal.IsStaleNaN(tc.exp[i].Value) && decimal.IsStaleNaN(output[i].Value) {
|
||||
continue
|
||||
}
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
100
app/vmalert-tool/unittest/recording.go
Normal file
100
app/vmalert-tool/unittest/recording.go
Normal file
@@ -0,0 +1,100 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"reflect"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// metricsqlTestCase holds metricsql_expr_test cases defined in test file
|
||||
type metricsqlTestCase struct {
|
||||
Expr string `yaml:"expr"`
|
||||
EvalTime *promutils.Duration `yaml:"eval_time"`
|
||||
ExpSamples []expSample `yaml:"exp_samples"`
|
||||
}
|
||||
|
||||
type expSample struct {
|
||||
Labels string `yaml:"labels"`
|
||||
Value float64 `yaml:"value"`
|
||||
}
|
||||
|
||||
// checkMetricsqlCase will check metricsql_expr_test cases
|
||||
func checkMetricsqlCase(cases []metricsqlTestCase, q datasource.QuerierBuilder) (checkErrs []error) {
|
||||
queries := q.BuildWithParams(datasource.QuerierParams{QueryParams: url.Values{"nocache": {"1"}, "latency_offset": {"1ms"}}, DataSourceType: "prometheus"})
|
||||
Outer:
|
||||
for _, mt := range cases {
|
||||
result, _, err := queries.Query(context.Background(), mt.Expr, durationToTime(mt.EvalTime))
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf(" expr: %q, time: %s, err: %w", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), err))
|
||||
continue
|
||||
}
|
||||
var gotSamples []parsedSample
|
||||
for _, s := range result.Data {
|
||||
sort.Slice(s.Labels, func(i, j int) bool {
|
||||
return s.Labels[i].Name < s.Labels[j].Name
|
||||
})
|
||||
gotSamples = append(gotSamples, parsedSample{
|
||||
Labels: s.Labels,
|
||||
Value: s.Values[0],
|
||||
})
|
||||
}
|
||||
var expSamples []parsedSample
|
||||
for _, s := range mt.ExpSamples {
|
||||
expLb := datasource.Labels{}
|
||||
if s.Labels != "" {
|
||||
metricsqlExpr, err := metricsql.Parse(s.Labels)
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), fmt.Errorf("failed to parse labels %q: %w", s.Labels, err)))
|
||||
continue Outer
|
||||
}
|
||||
metricsqlMetricExpr, ok := metricsqlExpr.(*metricsql.MetricExpr)
|
||||
if !ok {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), fmt.Errorf("got unsupported metricsql type")))
|
||||
continue Outer
|
||||
}
|
||||
for _, l := range metricsqlMetricExpr.LabelFilterss[0] {
|
||||
expLb = append(expLb, datasource.Label{
|
||||
Name: l.Label,
|
||||
Value: l.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
sort.Slice(expLb, func(i, j int) bool {
|
||||
return expLb[i].Name < expLb[j].Name
|
||||
})
|
||||
expSamples = append(expSamples, parsedSample{
|
||||
Labels: expLb,
|
||||
Value: s.Value,
|
||||
})
|
||||
}
|
||||
sort.Slice(expSamples, func(i, j int) bool {
|
||||
return datasource.LabelCompare(expSamples[i].Labels, expSamples[j].Labels) <= 0
|
||||
})
|
||||
sort.Slice(gotSamples, func(i, j int) bool {
|
||||
return datasource.LabelCompare(gotSamples[i].Labels, gotSamples[j].Labels) <= 0
|
||||
})
|
||||
if !reflect.DeepEqual(expSamples, gotSamples) {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s,\n exp: %v\n got: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), parsedSamplesString(expSamples), parsedSamplesString(gotSamples)))
|
||||
}
|
||||
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func durationToTime(pd *promutils.Duration) time.Time {
|
||||
if pd == nil {
|
||||
return time.Time{}
|
||||
}
|
||||
return time.UnixMilli(pd.Duration().Milliseconds())
|
||||
}
|
||||
43
app/vmalert-tool/unittest/testdata/disable-group-label.yaml
vendored
Normal file
43
app/vmalert-tool/unittest/testdata/disable-group-label.yaml
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="vmagent2", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent2
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
49
app/vmalert-tool/unittest/testdata/failed-test.yaml
vendored
Normal file
49
app/vmalert-tool/unittest/testdata/failed-test.yaml
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
name: "Failing test"
|
||||
input_series:
|
||||
- series: test
|
||||
values: "0"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: test
|
||||
eval_time: 0m
|
||||
exp_samples:
|
||||
- value: 0
|
||||
labels: test
|
||||
|
||||
# will failed cause there is no "Test" group and rule defined
|
||||
alert_rule_test:
|
||||
- eval_time: 0m
|
||||
groupname: Test
|
||||
alertname: Test
|
||||
exp_alerts:
|
||||
- exp_labels: {}
|
||||
|
||||
- interval: 1m
|
||||
name: Failing alert test
|
||||
input_series:
|
||||
- series: 'up{job="test"}'
|
||||
values: 0x10
|
||||
|
||||
alert_rule_test:
|
||||
# will failed cause rule is firing
|
||||
- eval_time: 5m
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
- interval: 1m
|
||||
name: Failing alert test with missing groupname
|
||||
input_series:
|
||||
- series: 'up{job="test"}'
|
||||
values: 0x10
|
||||
|
||||
alert_rule_test:
|
||||
# will failed cause missing groupname
|
||||
- eval_time: 5m
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts: []
|
||||
30
app/vmalert-tool/unittest/testdata/long-period.yaml
vendored
Normal file
30
app/vmalert-tool/unittest/testdata/long-period.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
# can be executed successfully but will take more than 1 minute
|
||||
# not included in unit test now
|
||||
evaluation_interval: 100d
|
||||
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
tests:
|
||||
- interval: 1d
|
||||
input_series:
|
||||
- series: test
|
||||
# Max time in time.Duration is 106751d from 1970 (2^63/10^9), i.e. 2262.
|
||||
# But VictoriaMetrics supports maxTimestamp value +2 days from now. see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/827.
|
||||
# We input series to 2024-01-01T00:00:00 here.
|
||||
values: "0+1x19723"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: timestamp(test)
|
||||
eval_time: 0m
|
||||
exp_samples:
|
||||
- value: 0
|
||||
- expr: test
|
||||
eval_time: 100d
|
||||
exp_samples:
|
||||
- labels: test
|
||||
value: 100
|
||||
- expr: timestamp(test)
|
||||
eval_time: 19000d
|
||||
exp_samples:
|
||||
- value: 1641600000 # 19000d -> seconds.
|
||||
39
app/vmalert-tool/unittest/testdata/rules.yaml
vendored
Normal file
39
app/vmalert-tool/unittest/testdata/rules.yaml
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
groups:
|
||||
- name: group1
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- alert: AlwaysFiring
|
||||
expr: 1
|
||||
- alert: SameAlertNameWithDifferentGroup
|
||||
expr: absent(test)
|
||||
for: 1m
|
||||
|
||||
- name: group2
|
||||
rules:
|
||||
- record: t1
|
||||
expr: test
|
||||
- record: job:test:count_over_time1m
|
||||
expr: sum without(instance) (count_over_time(test[1m]))
|
||||
- record: suquery_interval_test
|
||||
expr: count_over_time(up[5m:])
|
||||
|
||||
- alert: SameAlertNameWithDifferentGroup
|
||||
expr: absent(test)
|
||||
for: 5m
|
||||
|
||||
- name: group3
|
||||
rules:
|
||||
- record: t2
|
||||
expr: t1
|
||||
|
||||
- name: group4
|
||||
rules:
|
||||
- record: t3
|
||||
expr: t1
|
||||
99
app/vmalert-tool/unittest/testdata/test1.yaml
vendored
Normal file
99
app/vmalert-tool/unittest/testdata/test1.yaml
vendored
Normal file
@@ -0,0 +1,99 @@
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
group_eval_order: ["group4", "group2", "group3"]
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
name: "basic test"
|
||||
input_series:
|
||||
- series: "test"
|
||||
values: "_x5 1x5 _ stale"
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 1m
|
||||
groupname: group1
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts:
|
||||
- {}
|
||||
- eval_time: 1m
|
||||
groupname: group2
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts: []
|
||||
- eval_time: 6m
|
||||
groupname: group1
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts: []
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: test
|
||||
eval_time: 11m
|
||||
exp_samples:
|
||||
- labels: '{__name__="test"}'
|
||||
value: 1
|
||||
- expr: test
|
||||
eval_time: 12m
|
||||
exp_samples: []
|
||||
|
||||
- interval: 1m
|
||||
name: "basic test2"
|
||||
input_series:
|
||||
- series: 'up{job="vmagent1", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
- series: "test"
|
||||
values: "0+1x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: count(ALERTS) by (alertgroup, alertname, alertstate)
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{alertgroup="group1", alertname="AlwaysFiring", alertstate="firing"}'
|
||||
value: 1
|
||||
- labels: '{alertgroup="group1", alertname="InstanceDown", alertstate="pending"}'
|
||||
value: 1
|
||||
- expr: t1
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- value: 4
|
||||
labels: '{__name__="t1", datacenter="dc-123"}'
|
||||
- expr: t2
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- value: 4
|
||||
labels: '{__name__="t2", datacenter="dc-123"}'
|
||||
- expr: t3
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
# t3 is 3 instead of 4 cause it's rules3 is evaluated before rules1
|
||||
- value: 3
|
||||
labels: '{__name__="t3", datacenter="dc-123"}'
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 10m
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent1
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent1 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: alerts
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
46
app/vmalert-tool/unittest/testdata/test2.yaml
vendored
Normal file
46
app/vmalert-tool/unittest/testdata/test2.yaml
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="vmagent2", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent2
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
83
app/vmalert-tool/unittest/type.go
Normal file
83
app/vmalert-tool/unittest/type.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
)
|
||||
|
||||
// parsedSample is a sample with parsed Labels
|
||||
type parsedSample struct {
|
||||
Labels datasource.Labels
|
||||
Value float64
|
||||
}
|
||||
|
||||
func (ps *parsedSample) String() string {
|
||||
return ps.Labels.String() + " " + strconv.FormatFloat(ps.Value, 'E', -1, 64)
|
||||
}
|
||||
|
||||
func parsedSamplesString(pss []parsedSample) string {
|
||||
if len(pss) == 0 {
|
||||
return "nil"
|
||||
}
|
||||
s := pss[0].String()
|
||||
for _, ps := range pss[1:] {
|
||||
s += ", " + ps.String()
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// labelAndAnnotation holds labels and annotations
|
||||
type labelAndAnnotation struct {
|
||||
Labels datasource.Labels
|
||||
Annotations datasource.Labels
|
||||
}
|
||||
|
||||
func (la *labelAndAnnotation) String() string {
|
||||
return "Labels:" + la.Labels.String() + "\nAnnotations:" + la.Annotations.String()
|
||||
}
|
||||
|
||||
// labelsAndAnnotations is collection of LabelAndAnnotation
|
||||
type labelsAndAnnotations []labelAndAnnotation
|
||||
|
||||
func (la labelsAndAnnotations) Len() int { return len(la) }
|
||||
|
||||
func (la labelsAndAnnotations) Swap(i, j int) { la[i], la[j] = la[j], la[i] }
|
||||
func (la labelsAndAnnotations) Less(i, j int) bool {
|
||||
diff := datasource.LabelCompare(la[i].Labels, la[j].Labels)
|
||||
if diff != 0 {
|
||||
return diff < 0
|
||||
}
|
||||
return datasource.LabelCompare(la[i].Annotations, la[j].Annotations) < 0
|
||||
}
|
||||
|
||||
func (la labelsAndAnnotations) String() string {
|
||||
if len(la) == 0 {
|
||||
return "[]"
|
||||
}
|
||||
s := "[\n0:" + indentLines("\n"+la[0].String(), " ")
|
||||
for i, l := range la[1:] {
|
||||
s += ",\n" + fmt.Sprintf("%d", i+1) + ":" + indentLines("\n"+l.String(), " ")
|
||||
}
|
||||
s += "\n]"
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// indentLines prefixes each line in the supplied string with the given "indent" string.
|
||||
func indentLines(lines, indent string) string {
|
||||
sb := strings.Builder{}
|
||||
n := strings.Split(lines, "\n")
|
||||
for i, l := range n {
|
||||
if i > 0 {
|
||||
sb.WriteString(indent)
|
||||
}
|
||||
sb.WriteString(l)
|
||||
if i != len(n)-1 {
|
||||
sb.WriteRune('\n')
|
||||
}
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
443
app/vmalert-tool/unittest/unittest.go
Normal file
443
app/vmalert-tool/unittest/unittest.go
Normal file
@@ -0,0 +1,443 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
|
||||
vmalertconfig "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/promremotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
storagePath string
|
||||
httpListenAddr = ":8880"
|
||||
// insert series from 1970-01-01T00:00:00
|
||||
testStartTime = time.Unix(0, 0).UTC()
|
||||
|
||||
testPromWriteHTTPPath = "http://127.0.0.1" + httpListenAddr + "/api/v1/write"
|
||||
testDataSourcePath = "http://127.0.0.1" + httpListenAddr + "/prometheus"
|
||||
testRemoteWritePath = "http://127.0.0.1" + httpListenAddr
|
||||
testHealthHTTPPath = "http://127.0.0.1" + httpListenAddr + "/health"
|
||||
|
||||
disableAlertgroupLabel bool
|
||||
)
|
||||
|
||||
const (
|
||||
testStoragePath = "vmalert-unittest"
|
||||
testLogLevel = "ERROR"
|
||||
)
|
||||
|
||||
// UnitTest runs unittest for files
|
||||
func UnitTest(files []string, disableGroupLabel bool) bool {
|
||||
if err := templates.Load([]string{}, true); err != nil {
|
||||
logger.Fatalf("failed to load template: %v", err)
|
||||
}
|
||||
storagePath = filepath.Join(os.TempDir(), testStoragePath)
|
||||
processFlags()
|
||||
vminsert.Init()
|
||||
vmselect.Init()
|
||||
// storagePath will be created again when closing vmselect, so remove it again.
|
||||
defer fs.MustRemoveAll(storagePath)
|
||||
defer vminsert.Stop()
|
||||
defer vmselect.Stop()
|
||||
disableAlertgroupLabel = disableGroupLabel
|
||||
return rulesUnitTest(files)
|
||||
}
|
||||
|
||||
func rulesUnitTest(files []string) bool {
|
||||
var failed bool
|
||||
for _, f := range files {
|
||||
if err := ruleUnitTest(f); err != nil {
|
||||
fmt.Println(" FAILED")
|
||||
fmt.Printf("\nfailed to run unit test for file %q: \n%v", f, err)
|
||||
failed = true
|
||||
} else {
|
||||
fmt.Println(" SUCCESS")
|
||||
}
|
||||
}
|
||||
return failed
|
||||
}
|
||||
|
||||
func ruleUnitTest(filename string) []error {
|
||||
fmt.Println("\nUnit Testing: ", filename)
|
||||
b, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to read file: %w", err)}
|
||||
}
|
||||
|
||||
var unitTestInp unitTestFile
|
||||
if err := yaml.UnmarshalStrict(b, &unitTestInp); err != nil {
|
||||
return []error{fmt.Errorf("failed to unmarshal file: %w", err)}
|
||||
}
|
||||
if err := resolveAndGlobFilepaths(filepath.Dir(filename), &unitTestInp); err != nil {
|
||||
return []error{fmt.Errorf("failed to resolve path for `rule_files`: %w", err)}
|
||||
}
|
||||
|
||||
if unitTestInp.EvaluationInterval.Duration() == 0 {
|
||||
fmt.Println("evaluation_interval set to 1m by default")
|
||||
unitTestInp.EvaluationInterval = &promutils.Duration{D: 1 * time.Minute}
|
||||
}
|
||||
|
||||
groupOrderMap := make(map[string]int)
|
||||
for i, gn := range unitTestInp.GroupEvalOrder {
|
||||
if _, ok := groupOrderMap[gn]; ok {
|
||||
return []error{fmt.Errorf("group name repeated in `group_eval_order`: %s", gn)}
|
||||
}
|
||||
groupOrderMap[gn] = i
|
||||
}
|
||||
|
||||
testGroups, err := vmalertconfig.Parse(unitTestInp.RuleFiles, nil, true)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to parse `rule_files`: %w", err)}
|
||||
}
|
||||
|
||||
var errs []error
|
||||
for _, t := range unitTestInp.Tests {
|
||||
if err := verifyTestGroup(t); err != nil {
|
||||
errs = append(errs, err)
|
||||
continue
|
||||
}
|
||||
testErrs := t.test(unitTestInp.EvaluationInterval.Duration(), groupOrderMap, testGroups)
|
||||
errs = append(errs, testErrs...)
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errs
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyTestGroup(group testGroup) error {
|
||||
var testGroupName string
|
||||
if group.TestGroupName != "" {
|
||||
testGroupName = fmt.Sprintf("testGroupName: %s\n", group.TestGroupName)
|
||||
}
|
||||
for _, at := range group.AlertRuleTests {
|
||||
if at.Alertname == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"alertname\"", testGroupName)
|
||||
}
|
||||
if !disableAlertgroupLabel && at.GroupName == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"groupname\" when flag \"disableAlertGroupLabel\" is false", testGroupName)
|
||||
}
|
||||
if disableAlertgroupLabel && at.GroupName != "" {
|
||||
return fmt.Errorf("\n%s shouldn't set filed \"groupname\" when flag \"disableAlertGroupLabel\" is true", testGroupName)
|
||||
}
|
||||
if at.EvalTime == nil {
|
||||
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
|
||||
}
|
||||
}
|
||||
for _, et := range group.MetricsqlExprTests {
|
||||
if et.Expr == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"expr\"", testGroupName)
|
||||
}
|
||||
if et.EvalTime == nil {
|
||||
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func processFlags() {
|
||||
flag.Parse()
|
||||
for _, fv := range []struct {
|
||||
flag string
|
||||
value string
|
||||
}{
|
||||
{flag: "storageDataPath", value: storagePath},
|
||||
{flag: "loggerLevel", value: testLogLevel},
|
||||
{flag: "search.disableCache", value: "true"},
|
||||
// set storage retention time to 100 years, allow to store series from 1970-01-01T00:00:00.
|
||||
{flag: "retentionPeriod", value: "100y"},
|
||||
{flag: "datasource.url", value: testDataSourcePath},
|
||||
{flag: "remoteWrite.url", value: testRemoteWritePath},
|
||||
} {
|
||||
// panics if flag doesn't exist
|
||||
if err := flag.Lookup(fv.flag).Value.Set(fv.value); err != nil {
|
||||
logger.Fatalf("unable to set %q with value %q, err: %v", fv.flag, fv.value, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func setUp() {
|
||||
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
|
||||
go httpserver.Serve(httpListenAddr, false, func(w http.ResponseWriter, r *http.Request) bool {
|
||||
switch r.URL.Path {
|
||||
case "/prometheus/api/v1/query":
|
||||
if err := prometheus.QueryHandler(nil, time.Now(), w, r); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
}
|
||||
return true
|
||||
case "/prometheus/api/v1/write", "/api/v1/write":
|
||||
if err := promremotewrite.InsertHandler(r); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
}
|
||||
return true
|
||||
default:
|
||||
}
|
||||
return false
|
||||
})
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
readyCheckFunc := func() bool {
|
||||
resp, err := http.Get(testHealthHTTPPath)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
_ = resp.Body.Close()
|
||||
return resp.StatusCode == 200
|
||||
}
|
||||
checkCheck:
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Fatalf("http server can't be ready in 30s")
|
||||
default:
|
||||
if readyCheckFunc() {
|
||||
break checkCheck
|
||||
}
|
||||
time.Sleep(3 * time.Second)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func tearDown() {
|
||||
if err := httpserver.Stop(httpListenAddr); err != nil {
|
||||
logger.Errorf("cannot stop the webservice: %s", err)
|
||||
}
|
||||
vmstorage.Stop()
|
||||
metrics.UnregisterAllMetrics()
|
||||
fs.MustRemoveAll(storagePath)
|
||||
}
|
||||
|
||||
// resolveAndGlobFilepaths joins all relative paths in a configuration
|
||||
// with a given base directory and replaces all globs with matching files.
|
||||
func resolveAndGlobFilepaths(baseDir string, utf *unitTestFile) error {
|
||||
for i, rf := range utf.RuleFiles {
|
||||
if rf != "" && !filepath.IsAbs(rf) {
|
||||
utf.RuleFiles[i] = filepath.Join(baseDir, rf)
|
||||
}
|
||||
}
|
||||
|
||||
var globbedFiles []string
|
||||
for _, rf := range utf.RuleFiles {
|
||||
m, err := filepath.Glob(rf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(m) == 0 {
|
||||
fmt.Fprintln(os.Stderr, " WARNING: no file match pattern", rf)
|
||||
}
|
||||
globbedFiles = append(globbedFiles, m...)
|
||||
}
|
||||
utf.RuleFiles = globbedFiles
|
||||
return nil
|
||||
}
|
||||
|
||||
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group) (checkErrs []error) {
|
||||
// set up vmstorage and http server for ingest and read queries
|
||||
setUp()
|
||||
// tear down vmstorage and clean the data dir
|
||||
defer tearDown()
|
||||
|
||||
err := writeInputSeries(tg.InputSeries, tg.Interval, testStartTime, testPromWriteHTTPPath)
|
||||
if err != nil {
|
||||
return []error{err}
|
||||
}
|
||||
|
||||
q, err := datasource.Init(nil)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to init datasource: %v", err)}
|
||||
}
|
||||
rw, err := remotewrite.NewDebugClient()
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to init wr: %v", err)}
|
||||
}
|
||||
|
||||
alertEvalTimesMap := map[time.Duration]struct{}{}
|
||||
alertExpResultMap := map[time.Duration]map[string]map[string][]expAlert{}
|
||||
for _, at := range tg.AlertRuleTests {
|
||||
et := at.EvalTime.Duration()
|
||||
alertEvalTimesMap[et] = struct{}{}
|
||||
if _, ok := alertExpResultMap[et]; !ok {
|
||||
alertExpResultMap[et] = make(map[string]map[string][]expAlert)
|
||||
}
|
||||
if _, ok := alertExpResultMap[et][at.GroupName]; !ok {
|
||||
alertExpResultMap[et][at.GroupName] = make(map[string][]expAlert)
|
||||
}
|
||||
alertExpResultMap[et][at.GroupName][at.Alertname] = at.ExpAlerts
|
||||
}
|
||||
alertEvalTimes := make([]time.Duration, 0, len(alertEvalTimesMap))
|
||||
for k := range alertEvalTimesMap {
|
||||
alertEvalTimes = append(alertEvalTimes, k)
|
||||
}
|
||||
sort.Slice(alertEvalTimes, func(i, j int) bool {
|
||||
return alertEvalTimes[i] < alertEvalTimes[j]
|
||||
})
|
||||
|
||||
// sort group eval order according to the given "group_eval_order".
|
||||
sort.Slice(testGroups, func(i, j int) bool {
|
||||
return groupOrderMap[testGroups[i].Name] < groupOrderMap[testGroups[j].Name]
|
||||
})
|
||||
|
||||
// create groups with given rule
|
||||
var groups []*rule.Group
|
||||
for _, group := range testGroups {
|
||||
ng := rule.NewGroup(group, q, time.Minute, tg.ExternalLabels)
|
||||
groups = append(groups, ng)
|
||||
}
|
||||
|
||||
evalIndex := 0
|
||||
maxEvalTime := testStartTime.Add(tg.maxEvalTime())
|
||||
for ts := testStartTime; ts.Before(maxEvalTime) || ts.Equal(maxEvalTime); ts = ts.Add(evalInterval) {
|
||||
for _, g := range groups {
|
||||
errs := g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, rw, ts)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\nfailed to exec group: %q, time: %s, err: %w", g.Name,
|
||||
ts, err))
|
||||
}
|
||||
}
|
||||
// flush series after each group evaluation
|
||||
vmstorage.Storage.DebugFlush()
|
||||
}
|
||||
|
||||
// check alert_rule_test case at every eval time
|
||||
for evalIndex < len(alertEvalTimes) {
|
||||
if ts.Sub(testStartTime) > alertEvalTimes[evalIndex] ||
|
||||
alertEvalTimes[evalIndex] >= ts.Add(evalInterval).Sub(testStartTime) {
|
||||
break
|
||||
}
|
||||
gotAlertsMap := map[string]map[string]labelsAndAnnotations{}
|
||||
for _, g := range groups {
|
||||
if disableAlertgroupLabel {
|
||||
g.Name = ""
|
||||
}
|
||||
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name]; !ok {
|
||||
continue
|
||||
}
|
||||
if _, ok := gotAlertsMap[g.Name]; !ok {
|
||||
gotAlertsMap[g.Name] = make(map[string]labelsAndAnnotations)
|
||||
}
|
||||
for _, r := range g.Rules {
|
||||
ar, isAlertRule := r.(*rule.AlertingRule)
|
||||
if !isAlertRule {
|
||||
continue
|
||||
}
|
||||
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name][ar.Name]; ok {
|
||||
for _, got := range ar.GetAlerts() {
|
||||
if got.State != notifier.StateFiring {
|
||||
continue
|
||||
}
|
||||
if disableAlertgroupLabel {
|
||||
delete(got.Labels, "alertgroup")
|
||||
}
|
||||
laa := labelAndAnnotation{
|
||||
Labels: datasource.ConvertToLabels(got.Labels),
|
||||
Annotations: datasource.ConvertToLabels(got.Annotations),
|
||||
}
|
||||
gotAlertsMap[g.Name][ar.Name] = append(gotAlertsMap[g.Name][ar.Name], laa)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
for groupname, gres := range alertExpResultMap[alertEvalTimes[evalIndex]] {
|
||||
for alertname, res := range gres {
|
||||
var expAlerts labelsAndAnnotations
|
||||
for _, expAlert := range res {
|
||||
if expAlert.ExpLabels == nil {
|
||||
expAlert.ExpLabels = make(map[string]string)
|
||||
}
|
||||
// alertGroupNameLabel is added as additional labels when `disableAlertGroupLabel` is false
|
||||
if !disableAlertgroupLabel {
|
||||
expAlert.ExpLabels["alertgroup"] = groupname
|
||||
}
|
||||
// alertNameLabel is added as additional labels in vmalert.
|
||||
expAlert.ExpLabels["alertname"] = alertname
|
||||
expAlerts = append(expAlerts, labelAndAnnotation{
|
||||
Labels: datasource.ConvertToLabels(expAlert.ExpLabels),
|
||||
Annotations: datasource.ConvertToLabels(expAlert.ExpAnnotations),
|
||||
})
|
||||
}
|
||||
sort.Sort(expAlerts)
|
||||
|
||||
gotAlerts := gotAlertsMap[groupname][alertname]
|
||||
sort.Sort(gotAlerts)
|
||||
if !reflect.DeepEqual(expAlerts, gotAlerts) {
|
||||
var testGroupName string
|
||||
if tg.TestGroupName != "" {
|
||||
testGroupName = fmt.Sprintf("testGroupName: %s,\n", tg.TestGroupName)
|
||||
}
|
||||
expString := indentLines(expAlerts.String(), " ")
|
||||
gotString := indentLines(gotAlerts.String(), " ")
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n%s groupname: %s, alertname: %s, time: %s, \n exp:%v, \n got:%v ",
|
||||
testGroupName, groupname, alertname, alertEvalTimes[evalIndex].String(), expString, gotString))
|
||||
}
|
||||
}
|
||||
}
|
||||
evalIndex++
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
checkErrs = append(checkErrs, checkMetricsqlCase(tg.MetricsqlExprTests, q)...)
|
||||
return checkErrs
|
||||
}
|
||||
|
||||
// unitTestFile holds the contents of a single unit test file
|
||||
type unitTestFile struct {
|
||||
RuleFiles []string `yaml:"rule_files"`
|
||||
EvaluationInterval *promutils.Duration `yaml:"evaluation_interval"`
|
||||
GroupEvalOrder []string `yaml:"group_eval_order"`
|
||||
Tests []testGroup `yaml:"tests"`
|
||||
}
|
||||
|
||||
// testGroup is a group of input series and test cases associated with it
|
||||
type testGroup struct {
|
||||
Interval *promutils.Duration `yaml:"interval"`
|
||||
InputSeries []series `yaml:"input_series"`
|
||||
AlertRuleTests []alertTestCase `yaml:"alert_rule_test"`
|
||||
MetricsqlExprTests []metricsqlTestCase `yaml:"metricsql_expr_test"`
|
||||
ExternalLabels map[string]string `yaml:"external_labels"`
|
||||
TestGroupName string `yaml:"name"`
|
||||
}
|
||||
|
||||
// maxEvalTime returns the max eval time among all alert_rule_test and metricsql_expr_test
|
||||
func (tg *testGroup) maxEvalTime() time.Duration {
|
||||
var maxd time.Duration
|
||||
for _, alert := range tg.AlertRuleTests {
|
||||
if alert.EvalTime.Duration() > maxd {
|
||||
maxd = alert.EvalTime.Duration()
|
||||
}
|
||||
}
|
||||
for _, met := range tg.MetricsqlExprTests {
|
||||
if met.EvalTime.Duration() > maxd {
|
||||
maxd = met.EvalTime.Duration()
|
||||
}
|
||||
}
|
||||
return maxd
|
||||
}
|
||||
47
app/vmalert-tool/unittest/unittest_test.go
Normal file
47
app/vmalert-tool/unittest/unittest_test.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package unittest
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
if err := templates.Load([]string{}, true); err != nil {
|
||||
os.Exit(1)
|
||||
}
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestUnitRule(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
disableGroupLabel bool
|
||||
files []string
|
||||
failed bool
|
||||
}{
|
||||
{
|
||||
name: "run multi files",
|
||||
files: []string{"./testdata/test1.yaml", "./testdata/test2.yaml"},
|
||||
failed: false,
|
||||
},
|
||||
{
|
||||
name: "disable group label",
|
||||
disableGroupLabel: true,
|
||||
files: []string{"./testdata/disable-group-label.yaml"},
|
||||
failed: false,
|
||||
},
|
||||
{
|
||||
name: "failing test",
|
||||
files: []string{"./testdata/failed-test.yaml"},
|
||||
failed: true,
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
fail := UnitTest(tc.files, tc.disableGroupLabel)
|
||||
if fail != tc.failed {
|
||||
t.Fatalf("failed to test %s, expect %t, got %t", tc.name, tc.failed, fail)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -112,18 +112,41 @@ name: <string>
|
||||
# How often rules in the group are evaluated.
|
||||
[ interval: <duration> | default = -evaluationInterval flag ]
|
||||
|
||||
# Optional
|
||||
# Group will be evaluated at the exact offset in the range of [0...interval].
|
||||
# E.g. for Group with `interval: 1h` and `eval_offset: 5m` the evaluation will
|
||||
# start at 5th minute of the hour. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3409
|
||||
# `eval_offset` can't be bigger than `interval`.
|
||||
[ eval_offset: <duration> ]
|
||||
|
||||
# Optional
|
||||
# Adjust the `time` parameter of group evaluation requests to compensate intentional query delay from the datasource.
|
||||
# By default, the value is inherited from the `-rule.evalDelay` cmd-line flag - see its description for details.
|
||||
# If group has `latency_offset` set in `params`, then it is recommended to set `eval_delay` equal to `latency_offset`.
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155 and https://docs.victoriametrics.com/keyConcepts.html#query-latency.
|
||||
[ eval_delay: <duration> ]
|
||||
|
||||
# Limit the number of alerts an alerting rule and series a recording
|
||||
# rule can produce. 0 is no limit.
|
||||
[ limit: <int> | default = 0 ]
|
||||
|
||||
# How many rules execute at once within a group. Increasing concurrency may speed
|
||||
# up round execution speed.
|
||||
# up group's evaluation duration (exposed via `vmalert_iteration_duration_seconds` metric).
|
||||
[ concurrency: <integer> | default = 1 ]
|
||||
|
||||
# Optional type for expressions inside the rules. Supported values: "graphite" and "prometheus".
|
||||
# By default "prometheus" type is used.
|
||||
# By default, "prometheus" type is used.
|
||||
[ type: <string> ]
|
||||
|
||||
# Optional
|
||||
# The evaluation timestamp will be aligned with group's interval,
|
||||
# instead of using the actual timestamp that evaluation happens at.
|
||||
# By default, it's enabled to get more predictable results
|
||||
# and to visually align with results plotted via Grafana or vmui.
|
||||
# See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049
|
||||
# Available starting from v1.95
|
||||
[ eval_alignment: <bool> | default true]
|
||||
|
||||
# Optional list of HTTP URL parameters
|
||||
# applied for all rules requests within a group
|
||||
# For example:
|
||||
@@ -423,7 +446,7 @@ If `-clusterMode` is enabled and the `tenant` in a particular group is missing,
|
||||
is obtained from `-defaultTenant.prometheus` or `-defaultTenant.graphite` depending on the `type` of the group.
|
||||
|
||||
The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files
|
||||
at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise`
|
||||
at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) and in `*-enterprise`
|
||||
tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).
|
||||
|
||||
### Reading rules from object storage
|
||||
@@ -519,14 +542,20 @@ Alertmanagers.
|
||||
|
||||
To avoid recording rules results and alerts state duplication in VictoriaMetrics server
|
||||
don't forget to configure [deduplication](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#deduplication).
|
||||
The recommended value for `-dedup.minScrapeInterval` must be multiple of vmalert's `evaluation_interval`.
|
||||
If you observe inconsistent or "jumping" values in series produced by vmalert, try disabling `-datasource.queryTimeAlignment`
|
||||
command line flag. Because of alignment, two or more vmalert HA pairs will produce results with the same timestamps.
|
||||
But due of backfilling (data delivered to the datasource with some delay) values of such results may differ,
|
||||
which would affect deduplication logic and result into "jumping" datapoints.
|
||||
Multiple equally configured vmalerts should evaluate rules at the same timestamps, so it is recommended
|
||||
to set `-dedup.minScrapeInterval` as equal to vmalert's `-evaluationInterval`.
|
||||
|
||||
If you have multiple different `interval` params for distinct rule groups, then set `-dedup.minScrapeInterval` to
|
||||
the biggest `interval` value, or value which will be a multiple for all `interval` values. For example, if you have
|
||||
two groups with `interval: 10s` and `interval: 15s`, then set `-dedup.minScrapeInterval=30s`. This would consistently
|
||||
keep only a single data point on 30s time interval for all rules. However, try to avoid having inconsistent `interval`
|
||||
values.
|
||||
|
||||
It is not recommended having `-dedup.minScrapeInterval` smaller than `-evaluationInterval`, as it may produce
|
||||
results with inconsistent intervals between data points.
|
||||
|
||||
Alertmanager will automatically deduplicate alerts with identical labels, so ensure that
|
||||
all `vmalert`s are having the same config.
|
||||
all `vmalert`s are having identical config.
|
||||
|
||||
Don't forget to configure [cluster mode](https://prometheus.io/docs/alerting/latest/alertmanager/)
|
||||
for Alertmanagers for better reliability. List all Alertmanager URLs in vmalert `-notifier.url`
|
||||
@@ -742,6 +771,11 @@ See full description for these flags in `./vmalert -help`.
|
||||
* `limit` group's param has no effect during replay (might be changed in future);
|
||||
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
|
||||
|
||||
## Unit Testing for Rules
|
||||
|
||||
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
|
||||
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
|
||||
@@ -771,15 +805,13 @@ may get empty response from the datasource and produce empty recording rules or
|
||||
|
||||
Try the following recommendations to reduce the chance of hitting the data delay issue:
|
||||
|
||||
* Always configure group's `evaluationInterval` to be bigger or at least equal to
|
||||
* Always configure group's `-evaluationInterval` to be bigger or at least equal to
|
||||
[time series resolution](https://docs.victoriametrics.com/keyConcepts.html#time-series-resolution);
|
||||
* Ensure that `[duration]` value is at least twice bigger than
|
||||
[time series resolution](https://docs.victoriametrics.com/keyConcepts.html#time-series-resolution). For example,
|
||||
if expression is `rate(my_metric[2m]) > 0` then ensure that `my_metric` resolution is at least `1m` or better `30s`.
|
||||
If you use VictoriaMetrics as datasource, `[duration]` can be omitted and VictoriaMetrics will adjust it automatically.
|
||||
* If you know in advance, that data in datasource is delayed - try changing vmalerts `-datasource.lookback`
|
||||
command-line flag to add a time shift for evaluations. Or extend `[duration]` to tolerate the delay.
|
||||
For example, `max_over_time(errors_total[10m]) > 0` will be active even if there is no data in datasource for last `9m`.
|
||||
* Extend `[duration]` in expr to help tolerate the delay. For example, `max_over_time(errors_total[10m]) > 0` will be active even if there is no data in datasource for last `9m`.
|
||||
* If [time series resolution](https://docs.victoriametrics.com/keyConcepts.html#time-series-resolution)
|
||||
in datasource is inconsistent or `>=5min` - try changing vmalerts `-datasource.queryStep` command-line flag to specify
|
||||
how far search query can lookback for the recent datapoint. The recommendation is to have the step
|
||||
@@ -787,9 +819,12 @@ at least two times bigger than the resolution.
|
||||
|
||||
> Please note, data delay is inevitable in distributed systems. And it is better to account for it instead of ignoring.
|
||||
|
||||
By default, recently written samples to VictoriaMetrics aren't visible for queries for up to 30s
|
||||
(see `-search.latencyOffset` command-line flag at vmselect). Such delay is needed to eliminate risk of incomplete
|
||||
data on the moment of querying, since metrics collectors won't be able to deliver the data in time.
|
||||
By default, recently written samples to VictoriaMetrics [aren't visible for queries](https://docs.victoriametrics.com/keyConcepts.html#query-latency)
|
||||
for up to 30s (see `-search.latencyOffset` command-line flag at vmselect or VictoriaMetrics single-node). Such delay is needed to eliminate risk of
|
||||
incomplete data on the moment of querying, due to chance that metrics collectors won't be able to deliver that data in time.
|
||||
To compensate the latency in timestamps for produced evaluation results, `-rule.evalDelay` is also set to `30s` by default.
|
||||
If you expect data to be delayed for longer intervals (it gets buffered, queued, or just network is slow sometimes)
|
||||
- consider increasing the `-rule.evalDelay` value accordingly.
|
||||
|
||||
### Alerts state
|
||||
|
||||
@@ -813,7 +848,8 @@ and check the `Last updates` section:
|
||||
Rows in the section represent ordered rule evaluations and their results. The column `curl` contains an example of
|
||||
HTTP request sent by vmalert to the `-datasource.url` during evaluation. If specific state shows that there were
|
||||
no samples returned and curl command returns data - then it is very likely there was no data in datasource on the
|
||||
moment when rule was evaluated.
|
||||
moment when rule was evaluated. Sensitive info is stripped from the `curl` examples - see [security](#security) section
|
||||
for more details.
|
||||
|
||||
### Debug mode
|
||||
|
||||
@@ -829,6 +865,8 @@ Just set `debug: true` in rule's configuration and vmalert will start printing a
|
||||
2022-09-15T13:36:56.153Z DEBUG rule "TestGroup":"Conns" (2601299393013563564) at 2022-09-15T15:36:56+02:00: alert 10705778000901301787 {alertgroup="TestGroup",alertname="Conns",cluster="east-1",instance="localhost:8429",replica="a"} PENDING => FIRING: 1m0s since becoming active at 2022-09-15 15:35:56.126006 +0200 CEST m=+39.384575417
|
||||
```
|
||||
|
||||
Sensitive info is stripped from the `curl` examples - see [security](#security) section for more details.
|
||||
|
||||
### Never-firing alerts
|
||||
|
||||
vmalert can detect if alert's expression doesn't match any time series in runtime
|
||||
@@ -873,6 +911,20 @@ The same issue can be caused by collision of configured `labels` on [Group](#gro
|
||||
To fix it one should avoid collisions by carefully picking label overrides in configuration.
|
||||
|
||||
|
||||
## Security
|
||||
|
||||
See general recommendations regarding security [here](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#security).
|
||||
|
||||
vmalert [web UI](#web) exposes configuration details such as list of [Groups](#groups), active alerts,
|
||||
[alerts state](#alerts-state), [notifiers](#notifier-configuration-file). Notifier addresses (sanitized) are attached
|
||||
as labels to metrics `vmalert_alerts_sent_.*` on `http://<vmalert>/metrics` page. Consider limiting user's access
|
||||
to the web UI or `/metrics` page if this information is sensitive.
|
||||
|
||||
[Alerts state](#alerts-state) page or [debug mode](#debug-mode) could emit additional information about configured
|
||||
datasource URL, GET params and headers. Sensitive information such as passwords or auth tokens is stripped by default.
|
||||
To disable stripping of such info pass `-datasource.showURL` cmd-line flag to vmalert.
|
||||
|
||||
|
||||
## Profiling
|
||||
|
||||
`vmalert` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
@@ -913,7 +965,7 @@ The shortlist of configuration flags is the following:
|
||||
{% raw %}
|
||||
```console
|
||||
-clusterMode
|
||||
If clusterMode is enabled, then vmalert automatically adds the tenant specified in config groups to -datasource.url, -remoteWrite.url and -remoteRead.url. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
If clusterMode is enabled, then vmalert automatically adds the tenant specified in config groups to -datasource.url, -remoteWrite.url and -remoteRead.url. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-configCheckInterval duration
|
||||
Interval for checking for changes in '-rule' or '-notifier.config' files. By default, the checking is disabled. Send SIGHUP signal in order to force config check for changes.
|
||||
-datasource.appendTypePrefix
|
||||
@@ -935,7 +987,7 @@ The shortlist of configuration flags is the following:
|
||||
-datasource.headers string
|
||||
Optional HTTP extraHeaders to send with each request to the corresponding -datasource.url. For example, -datasource.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -datasource.url. Multiple headers must be delimited by '^^': -datasource.headers='header1:value1^^header2:value2'
|
||||
-datasource.lookback duration
|
||||
Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
|
||||
Will be deprecated soon, please adjust "-search.latencyOffset" at datasource side or specify "latency_offset" in rule group's params. Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
|
||||
-datasource.maxIdleConnections int
|
||||
Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
|
||||
-datasource.oauth2.clientID string
|
||||
@@ -951,11 +1003,11 @@ The shortlist of configuration flags is the following:
|
||||
-datasource.queryStep duration
|
||||
How far a value can fallback to when evaluating queries. For example, if -datasource.queryStep=15s then param "step" with value "15s" will be added to every query. If set to 0, rule's evaluation interval will be used instead. (default 5m0s)
|
||||
-datasource.queryTimeAlignment
|
||||
Whether to align "time" parameter with evaluation interval.Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
|
||||
Deprecated: please use "eval_alignment" in rule group instead. Whether to align "time" parameter with evaluation interval. Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257 (default true)
|
||||
-datasource.roundDigits int
|
||||
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
|
||||
-datasource.showURL
|
||||
Whether to show -datasource.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
|
||||
Whether to avoid stripping sensitive information such as auth headers or passwords from URLs in log messages or UI and exported metrics. It is hidden by default, since it can contain sensitive info such as auth key
|
||||
-datasource.tlsCAFile string
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
|
||||
-datasource.tlsCertFile string
|
||||
@@ -969,9 +1021,9 @@ The shortlist of configuration flags is the following:
|
||||
-datasource.url string
|
||||
Datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect URL. Required parameter. E.g. http://127.0.0.1:8428 . See also -remoteRead.disablePathAppend and -datasource.showURL
|
||||
-defaultTenant.graphite string
|
||||
Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy .This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy .This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-defaultTenant.prometheus string
|
||||
Default tenant for Prometheus alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Default tenant for Prometheus alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-disableAlertgroupLabel
|
||||
Whether to disable adding group's Name as label to generated alerts and time series.
|
||||
-dryRun
|
||||
@@ -983,7 +1035,7 @@ The shortlist of configuration flags is the following:
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Deprecated, please use -license or -licenseFile flags instead. By specifying this flag, you confirm that you have an enterprise license and accept the ESA https://victoriametrics.com/legal/esa/ . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.alert.source string
|
||||
@@ -993,6 +1045,8 @@ The shortlist of configuration flags is the following:
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.
|
||||
-filestream.disableFadvise
|
||||
Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU
|
||||
-flagsAuthKey string
|
||||
Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-fs.disableMmap
|
||||
@@ -1001,6 +1055,12 @@ The shortlist of configuration flags is the following:
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth
|
||||
-http.header.csp string
|
||||
Value for 'Content-Security-Policy' header
|
||||
-http.header.frameOptions string
|
||||
Value for 'X-Frame-Options' header
|
||||
-http.header.hsts string
|
||||
Value for 'Strict-Transport-Security' header
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
@@ -1023,6 +1083,12 @@ The shortlist of configuration flags is the following:
|
||||
Whether to disable caches for interned strings. This may reduce memory usage at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringCacheExpireDuration and -internStringMaxLen
|
||||
-internStringMaxLen int
|
||||
The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500)
|
||||
-license string
|
||||
Lisense key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed via file specified by -licenseFile command-line flag
|
||||
-license.forceOffline
|
||||
Whether to enable offline verification for VictoriaMetrics Enterprise license key, which has been passed either via -license or via -licenseFile command-line flag. The issued license key must support offline verification feature. Contact info@victoriametrics.com if you need offline license verification. This flag is avilable only in Enterprise binaries
|
||||
-licenseFile string
|
||||
Path to file with license key for VictoriaMetrics Enterprise. See https://victoriametrics.com/products/enterprise/ . Trial Enterprise license can be obtained from https://victoriametrics.com/products/enterprise/trial/ . This flag is available only in Enterprise binaries. The license key can be also passed inline via -license command-line flag
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
@@ -1061,6 +1127,8 @@ The shortlist of configuration flags is the following:
|
||||
-notifier.bearerTokenFile array
|
||||
Optional path to bearer token file for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.blackhole -notifier.url
|
||||
Whether to blackhole alerting notifications. Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (eg. alertmanager). -notifier.url, `-notifier.config` and `-notifier.blackhole` are mutually exclusive.
|
||||
-notifier.config string
|
||||
Path to configuration file for notifiers
|
||||
-notifier.oauth2.clientID array
|
||||
@@ -1078,6 +1146,8 @@ The shortlist of configuration flags is the following:
|
||||
-notifier.oauth2.tokenUrl array
|
||||
Optional OAuth2 tokenURL to use for -notifier.url. If multiple args are set, then they are applied independently for the corresponding -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.showURL
|
||||
Whether to avoid stripping sensitive information such as passwords from URL in log messages or UI for -notifier.url. It is hidden by default, since it can contain sensitive info such as auth key
|
||||
-notifier.suppressDuplicateTargetErrors
|
||||
Whether to suppress 'duplicate target' errors during discovery
|
||||
-notifier.tlsCAFile array
|
||||
@@ -1098,8 +1168,6 @@ The shortlist of configuration flags is the following:
|
||||
-notifier.url array
|
||||
Prometheus Alertmanager URL, e.g. http://127.0.0.1:9093. List all Alertmanager URLs if it runs in the cluster mode to ensure high availability.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.blackhole bool
|
||||
Whether to blackhole alerting notifications. Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (eg. alertmanager). `-notifier.url`, `-notifier.config` and `-notifier.blackhole` are mutually exclusive.
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings
|
||||
-promscrape.consul.waitTime duration
|
||||
@@ -1173,7 +1241,7 @@ The shortlist of configuration flags is the following:
|
||||
-remoteWrite.bearerTokenFile string
|
||||
Optional path to bearer token file to use for -remoteWrite.url.
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of writers for concurrent writing into remote querier (default 1)
|
||||
Defines number of writers for concurrent writing into remote write endpoint (default 1)
|
||||
-remoteWrite.disablePathAppend
|
||||
Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.
|
||||
-remoteWrite.flushInterval duration
|
||||
@@ -1243,13 +1311,14 @@ The shortlist of configuration flags is the following:
|
||||
See https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage
|
||||
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-rule.evalDelay time
|
||||
Adjustment of the time parameter for rule evaluation requests to compensate intentional data delay from the datasource.Normally, should be equal to `-search.latencyOffset` (cmd-line flag configured for VictoriaMetrics single-node or vmselect). (default 30s)
|
||||
-rule.maxResolveDuration duration
|
||||
Limits the maximum duration for automatic alert expiration, which by default is 4 times evaluationInterval of the parent group.
|
||||
Limits the maxiMum duration for automatic alert expiration, which by default is 4 times evaluationInterval of the parent group
|
||||
-rule.resendDelay duration
|
||||
Minimum amount of time to wait before resending an alert to notifier
|
||||
MiniMum amount of time to wait before resending an alert to notifier
|
||||
-rule.templates array
|
||||
Path or glob pattern to location with go template definitions
|
||||
for rules annotations templating. Flag can be specified multiple times.
|
||||
Path or glob pattern to location with go template definitions for rules annotations templating. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule.templates="/path/to/file". Path to a single file with go templates
|
||||
-rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder,
|
||||
@@ -1263,22 +1332,18 @@ The shortlist of configuration flags is the following:
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
-rule.validateTemplates
|
||||
Whether to validate annotation and label templates (default true)
|
||||
-s2a_enable_appengine_dialer
|
||||
If true, opportunistically use AppEngine-specific dialer to call S2A.
|
||||
-s2a_timeout duration
|
||||
Timeout enforced on the connection to the S2A service for handshake. (default 3s)
|
||||
-s3.configFilePath string
|
||||
Path to file with S3 configs. Configs are loaded from default location if not set.
|
||||
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-s3.configProfile string
|
||||
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-s3.credsFilePath string
|
||||
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
|
||||
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-s3.customEndpoint string
|
||||
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html
|
||||
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html
|
||||
-s3.forcePathStyle
|
||||
Prefixing endpoint with bucket name when set false, true by default. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise.html (default true)
|
||||
Prefixing endpoint with bucket name when set false, true by default. This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise.html (default true)
|
||||
-tls
|
||||
Whether to enable TLS for incoming HTTP requests at -httpListenAddr (aka https). -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
@@ -1342,8 +1407,11 @@ For example:
|
||||
```yaml
|
||||
static_configs:
|
||||
- targets:
|
||||
# support using full url
|
||||
- 'http://alertmanager:9093/test/api/v2/alerts'
|
||||
- 'https://alertmanager:9093/api/v2/alerts'
|
||||
# the following target with only host:port will be used as <scheme>://localhost:9093/<path_prefix>/api/v2/alerts
|
||||
- localhost:9093
|
||||
- localhost:9095
|
||||
|
||||
consul_sd_configs:
|
||||
- server: localhost:8500
|
||||
@@ -1468,7 +1536,7 @@ software. Please keep simplicity as the main priority.
|
||||
## How to build from sources
|
||||
|
||||
It is recommended using
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
|
||||
|
||||
* `vmalert` is located in `vmutils-*` archives there.
|
||||
|
||||
@@ -1494,7 +1562,7 @@ spec:
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
|
||||
1. Run `make vmalert` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmalert` binary and puts it into the `bin` folder.
|
||||
|
||||
@@ -1510,7 +1578,7 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.19.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.20.
|
||||
1. Run `make vmalert-linux-arm` or `make vmalert-linux-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmalert-linux-arm` or `vmalert-linux-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
|
||||
@@ -19,10 +19,14 @@ import (
|
||||
// Group contains list of Rules grouped into
|
||||
// entity with one name and evaluation interval
|
||||
type Group struct {
|
||||
Type Type `yaml:"type,omitempty"`
|
||||
File string
|
||||
Name string `yaml:"name"`
|
||||
Interval *promutils.Duration `yaml:"interval,omitempty"`
|
||||
Type Type `yaml:"type,omitempty"`
|
||||
File string
|
||||
Name string `yaml:"name"`
|
||||
Interval *promutils.Duration `yaml:"interval,omitempty"`
|
||||
EvalOffset *promutils.Duration `yaml:"eval_offset,omitempty"`
|
||||
// EvalDelay will adjust the `time` parameter of rule evaluation requests to compensate intentional query delay from datasource.
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155
|
||||
EvalDelay *promutils.Duration `yaml:"eval_delay,omitempty"`
|
||||
Limit int `yaml:"limit,omitempty"`
|
||||
Rules []Rule `yaml:"rules"`
|
||||
Concurrency int `yaml:"concurrency"`
|
||||
@@ -38,6 +42,8 @@ type Group struct {
|
||||
Headers []Header `yaml:"headers,omitempty"`
|
||||
// NotifierHeaders contains optional HTTP headers sent to notifiers for generated notifications
|
||||
NotifierHeaders []Header `yaml:"notifier_headers,omitempty"`
|
||||
// EvalAlignment will make the timestamp of group query requests be aligned with interval
|
||||
EvalAlignment *bool `yaml:"eval_alignment,omitempty"`
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
@@ -63,11 +69,27 @@ func (g *Group) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Validate check for internal Group or Rule configuration errors
|
||||
// Validate checks configuration errors for group and internal rules
|
||||
func (g *Group) Validate(validateTplFn ValidateTplFn, validateExpressions bool) error {
|
||||
if g.Name == "" {
|
||||
return fmt.Errorf("group name must be set")
|
||||
}
|
||||
if g.Interval.Duration() < 0 {
|
||||
return fmt.Errorf("interval shouldn't be lower than 0")
|
||||
}
|
||||
if g.EvalOffset.Duration() < 0 {
|
||||
return fmt.Errorf("eval_offset shouldn't be lower than 0")
|
||||
}
|
||||
// if `eval_offset` is set, interval won't use global evaluationInterval flag and must bigger than offset.
|
||||
if g.EvalOffset.Duration() > g.Interval.Duration() {
|
||||
return fmt.Errorf("eval_offset should be smaller than interval; now eval_offset: %v, interval: %v", g.EvalOffset.Duration(), g.Interval.Duration())
|
||||
}
|
||||
if g.Limit < 0 {
|
||||
return fmt.Errorf("invalid limit %d, shouldn't be less than 0", g.Limit)
|
||||
}
|
||||
if g.Concurrency < 0 {
|
||||
return fmt.Errorf("invalid concurrency %d, shouldn't be less than 0", g.Concurrency)
|
||||
}
|
||||
|
||||
uniqueRules := map[uint64]struct{}{}
|
||||
for _, r := range g.Rules {
|
||||
@@ -76,26 +98,26 @@ func (g *Group) Validate(validateTplFn ValidateTplFn, validateExpressions bool)
|
||||
ruleName = r.Alert
|
||||
}
|
||||
if _, ok := uniqueRules[r.ID]; ok {
|
||||
return fmt.Errorf("%q is a duplicate within the group %q", r.String(), g.Name)
|
||||
return fmt.Errorf("%q is a duplicate in group", r.String())
|
||||
}
|
||||
uniqueRules[r.ID] = struct{}{}
|
||||
if err := r.Validate(); err != nil {
|
||||
return fmt.Errorf("invalid rule %q.%q: %w", g.Name, ruleName, err)
|
||||
return fmt.Errorf("invalid rule %q: %w", ruleName, err)
|
||||
}
|
||||
if validateExpressions {
|
||||
// its needed only for tests.
|
||||
// because correct types must be inherited after unmarshalling.
|
||||
exprValidator := g.Type.ValidateExpr
|
||||
if err := exprValidator(r.Expr); err != nil {
|
||||
return fmt.Errorf("invalid expression for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
return fmt.Errorf("invalid expression for rule %q: %w", ruleName, err)
|
||||
}
|
||||
}
|
||||
if validateTplFn != nil {
|
||||
if err := validateTplFn(r.Annotations); err != nil {
|
||||
return fmt.Errorf("invalid annotations for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
return fmt.Errorf("invalid annotations for rule %q: %w", ruleName, err)
|
||||
}
|
||||
if err := validateTplFn(r.Labels); err != nil {
|
||||
return fmt.Errorf("invalid labels for rule %q.%q: %w", g.Name, ruleName, err)
|
||||
return fmt.Errorf("invalid labels for rule %q: %w", ruleName, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -214,7 +236,7 @@ func ParseSilent(pathPatterns []string, validateTplFn ValidateTplFn, validateExp
|
||||
|
||||
files, err := readFromFS(pathPatterns)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read from the config: %s", err)
|
||||
return nil, fmt.Errorf("failed to read from the config: %w", err)
|
||||
}
|
||||
return parse(files, validateTplFn, validateExpressions)
|
||||
}
|
||||
@@ -223,11 +245,11 @@ func ParseSilent(pathPatterns []string, validateTplFn ValidateTplFn, validateExp
|
||||
func Parse(pathPatterns []string, validateTplFn ValidateTplFn, validateExpressions bool) ([]Group, error) {
|
||||
files, err := readFromFS(pathPatterns)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read from the config: %s", err)
|
||||
return nil, fmt.Errorf("failed to read from the config: %w", err)
|
||||
}
|
||||
groups, err := parse(files, validateTplFn, validateExpressions)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse %s: %s", pathPatterns, err)
|
||||
return nil, fmt.Errorf("failed to parse %s: %w", pathPatterns, err)
|
||||
}
|
||||
if len(groups) < 1 {
|
||||
cLogger.Warnf("no groups found in %s", strings.Join(pathPatterns, ";"))
|
||||
|
||||
@@ -68,6 +68,10 @@ func TestParseBad(t *testing.T) {
|
||||
path []string
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
[]string{"testdata/rules/rules_interval_bad.rules"},
|
||||
"eval_offset should be smaller than interval",
|
||||
},
|
||||
{
|
||||
[]string{"testdata/rules/rules0-bad.rules"},
|
||||
"unexpected token",
|
||||
@@ -102,7 +106,7 @@ func TestParseBad(t *testing.T) {
|
||||
},
|
||||
{
|
||||
[]string{"http://unreachable-url"},
|
||||
"failed to read",
|
||||
"failed to",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
@@ -141,6 +145,35 @@ func TestGroup_Validate(t *testing.T) {
|
||||
group: &Group{},
|
||||
expErr: "group name must be set",
|
||||
},
|
||||
{
|
||||
group: &Group{
|
||||
Name: "negative interval",
|
||||
Interval: promutils.NewDuration(-1),
|
||||
},
|
||||
expErr: "interval shouldn't be lower than 0",
|
||||
},
|
||||
{
|
||||
group: &Group{
|
||||
Name: "wrong eval_offset",
|
||||
Interval: promutils.NewDuration(time.Minute),
|
||||
EvalOffset: promutils.NewDuration(2 * time.Minute),
|
||||
},
|
||||
expErr: "eval_offset should be smaller than interval",
|
||||
},
|
||||
{
|
||||
group: &Group{
|
||||
Name: "wrong limit",
|
||||
Limit: -1,
|
||||
},
|
||||
expErr: "invalid limit",
|
||||
},
|
||||
{
|
||||
group: &Group{
|
||||
Name: "wrong concurrency",
|
||||
Concurrency: -1,
|
||||
},
|
||||
expErr: "invalid concurrency",
|
||||
},
|
||||
{
|
||||
group: &Group{
|
||||
Name: "test",
|
||||
|
||||
@@ -49,7 +49,7 @@ func (fs *FS) Read(files []string) (map[string][]byte, error) {
|
||||
path, resp.StatusCode, http.StatusOK, data)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read %q: %s", path, err)
|
||||
return nil, fmt.Errorf("cannot read %q: %w", path, err)
|
||||
}
|
||||
result[path] = data
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@ groups:
|
||||
interval: 2s
|
||||
concurrency: 2
|
||||
type: prometheus
|
||||
eval_delay: 30s
|
||||
rules:
|
||||
- alert: Conns
|
||||
expr: sum(vm_tcplistener_conns) by (instance) > 1
|
||||
|
||||
13
app/vmalert/config/testdata/rules/rules_interval_bad.rules
vendored
Normal file
13
app/vmalert/config/testdata/rules/rules_interval_bad.rules
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
groups:
|
||||
- name: groupTest
|
||||
## default interval is 1min, eval_offset shouldn't be greater than interval
|
||||
eval_offset: 2m
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 2s
|
||||
expr: sum(rate(vm_http_request_errors_total[2s])) > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
131
app/vmalert/datasource/faker.go
Normal file
131
app/vmalert/datasource/faker.go
Normal file
@@ -0,0 +1,131 @@
|
||||
package datasource
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FakeQuerier is a mock querier that return predefined results and error message
|
||||
type FakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []Metric
|
||||
err error
|
||||
}
|
||||
|
||||
// SetErr sets query error message
|
||||
func (fq *FakeQuerier) SetErr(err error) {
|
||||
fq.Lock()
|
||||
fq.err = err
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
// Reset reset querier's error message and results
|
||||
func (fq *FakeQuerier) Reset() {
|
||||
fq.Lock()
|
||||
fq.err = nil
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
// Add appends metrics to querier result metrics
|
||||
func (fq *FakeQuerier) Add(metrics ...Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
// BuildWithParams return FakeQuerier itself
|
||||
func (fq *FakeQuerier) BuildWithParams(_ QuerierParams) Querier {
|
||||
return fq
|
||||
}
|
||||
|
||||
// QueryRange performs query
|
||||
func (fq *FakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
|
||||
req, _, err := fq.Query(ctx, q, time.Now())
|
||||
return req, err
|
||||
}
|
||||
|
||||
// Query returns metrics restored in querier
|
||||
func (fq *FakeQuerier) Query(_ context.Context, _ string, _ time.Time) (Result, *http.Request, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
return Result{}, nil, fq.err
|
||||
}
|
||||
cp := make([]Metric, len(fq.metrics))
|
||||
copy(cp, fq.metrics)
|
||||
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
|
||||
return Result{Data: cp}, req, nil
|
||||
}
|
||||
|
||||
// FakeQuerierWithRegistry can store different results for different query expr
|
||||
type FakeQuerierWithRegistry struct {
|
||||
sync.Mutex
|
||||
registry map[string][]Metric
|
||||
}
|
||||
|
||||
// Set stores query result for given key
|
||||
func (fqr *FakeQuerierWithRegistry) Set(key string, metrics ...Metric) {
|
||||
fqr.Lock()
|
||||
if fqr.registry == nil {
|
||||
fqr.registry = make(map[string][]Metric)
|
||||
}
|
||||
fqr.registry[key] = metrics
|
||||
fqr.Unlock()
|
||||
}
|
||||
|
||||
// Reset clean querier's results registry
|
||||
func (fqr *FakeQuerierWithRegistry) Reset() {
|
||||
fqr.Lock()
|
||||
fqr.registry = nil
|
||||
fqr.Unlock()
|
||||
}
|
||||
|
||||
// BuildWithParams returns itself
|
||||
func (fqr *FakeQuerierWithRegistry) BuildWithParams(_ QuerierParams) Querier {
|
||||
return fqr
|
||||
}
|
||||
|
||||
// QueryRange performs query
|
||||
func (fqr *FakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
|
||||
req, _, err := fqr.Query(ctx, q, time.Now())
|
||||
return req, err
|
||||
}
|
||||
|
||||
// Query returns metrics restored in querier registry
|
||||
func (fqr *FakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (Result, *http.Request, error) {
|
||||
fqr.Lock()
|
||||
defer fqr.Unlock()
|
||||
|
||||
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
|
||||
metrics, ok := fqr.registry[expr]
|
||||
if !ok {
|
||||
return Result{}, req, nil
|
||||
}
|
||||
cp := make([]Metric, len(metrics))
|
||||
copy(cp, metrics)
|
||||
return Result{Data: cp}, req, nil
|
||||
}
|
||||
|
||||
// FakeQuerierWithDelay mock querier with given delay duration
|
||||
type FakeQuerierWithDelay struct {
|
||||
FakeQuerier
|
||||
Delay time.Duration
|
||||
}
|
||||
|
||||
// Query returns query result after delay duration
|
||||
func (fqd *FakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (Result, *http.Request, error) {
|
||||
timer := time.NewTimer(fqd.Delay)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-timer.C:
|
||||
}
|
||||
return fqd.FakeQuerier.Query(ctx, expr, ts)
|
||||
}
|
||||
|
||||
// BuildWithParams returns itself
|
||||
func (fqd *FakeQuerierWithDelay) BuildWithParams(_ QuerierParams) Querier {
|
||||
return fqd
|
||||
}
|
||||
@@ -10,13 +10,14 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("datasource.url", "", "Datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect URL. Required parameter. "+
|
||||
"E.g. http://127.0.0.1:8428 . See also -remoteRead.disablePathAppend and -datasource.showURL")
|
||||
appendTypePrefix = flag.Bool("datasource.appendTypePrefix", false, "Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.")
|
||||
showDatasourceURL = flag.Bool("datasource.showURL", false, "Whether to show -datasource.url in the exported metrics. "+
|
||||
showDatasourceURL = flag.Bool("datasource.showURL", false, "Whether to avoid stripping sensitive information such as auth headers or passwords from URLs in log messages or UI and exported metrics. "+
|
||||
"It is hidden by default, since it can contain sensitive info such as auth key")
|
||||
|
||||
headers = flag.String("datasource.headers", "", "Optional HTTP extraHeaders to send with each request to the corresponding -datasource.url. "+
|
||||
@@ -42,12 +43,16 @@ var (
|
||||
oauth2TokenURL = flag.String("datasource.oauth2.tokenUrl", "", "Optional OAuth2 tokenURL to use for -datasource.url.")
|
||||
oauth2Scopes = flag.String("datasource.oauth2.scopes", "", "Optional OAuth2 scopes to use for -datasource.url. Scopes must be delimited by ';'")
|
||||
|
||||
lookBack = flag.Duration("datasource.lookback", 0, `Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.`)
|
||||
lookBack = flag.Duration("datasource.lookback", 0, `Will be deprecated soon, please adjust "-search.latencyOffset" at datasource side `+
|
||||
`or specify "latency_offset" in rule group's params. Lookback defines how far into the past to look when evaluating queries. `+
|
||||
`For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.`)
|
||||
queryStep = flag.Duration("datasource.queryStep", 5*time.Minute, "How far a value can fallback to when evaluating queries. "+
|
||||
"For example, if -datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query. "+
|
||||
"If set to 0, rule's evaluation interval will be used instead.")
|
||||
queryTimeAlignment = flag.Bool("datasource.queryTimeAlignment", true, `Whether to align "time" parameter with evaluation interval.`+
|
||||
"Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. See more details here https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
|
||||
queryTimeAlignment = flag.Bool("datasource.queryTimeAlignment", true, `Deprecated: please use "eval_alignment" in rule group instead. `+
|
||||
`Whether to align "time" parameter with evaluation interval. `+
|
||||
"Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. "+
|
||||
"See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
|
||||
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
|
||||
disableKeepAlive = flag.Bool("datasource.disableKeepAlive", false, `Whether to disable long-lived connections to the datasource. `+
|
||||
`If true, disables HTTP keep-alives and will only use the connection to the server for a single HTTP request.`)
|
||||
@@ -62,6 +67,11 @@ func InitSecretFlags() {
|
||||
}
|
||||
}
|
||||
|
||||
// ShowDatasourceURL whether to show -datasource.url with sensitive information
|
||||
func ShowDatasourceURL() bool {
|
||||
return *showDatasourceURL
|
||||
}
|
||||
|
||||
// Param represents an HTTP GET param
|
||||
type Param struct {
|
||||
Key, Value string
|
||||
@@ -74,6 +84,12 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
|
||||
if *addr == "" {
|
||||
return nil, fmt.Errorf("datasource.url is empty")
|
||||
}
|
||||
if !*queryTimeAlignment {
|
||||
logger.Warnf("flag `-datasource.queryTimeAlignment` is deprecated and will be removed in next releases. Please use `eval_alignment` in rule group instead.")
|
||||
}
|
||||
if *lookBack != 0 {
|
||||
logger.Warnf("flag `-datasource.lookback` will be deprecated soon. Please use `-rule.evalDelay` command-line flag instead. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155 for details.")
|
||||
}
|
||||
|
||||
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
@@ -100,6 +116,10 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to configure auth: %w", err)
|
||||
}
|
||||
_, err = authCfg.GetAuthHeader()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to set request auth header to datasource %q: %w", *addr, err)
|
||||
}
|
||||
|
||||
return &VMStorage{
|
||||
c: &http.Client{Transport: tr},
|
||||
|
||||
@@ -37,11 +37,14 @@ type VMStorage struct {
|
||||
appendTypePrefix bool
|
||||
lookBack time.Duration
|
||||
queryStep time.Duration
|
||||
dataSourceType datasourceType
|
||||
|
||||
dataSourceType datasourceType
|
||||
// evaluationInterval will help setting request's `step` param.
|
||||
evaluationInterval time.Duration
|
||||
extraParams url.Values
|
||||
extraHeaders []keyValue
|
||||
// extraParams contains params to be attached to each HTTP request
|
||||
extraParams url.Values
|
||||
// extraHeaders are headers to be attached to each HTTP request
|
||||
extraHeaders []keyValue
|
||||
|
||||
// whether to print additional log messages
|
||||
// for each sent request
|
||||
@@ -91,8 +94,15 @@ func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage {
|
||||
s.extraParams = url.Values{}
|
||||
}
|
||||
for k, vl := range params.QueryParams {
|
||||
for _, v := range vl { // custom query params are prior to default ones
|
||||
s.extraParams.Set(k, v)
|
||||
// custom query params are prior to default ones
|
||||
if s.extraParams.Has(k) {
|
||||
s.extraParams.Del(k)
|
||||
}
|
||||
for _, v := range vl {
|
||||
// don't use .Set() instead of Del/Add since it is allowed
|
||||
// for GET params to be duplicated
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4908
|
||||
s.extraParams.Add(k, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -127,33 +137,35 @@ func NewVMStorage(baseURL string, authCfg *promauth.Config, lookBack time.Durati
|
||||
|
||||
// Query executes the given query and returns parsed response
|
||||
func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) (Result, *http.Request, error) {
|
||||
req, err := s.newRequestPOST()
|
||||
req, err := s.newQueryRequest(query, ts)
|
||||
if err != nil {
|
||||
return Result{}, nil, err
|
||||
}
|
||||
|
||||
switch s.dataSourceType {
|
||||
case "", datasourcePrometheus:
|
||||
s.setPrometheusInstantReqParams(req, query, ts)
|
||||
case datasourceGraphite:
|
||||
s.setGraphiteReqParams(req, query, ts)
|
||||
default:
|
||||
return Result{}, nil, fmt.Errorf("engine not found: %q", s.dataSourceType)
|
||||
}
|
||||
|
||||
resp, err := s.do(ctx, req)
|
||||
if err != nil {
|
||||
return Result{}, req, err
|
||||
if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
// Return unexpected error to the caller.
|
||||
return Result{}, nil, err
|
||||
}
|
||||
// Something in the middle between client and datasource might be closing
|
||||
// the connection. So we do a one more attempt in hope request will succeed.
|
||||
req, err = s.newQueryRequest(query, ts)
|
||||
if err != nil {
|
||||
return Result{}, nil, fmt.Errorf("second attempt: %w", err)
|
||||
}
|
||||
resp, err = s.do(ctx, req)
|
||||
if err != nil {
|
||||
return Result{}, nil, fmt.Errorf("second attempt: %w", err)
|
||||
}
|
||||
}
|
||||
defer func() {
|
||||
_ = resp.Body.Close()
|
||||
}()
|
||||
|
||||
// Process the received response.
|
||||
parseFn := parsePrometheusResponse
|
||||
if s.dataSourceType != datasourcePrometheus {
|
||||
parseFn = parseGraphiteResponse
|
||||
}
|
||||
result, err := parseFn(req, resp)
|
||||
_ = resp.Body.Close()
|
||||
return result, req, err
|
||||
}
|
||||
|
||||
@@ -164,56 +176,96 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim
|
||||
if s.dataSourceType != datasourcePrometheus {
|
||||
return res, fmt.Errorf("%q is not supported for QueryRange", s.dataSourceType)
|
||||
}
|
||||
req, err := s.newRequestPOST()
|
||||
if err != nil {
|
||||
return res, err
|
||||
}
|
||||
if start.IsZero() {
|
||||
return res, fmt.Errorf("start param is missing")
|
||||
}
|
||||
if end.IsZero() {
|
||||
return res, fmt.Errorf("end param is missing")
|
||||
}
|
||||
s.setPrometheusRangeReqParams(req, query, start, end)
|
||||
resp, err := s.do(ctx, req)
|
||||
req, err := s.newQueryRangeRequest(query, start, end)
|
||||
if err != nil {
|
||||
return res, err
|
||||
}
|
||||
defer func() {
|
||||
_ = resp.Body.Close()
|
||||
}()
|
||||
return parsePrometheusResponse(req, resp)
|
||||
resp, err := s.do(ctx, req)
|
||||
if err != nil {
|
||||
if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
// Return unexpected error to the caller.
|
||||
return res, err
|
||||
}
|
||||
// Something in the middle between client and datasource might be closing
|
||||
// the connection. So we do a one more attempt in hope request will succeed.
|
||||
req, err = s.newQueryRangeRequest(query, start, end)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("second attempt: %w", err)
|
||||
}
|
||||
resp, err = s.do(ctx, req)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("second attempt: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Process the received response.
|
||||
res, err = parsePrometheusResponse(req, resp)
|
||||
_ = resp.Body.Close()
|
||||
return res, err
|
||||
}
|
||||
|
||||
func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) {
|
||||
ru := req.URL.Redacted()
|
||||
if *showDatasourceURL {
|
||||
ru = req.URL.String()
|
||||
}
|
||||
if s.debug {
|
||||
logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, req.URL.RawQuery)
|
||||
logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, ru)
|
||||
}
|
||||
resp, err := s.c.Do(req.WithContext(ctx))
|
||||
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
// something in the middle between client and datasource might be closing
|
||||
// the connection. So we do a one more attempt in hope request will succeed.
|
||||
resp, err = s.c.Do(req.WithContext(ctx))
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err)
|
||||
return nil, fmt.Errorf("error getting response from %s: %w", ru, err)
|
||||
}
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
_ = resp.Body.Close()
|
||||
return nil, fmt.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL.Redacted(), body)
|
||||
return nil, fmt.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, ru, body)
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func (s *VMStorage) newRequestPOST() (*http.Request, error) {
|
||||
func (s *VMStorage) newQueryRangeRequest(query string, start, end time.Time) (*http.Request, error) {
|
||||
req, err := s.newRequest()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot create query_range request to datasource %q: %w", s.datasourceURL, err)
|
||||
}
|
||||
s.setPrometheusRangeReqParams(req, query, start, end)
|
||||
return req, nil
|
||||
}
|
||||
|
||||
func (s *VMStorage) newQueryRequest(query string, ts time.Time) (*http.Request, error) {
|
||||
req, err := s.newRequest()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot create query request to datasource %q: %w", s.datasourceURL, err)
|
||||
}
|
||||
switch s.dataSourceType {
|
||||
case "", datasourcePrometheus:
|
||||
s.setPrometheusInstantReqParams(req, query, ts)
|
||||
case datasourceGraphite:
|
||||
s.setGraphiteReqParams(req, query, ts)
|
||||
default:
|
||||
logger.Panicf("BUG: engine not found: %q", s.dataSourceType)
|
||||
}
|
||||
return req, nil
|
||||
}
|
||||
|
||||
func (s *VMStorage) newRequest() (*http.Request, error) {
|
||||
req, err := http.NewRequest(http.MethodPost, s.datasourceURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", s.datasourceURL, err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
if s.authCfg != nil {
|
||||
s.authCfg.SetHeaders(req, true)
|
||||
err = s.authCfg.SetHeaders(req, true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
for _, h := range s.extraHeaders {
|
||||
req.Header.Set(h.key, h.value)
|
||||
|
||||
@@ -112,14 +112,14 @@ func parsePrometheusResponse(req *http.Request, resp *http.Response) (res Result
|
||||
return res, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL.Redacted(), r.ErrorType, r.Error)
|
||||
}
|
||||
if r.Status != statusSuccess {
|
||||
return res, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
|
||||
return res, fmt.Errorf("unknown status: %s, Expected success or error", r.Status)
|
||||
}
|
||||
var parseFn func() ([]Metric, error)
|
||||
switch r.Data.ResultType {
|
||||
case rtVector:
|
||||
var pi promInstant
|
||||
if err := json.Unmarshal(r.Data.Result, &pi.Result); err != nil {
|
||||
return res, fmt.Errorf("umarshal err %s; \n %#v", err, string(r.Data.Result))
|
||||
return res, fmt.Errorf("unmarshal err %w; \n %#v", err, string(r.Data.Result))
|
||||
}
|
||||
parseFn = pi.metrics
|
||||
case rtMatrix:
|
||||
@@ -164,10 +164,6 @@ func (s *VMStorage) setPrometheusInstantReqParams(r *http.Request, query string,
|
||||
if s.lookBack > 0 {
|
||||
timestamp = timestamp.Add(-s.lookBack)
|
||||
}
|
||||
if *queryTimeAlignment && s.evaluationInterval > 0 {
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1232
|
||||
timestamp = timestamp.Truncate(s.evaluationInterval)
|
||||
}
|
||||
q.Set("time", timestamp.Format(time.RFC3339))
|
||||
if !*disableStepParam && s.evaluationInterval > 0 { // set step as evaluationInterval by default
|
||||
// always convert to seconds to keep compatibility with older
|
||||
|
||||
@@ -506,8 +506,7 @@ func TestRequestParams(t *testing.T) {
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
evalInterval := 15 * time.Second
|
||||
tt := timestamp.Truncate(evalInterval)
|
||||
exp := url.Values{"query": {query}, "step": {evalInterval.String()}, "time": {tt.Format(time.RFC3339)}}
|
||||
exp := url.Values{"query": {query}, "step": {evalInterval.String()}, "time": {timestamp.Format(time.RFC3339)}}
|
||||
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
@@ -521,7 +520,6 @@ func TestRequestParams(t *testing.T) {
|
||||
func(t *testing.T, r *http.Request) {
|
||||
evalInterval := 15 * time.Second
|
||||
tt := timestamp.Add(-time.Minute)
|
||||
tt = tt.Truncate(evalInterval)
|
||||
exp := url.Values{"query": {query}, "step": {evalInterval.String()}, "time": {tt.Format(time.RFC3339)}}
|
||||
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
|
||||
},
|
||||
@@ -549,8 +547,7 @@ func TestRequestParams(t *testing.T) {
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
evalInterval := 3 * time.Hour
|
||||
tt := timestamp.Truncate(evalInterval)
|
||||
exp := url.Values{"query": {query}, "step": {fmt.Sprintf("%ds", int(evalInterval.Seconds()))}, "time": {tt.Format(time.RFC3339)}}
|
||||
exp := url.Values{"query": {query}, "step": {fmt.Sprintf("%ds", int(evalInterval.Seconds()))}, "time": {timestamp.Format(time.RFC3339)}}
|
||||
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
@@ -596,6 +593,17 @@ func TestRequestParams(t *testing.T) {
|
||||
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"allow duplicates in query params",
|
||||
false,
|
||||
storage.Clone().ApplyParams(QuerierParams{
|
||||
QueryParams: url.Values{"extra_labels": {"env=dev", "foo=bar"}},
|
||||
}),
|
||||
func(t *testing.T, r *http.Request) {
|
||||
exp := url.Values{"query": {query}, "round_digits": {"10"}, "extra_labels": {"env=dev", "foo=bar"}, "time": {timestamp.Format(time.RFC3339)}}
|
||||
checkEqualString(t, exp.Encode(), r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"graphite extra params",
|
||||
false,
|
||||
@@ -629,9 +637,9 @@ func TestRequestParams(t *testing.T) {
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
req, err := tc.vm.newRequestPOST()
|
||||
req, err := tc.vm.newRequest()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
t.Fatal(err)
|
||||
}
|
||||
switch tc.vm.dataSourceType {
|
||||
case "", datasourcePrometheus:
|
||||
@@ -727,9 +735,9 @@ func TestHeaders(t *testing.T) {
|
||||
for _, tt := range testCases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
vm := tt.vmFn()
|
||||
req, err := vm.newRequestPOST()
|
||||
req, err := vm.newQueryRequest("foo", time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
t.Fatal(err)
|
||||
}
|
||||
tt.checkFn(t, req)
|
||||
})
|
||||
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
@@ -46,8 +47,8 @@ all files with prefix rule_ in folder dir.
|
||||
See https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage
|
||||
`)
|
||||
|
||||
ruleTemplatesPath = flagutil.NewArrayString("rule.templates", `Path or glob pattern to location with go template definitions
|
||||
for rules annotations templating. Flag can be specified multiple times.
|
||||
ruleTemplatesPath = flagutil.NewArrayString("rule.templates", `Path or glob pattern to location with go template definitions `+
|
||||
`for rules annotations templating. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule.templates="/path/to/file". Path to a single file with go templates
|
||||
-rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder,
|
||||
@@ -66,11 +67,6 @@ absolute path to all .tpl files in root.
|
||||
|
||||
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
|
||||
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
||||
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
|
||||
"which by default is 4 times evaluationInterval of the parent group.")
|
||||
resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
|
||||
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
|
||||
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
|
||||
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.")
|
||||
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager `+
|
||||
@@ -82,12 +78,8 @@ absolute path to all .tpl files in root.
|
||||
externalLabels = flagutil.NewArrayString("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+
|
||||
"Pass multiple -label flags in order to add multiple label sets.")
|
||||
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup. DEPRECATED - this flag has no effect and will be removed in the next releases.")
|
||||
|
||||
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
|
||||
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.")
|
||||
)
|
||||
|
||||
@@ -101,6 +93,7 @@ func main() {
|
||||
remoteread.InitSecretFlags()
|
||||
remotewrite.InitSecretFlags()
|
||||
datasource.InitSecretFlags()
|
||||
notifier.InitSecretFlags()
|
||||
buildinfo.Init()
|
||||
logger.Init()
|
||||
pushmetrics.Init()
|
||||
@@ -228,7 +221,7 @@ func newManager(ctx context.Context) (*manager, error) {
|
||||
return nil, fmt.Errorf("failed to init notifier: %w", err)
|
||||
}
|
||||
manager := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: q,
|
||||
notifiers: nts,
|
||||
labels: labels,
|
||||
@@ -237,7 +230,9 @@ func newManager(ctx context.Context) (*manager, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
|
||||
}
|
||||
manager.rw = rw
|
||||
if rw != nil {
|
||||
manager.rw = rw
|
||||
}
|
||||
|
||||
rr, err := remoteread.Init()
|
||||
if err != nil {
|
||||
|
||||
@@ -8,11 +8,19 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
)
|
||||
|
||||
func init() {
|
||||
// Disable rand sleep on group start during tests in order to speed up test execution.
|
||||
// Rand sleep is needed only in prod code.
|
||||
rule.SkipRandSleepOnGroupStart = true
|
||||
}
|
||||
|
||||
func TestGetExternalURL(t *testing.T) {
|
||||
expURL := "https://vicotriametrics.com/path"
|
||||
u, err := getExternalURL(expURL, "", false)
|
||||
@@ -87,10 +95,10 @@ groups:
|
||||
)
|
||||
|
||||
f, err := os.CreateTemp("", "")
|
||||
defer os.Remove(f.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer func() { _ = os.Remove(f.Name()) }()
|
||||
writeToFile(t, f.Name(), rules1)
|
||||
|
||||
*configCheckInterval = 200 * time.Millisecond
|
||||
@@ -98,10 +106,10 @@ groups:
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
m := &manager{
|
||||
querierBuilder: &fakeQuerier{},
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
labels: map[string]string{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} },
|
||||
rw: &remotewrite.Client{},
|
||||
}
|
||||
|
||||
|
||||
@@ -3,14 +3,13 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
@@ -19,7 +18,7 @@ type manager struct {
|
||||
querierBuilder datasource.QuerierBuilder
|
||||
notifiers func() []notifier.Notifier
|
||||
|
||||
rw *remotewrite.Client
|
||||
rw remotewrite.RWClient
|
||||
// remote read builder.
|
||||
rr datasource.QuerierBuilder
|
||||
|
||||
@@ -27,28 +26,28 @@ type manager struct {
|
||||
labels map[string]string
|
||||
|
||||
groupsMu sync.RWMutex
|
||||
groups map[uint64]*Group
|
||||
groups map[uint64]*rule.Group
|
||||
}
|
||||
|
||||
// RuleAPI generates APIRule object from alert by its ID(hash)
|
||||
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) {
|
||||
// ruleAPI generates apiRule object from alert by its ID(hash)
|
||||
func (m *manager) ruleAPI(gID, rID uint64) (apiRule, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
|
||||
g, ok := m.groups[gID]
|
||||
if !ok {
|
||||
return APIRule{}, fmt.Errorf("can't find group with id %d", gID)
|
||||
return apiRule{}, fmt.Errorf("can't find group with id %d", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
if rule.ID() == rID {
|
||||
return rule.ToAPI(), nil
|
||||
return ruleToAPI(rule), nil
|
||||
}
|
||||
}
|
||||
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
|
||||
return apiRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
||||
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
// alertAPI generates apiAlert object from alert by its ID(hash)
|
||||
func (m *manager) alertAPI(gID, aID uint64) (*apiAlert, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
|
||||
@@ -56,12 +55,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("can't find group with id %d", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
for _, r := range g.Rules {
|
||||
ar, ok := r.(*rule.AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
|
||||
if apiAlert := alertToAPI(ar, aID); apiAlert != nil {
|
||||
return apiAlert, nil
|
||||
}
|
||||
}
|
||||
@@ -82,15 +81,15 @@ func (m *manager) close() {
|
||||
m.wg.Wait()
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error {
|
||||
func (m *manager) startGroup(ctx context.Context, g *rule.Group, restore bool) error {
|
||||
m.wg.Add(1)
|
||||
id := g.ID()
|
||||
go func() {
|
||||
defer m.wg.Done()
|
||||
if restore {
|
||||
g.start(ctx, m.notifiers, m.rw, m.rr)
|
||||
g.Start(ctx, m.notifiers, m.rw, m.rr)
|
||||
} else {
|
||||
g.start(ctx, m.notifiers, m.rw, nil)
|
||||
g.Start(ctx, m.notifiers, m.rw, nil)
|
||||
}
|
||||
}()
|
||||
m.groups[id] = g
|
||||
@@ -99,7 +98,7 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
|
||||
|
||||
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
|
||||
var rrPresent, arPresent bool
|
||||
groupsRegistry := make(map[uint64]*Group)
|
||||
groupsRegistry := make(map[uint64]*rule.Group)
|
||||
for _, cfg := range groupsCfg {
|
||||
for _, r := range cfg.Rules {
|
||||
if rrPresent && arPresent {
|
||||
@@ -112,7 +111,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
||||
arPresent = true
|
||||
}
|
||||
}
|
||||
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
|
||||
ng := rule.NewGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
|
||||
groupsRegistry[ng.ID()] = ng
|
||||
}
|
||||
|
||||
@@ -124,8 +123,8 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
||||
}
|
||||
|
||||
type updateItem struct {
|
||||
old *Group
|
||||
new *Group
|
||||
old *rule.Group
|
||||
new *rule.Group
|
||||
}
|
||||
var toUpdate []updateItem
|
||||
|
||||
@@ -135,7 +134,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
||||
if !ok {
|
||||
// old group is not present in new list,
|
||||
// so must be stopped and deleted
|
||||
og.close()
|
||||
og.Close()
|
||||
delete(m.groups, og.ID())
|
||||
og = nil
|
||||
continue
|
||||
@@ -157,81 +156,13 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
||||
var wg sync.WaitGroup
|
||||
for _, item := range toUpdate {
|
||||
wg.Add(1)
|
||||
go func(old *Group, new *Group) {
|
||||
old.updateCh <- new
|
||||
go func(old *rule.Group, new *rule.Group) {
|
||||
old.UpdateWith(new)
|
||||
wg.Done()
|
||||
}(item.old, item.new)
|
||||
item.old.interruptEval()
|
||||
item.old.InterruptEval()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) toAPI() APIGroup {
|
||||
g.mu.RLock()
|
||||
defer g.mu.RUnlock()
|
||||
|
||||
ag := APIGroup{
|
||||
// encode as string to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
|
||||
Name: g.Name,
|
||||
Type: g.Type.String(),
|
||||
File: g.File,
|
||||
Interval: g.Interval.Seconds(),
|
||||
LastEvaluation: g.LastEvaluation,
|
||||
Concurrency: g.Concurrency,
|
||||
Params: urlValuesToStrings(g.Params),
|
||||
Headers: headersToStrings(g.Headers),
|
||||
NotifierHeaders: headersToStrings(g.NotifierHeaders),
|
||||
|
||||
Labels: g.Labels,
|
||||
}
|
||||
ag.Rules = make([]APIRule, 0)
|
||||
for _, r := range g.Rules {
|
||||
ag.Rules = append(ag.Rules, r.ToAPI())
|
||||
}
|
||||
return ag
|
||||
}
|
||||
|
||||
func urlValuesToStrings(values url.Values) []string {
|
||||
if len(values) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
keys := make([]string, 0, len(values))
|
||||
for k := range values {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
var res []string
|
||||
for _, k := range keys {
|
||||
params := values[k]
|
||||
for _, v := range params {
|
||||
res = append(res, fmt.Sprintf("%s=%s", k, v))
|
||||
}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func headersToStrings(headers map[string]string) []string {
|
||||
if len(headers) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
keys := make([]string, 0, len(headers))
|
||||
for k := range headers {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
var res []string
|
||||
for _, k := range keys {
|
||||
v := headers[k]
|
||||
res = append(res, fmt.Sprintf("%s: %s", k, v))
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
@@ -10,8 +10,10 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
)
|
||||
|
||||
@@ -26,7 +28,7 @@ func TestMain(m *testing.M) {
|
||||
// successful cases of
|
||||
// starting with empty rules folder
|
||||
func TestManagerEmptyRulesDir(t *testing.T) {
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m := &manager{groups: make(map[uint64]*rule.Group)}
|
||||
cfg := loadCfg(t, []string{"foo/bar"}, true, true)
|
||||
if err := m.update(context.Background(), cfg, false); err != nil {
|
||||
t.Fatalf("expected to load successfully with empty rules dir; got err instead: %v", err)
|
||||
@@ -38,9 +40,9 @@ func TestManagerEmptyRulesDir(t *testing.T) {
|
||||
// Should be executed with -race flag
|
||||
func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} },
|
||||
}
|
||||
paths := []string{
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
@@ -91,7 +93,7 @@ func TestManagerUpdate(t *testing.T) {
|
||||
}()
|
||||
|
||||
var (
|
||||
VMRows = &AlertingRule{
|
||||
VMRows = &rule.AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 10 * time.Second,
|
||||
@@ -104,7 +106,7 @@ func TestManagerUpdate(t *testing.T) {
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
}
|
||||
Conns = &AlertingRule{
|
||||
Conns = &rule.AlertingRule{
|
||||
Name: "Conns",
|
||||
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
|
||||
Annotations: map[string]string{
|
||||
@@ -112,7 +114,7 @@ func TestManagerUpdate(t *testing.T) {
|
||||
"description": "It is {{ $value }} connections for {{$labels.instance}}",
|
||||
},
|
||||
}
|
||||
ExampleAlertAlwaysFiring = &AlertingRule{
|
||||
ExampleAlertAlwaysFiring = &rule.AlertingRule{
|
||||
Name: "ExampleAlertAlwaysFiring",
|
||||
Expr: "sum by(job) (up == 1)",
|
||||
}
|
||||
@@ -122,20 +124,20 @@ func TestManagerUpdate(t *testing.T) {
|
||||
name string
|
||||
initPath string
|
||||
updatePath string
|
||||
want []*Group
|
||||
want []*rule.Group
|
||||
}{
|
||||
{
|
||||
name: "update good rules",
|
||||
initPath: "config/testdata/rules/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules1-good.rules",
|
||||
want: []*Group{
|
||||
want: []*rule.Group{
|
||||
{
|
||||
File: "config/testdata/dir/rules1-good.rules",
|
||||
Name: "duplicatedGroupDiffFiles",
|
||||
Type: config.NewPrometheusType(),
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{
|
||||
&AlertingRule{
|
||||
Rules: []rule.Rule{
|
||||
&rule.AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 5 * time.Minute,
|
||||
@@ -153,64 +155,68 @@ func TestManagerUpdate(t *testing.T) {
|
||||
name: "update good rules from 1 to 2 groups",
|
||||
initPath: "config/testdata/dir/rules/rules1-good.rules",
|
||||
updatePath: "config/testdata/rules/rules0-good.rules",
|
||||
want: []*Group{
|
||||
want: []*rule.Group{
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Type: config.NewPrometheusType(),
|
||||
Rules: []Rule{VMRows},
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []rule.Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Type: config.NewPrometheusType(),
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Name: "TestGroup",
|
||||
Rules: []rule.Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update with one bad rule file",
|
||||
initPath: "config/testdata/rules/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules2-bad.rules",
|
||||
want: []*Group{
|
||||
want: []*rule.Group{
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Type: config.NewPrometheusType(),
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{VMRows},
|
||||
Rules: []rule.Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup",
|
||||
Type: config.NewPrometheusType(),
|
||||
Rules: []Rule{
|
||||
Rules: []rule.Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
}},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "update empty dir rules from 0 to 2 groups",
|
||||
initPath: "config/testdata/empty/*",
|
||||
updatePath: "config/testdata/rules/rules0-good.rules",
|
||||
want: []*Group{
|
||||
want: []*rule.Group{
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Type: config.NewPrometheusType(),
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{VMRows},
|
||||
Rules: []rule.Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Type: config.NewPrometheusType(),
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Name: "TestGroup",
|
||||
Rules: []rule.Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
},
|
||||
@@ -222,9 +228,9 @@ func TestManagerUpdate(t *testing.T) {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.TODO())
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} },
|
||||
}
|
||||
|
||||
cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
|
||||
@@ -253,18 +259,44 @@ func TestManagerUpdate(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
func compareGroups(t *testing.T, a, b *rule.Group) {
|
||||
t.Helper()
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.File != b.File {
|
||||
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
|
||||
}
|
||||
if a.Interval != b.Interval {
|
||||
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
|
||||
}
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if a.ID() != b.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
|
||||
}
|
||||
if err := rule.CompareRules(t, want, got); err != nil {
|
||||
t.Fatalf("comparison error: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestManagerUpdateNegative(t *testing.T) {
|
||||
testCases := []struct {
|
||||
notifiers []notifier.Notifier
|
||||
rw *remotewrite.Client
|
||||
rw remotewrite.RWClient
|
||||
cfg config.Group
|
||||
expErr string
|
||||
}{
|
||||
{
|
||||
nil,
|
||||
nil,
|
||||
config.Group{Name: "Recording rule only",
|
||||
config.Group{
|
||||
Name: "Recording rule only",
|
||||
Rules: []config.Rule{
|
||||
{Record: "record", Expr: "max(up)"},
|
||||
},
|
||||
@@ -274,7 +306,8 @@ func TestManagerUpdateNegative(t *testing.T) {
|
||||
{
|
||||
nil,
|
||||
nil,
|
||||
config.Group{Name: "Alerting rule only",
|
||||
config.Group{
|
||||
Name: "Alerting rule only",
|
||||
Rules: []config.Rule{
|
||||
{Alert: "alert", Expr: "up > 0"},
|
||||
},
|
||||
@@ -282,9 +315,10 @@ func TestManagerUpdateNegative(t *testing.T) {
|
||||
"contains alerting rules",
|
||||
},
|
||||
{
|
||||
[]notifier.Notifier{&fakeNotifier{}},
|
||||
[]notifier.Notifier{¬ifier.FakeNotifier{}},
|
||||
nil,
|
||||
config.Group{Name: "Recording and alerting rules",
|
||||
config.Group{
|
||||
Name: "Recording and alerting rules",
|
||||
Rules: []config.Rule{
|
||||
{Alert: "alert1", Expr: "up > 0"},
|
||||
{Alert: "alert2", Expr: "up > 0"},
|
||||
@@ -296,7 +330,8 @@ func TestManagerUpdateNegative(t *testing.T) {
|
||||
{
|
||||
nil,
|
||||
&remotewrite.Client{},
|
||||
config.Group{Name: "Recording and alerting rules",
|
||||
config.Group{
|
||||
Name: "Recording and alerting rules",
|
||||
Rules: []config.Rule{
|
||||
{Record: "record1", Expr: "max(up)"},
|
||||
{Record: "record2", Expr: "max(up)"},
|
||||
@@ -310,8 +345,8 @@ func TestManagerUpdateNegative(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.cfg.Name, func(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
rw: tc.rw,
|
||||
}
|
||||
if tc.notifiers != nil {
|
||||
@@ -340,21 +375,3 @@ func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressio
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func TestUrlValuesToStrings(t *testing.T) {
|
||||
mapQueryParams := map[string][]string{
|
||||
"param1": {"param1"},
|
||||
"param2": {"anotherparam"},
|
||||
}
|
||||
expectedRes := []string{"param1=param1", "param2=anotherparam"}
|
||||
res := urlValuesToStrings(mapQueryParams)
|
||||
|
||||
if len(res) != len(expectedRes) {
|
||||
t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res))
|
||||
}
|
||||
for ind, val := range expectedRes {
|
||||
if val != res[ind] {
|
||||
t.Errorf("Expected %v; but got %v", val, res[ind])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -17,7 +18,7 @@ import (
|
||||
// AlertManager represents integration provider with Prometheus alert manager
|
||||
// https://github.com/prometheus/alertmanager
|
||||
type AlertManager struct {
|
||||
addr string
|
||||
addr *url.URL
|
||||
argFunc AlertURLGenerator
|
||||
client *http.Client
|
||||
timeout time.Duration
|
||||
@@ -48,7 +49,12 @@ func (am *AlertManager) Close() {
|
||||
}
|
||||
|
||||
// Addr returns address where alerts are sent.
|
||||
func (am AlertManager) Addr() string { return am.addr }
|
||||
func (am AlertManager) Addr() string {
|
||||
if *showNotifierURL {
|
||||
return am.addr.String()
|
||||
}
|
||||
return am.addr.Redacted()
|
||||
}
|
||||
|
||||
// Send an alert or resolve message
|
||||
func (am *AlertManager) Send(ctx context.Context, alerts []Alert, headers map[string]string) error {
|
||||
@@ -64,7 +70,7 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[st
|
||||
b := &bytes.Buffer{}
|
||||
writeamRequest(b, alerts, am.argFunc, am.relabelConfigs)
|
||||
|
||||
req, err := http.NewRequest(http.MethodPost, am.addr, b)
|
||||
req, err := http.NewRequest(http.MethodPost, am.addr.String(), b)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -82,7 +88,10 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[st
|
||||
req = req.WithContext(ctx)
|
||||
|
||||
if am.authCfg != nil {
|
||||
am.authCfg.SetHeaders(req, true)
|
||||
err = am.authCfg.SetHeaders(req, true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
resp, err := am.client.Do(req)
|
||||
if err != nil {
|
||||
@@ -91,12 +100,16 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[st
|
||||
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
amURL := am.addr.Redacted()
|
||||
if *showNotifierURL {
|
||||
amURL = am.addr.String()
|
||||
}
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read response from %q: %w", am.addr, err)
|
||||
return fmt.Errorf("failed to read response from %q: %w", amURL, err)
|
||||
}
|
||||
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.addr, string(body))
|
||||
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, amURL, string(body))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -136,8 +149,15 @@ func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, authCfg proma
|
||||
return nil, fmt.Errorf("failed to configure auth: %w", err)
|
||||
}
|
||||
|
||||
amURL, err := url.Parse(alertManagerURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("provided incorrect notifier url: %w", err)
|
||||
}
|
||||
if !*showNotifierURL {
|
||||
alertManagerURL = amURL.Redacted()
|
||||
}
|
||||
return &AlertManager{
|
||||
addr: alertManagerURL,
|
||||
addr: amURL,
|
||||
argFunc: fn,
|
||||
authCfg: aCfg,
|
||||
relabelConfigs: relabelCfg,
|
||||
|
||||
@@ -3,7 +3,6 @@ package notifier
|
||||
import (
|
||||
"crypto/md5"
|
||||
"fmt"
|
||||
"gopkg.in/yaml.v2"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
@@ -11,6 +10,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/consul"
|
||||
@@ -142,26 +143,23 @@ func parseLabels(target string, metaLabels *promutils.Labels, cfg *Config) (stri
|
||||
if labels.Len() == 0 {
|
||||
return "", nil, nil
|
||||
}
|
||||
schemeRelabeled := labels.Get("__scheme__")
|
||||
if len(schemeRelabeled) == 0 {
|
||||
schemeRelabeled = "http"
|
||||
scheme := labels.Get("__scheme__")
|
||||
if len(scheme) == 0 {
|
||||
scheme = "http"
|
||||
}
|
||||
addressRelabeled := labels.Get("__address__")
|
||||
if len(addressRelabeled) == 0 {
|
||||
alertsPath := labels.Get("__alerts_path__")
|
||||
if !strings.HasPrefix(alertsPath, "/") {
|
||||
alertsPath = "/" + alertsPath
|
||||
}
|
||||
address := labels.Get("__address__")
|
||||
if len(address) == 0 {
|
||||
return "", nil, nil
|
||||
}
|
||||
if strings.Contains(addressRelabeled, "/") {
|
||||
return "", nil, nil
|
||||
}
|
||||
addressRelabeled = addMissingPort(schemeRelabeled, addressRelabeled)
|
||||
alertsPathRelabeled := labels.Get("__alerts_path__")
|
||||
if !strings.HasPrefix(alertsPathRelabeled, "/") {
|
||||
alertsPathRelabeled = "/" + alertsPathRelabeled
|
||||
}
|
||||
u := fmt.Sprintf("%s://%s%s", schemeRelabeled, addressRelabeled, alertsPathRelabeled)
|
||||
address = addMissingPort(scheme, address)
|
||||
u := fmt.Sprintf("%s://%s%s", scheme, address, alertsPath)
|
||||
if _, err := url.Parse(u); err != nil {
|
||||
return "", nil, fmt.Errorf("invalid url %q for scheme=%q (%q), target=%q, metrics_path=%q (%q): %w",
|
||||
u, cfg.Scheme, schemeRelabeled, target, addressRelabeled, alertsPathRelabeled, err)
|
||||
u, cfg.Scheme, scheme, target, address, alertsPath, err)
|
||||
}
|
||||
return u, labels, nil
|
||||
}
|
||||
@@ -181,9 +179,24 @@ func addMissingPort(scheme, target string) string {
|
||||
func mergeLabels(target string, metaLabels *promutils.Labels, cfg *Config) *promutils.Labels {
|
||||
// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
|
||||
m := promutils.NewLabels(3 + metaLabels.Len())
|
||||
m.Add("__address__", target)
|
||||
m.Add("__scheme__", cfg.Scheme)
|
||||
m.Add("__alerts_path__", path.Join("/", cfg.PathPrefix, alertManagerPath))
|
||||
address := target
|
||||
scheme := cfg.Scheme
|
||||
alertsPath := path.Join("/", cfg.PathPrefix, alertManagerPath)
|
||||
// try to extract optional scheme and alertsPath from __address__.
|
||||
if strings.HasPrefix(address, "http://") {
|
||||
scheme = "http"
|
||||
address = address[len("http://"):]
|
||||
} else if strings.HasPrefix(address, "https://") {
|
||||
scheme = "https"
|
||||
address = address[len("https://"):]
|
||||
}
|
||||
if n := strings.IndexByte(address, '/'); n >= 0 {
|
||||
alertsPath = address[n:]
|
||||
address = address[:n]
|
||||
}
|
||||
m.Add("__address__", address)
|
||||
m.Add("__scheme__", scheme)
|
||||
m.Add("__alerts_path__", alertsPath)
|
||||
m.AddFrom(metaLabels)
|
||||
return m
|
||||
}
|
||||
|
||||
@@ -87,7 +87,7 @@ func (cw *configWatcher) reload(path string) error {
|
||||
func (cw *configWatcher) add(typeK TargetType, interval time.Duration, labelsFn getLabels) error {
|
||||
targets, errors := targetsFromLabels(labelsFn, cw.cfg, cw.genFn)
|
||||
for _, err := range errors {
|
||||
return fmt.Errorf("failed to init notifier for %q: %s", typeK, err)
|
||||
return fmt.Errorf("failed to init notifier for %q: %w", typeK, err)
|
||||
}
|
||||
|
||||
cw.setTargets(typeK, targets)
|
||||
@@ -107,7 +107,7 @@ func (cw *configWatcher) add(typeK TargetType, interval time.Duration, labelsFn
|
||||
}
|
||||
updateTargets, errors := targetsFromLabels(labelsFn, cw.cfg, cw.genFn)
|
||||
for _, err := range errors {
|
||||
logger.Errorf("failed to init notifier for %q: %s", typeK, err)
|
||||
logger.Errorf("failed to init notifier for %q: %w", typeK, err)
|
||||
}
|
||||
cw.setTargets(typeK, updateTargets)
|
||||
}
|
||||
@@ -118,7 +118,7 @@ func (cw *configWatcher) add(typeK TargetType, interval time.Duration, labelsFn
|
||||
func targetsFromLabels(labelsFn getLabels, cfg *Config, genFn AlertURLGenerator) ([]Target, []error) {
|
||||
metaLabels, err := labelsFn()
|
||||
if err != nil {
|
||||
return nil, []error{fmt.Errorf("failed to get labels: %s", err)}
|
||||
return nil, []error{fmt.Errorf("failed to get labels: %w", err)}
|
||||
}
|
||||
var targets []Target
|
||||
var errors []error
|
||||
@@ -167,11 +167,11 @@ func (cw *configWatcher) start() error {
|
||||
for _, target := range cfg.Targets {
|
||||
address, labels, err := parseLabels(target, nil, cw.cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse labels for target %q: %s", target, err)
|
||||
return fmt.Errorf("failed to parse labels for target %q: %w", target, err)
|
||||
}
|
||||
notifier, err := NewAlertManager(address, cw.genFn, httpCfg, cw.cfg.parsedAlertRelabelConfigs, cw.cfg.Timeout.Duration())
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to init alertmanager for addr %q: %s", address, err)
|
||||
return fmt.Errorf("failed to init alertmanager for addr %q: %w", address, err)
|
||||
}
|
||||
targets = append(targets, Target{
|
||||
Notifier: notifier,
|
||||
@@ -189,14 +189,14 @@ func (cw *configWatcher) start() error {
|
||||
sdc := &cw.cfg.ConsulSDConfigs[i]
|
||||
targetLabels, err := sdc.GetLabels(cw.cfg.baseDir)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("got labels err: %s", err)
|
||||
return nil, fmt.Errorf("got labels err: %w", err)
|
||||
}
|
||||
labels = append(labels, targetLabels...)
|
||||
}
|
||||
return labels, nil
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start consulSD discovery: %s", err)
|
||||
return fmt.Errorf("failed to start consulSD discovery: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -207,14 +207,14 @@ func (cw *configWatcher) start() error {
|
||||
sdc := &cw.cfg.DNSSDConfigs[i]
|
||||
targetLabels, err := sdc.GetLabels(cw.cfg.baseDir)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("got labels err: %s", err)
|
||||
return nil, fmt.Errorf("got labels err: %w", err)
|
||||
}
|
||||
labels = append(labels, targetLabels...)
|
||||
}
|
||||
return labels, nil
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start DNSSD discovery: %s", err)
|
||||
return fmt.Errorf("failed to start DNSSD discovery: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
@@ -14,7 +14,6 @@ import (
|
||||
|
||||
func TestConfigWatcherReload(t *testing.T) {
|
||||
f, err := os.CreateTemp("", "")
|
||||
defer os.Remove(f.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -37,7 +36,6 @@ static_configs:
|
||||
}
|
||||
|
||||
f2, err := os.CreateTemp("", "")
|
||||
defer os.Remove(f2.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -65,7 +63,6 @@ func TestConfigWatcherStart(t *testing.T) {
|
||||
defer consulSDServer.Close()
|
||||
|
||||
consulSDFile, err := os.CreateTemp("", "")
|
||||
defer os.Remove(consulSDFile.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -112,7 +109,6 @@ func TestConfigWatcherReloadConcurrent(t *testing.T) {
|
||||
defer consulSDServer2.Close()
|
||||
|
||||
consulSDFile, err := os.CreateTemp("", "")
|
||||
defer os.Remove(consulSDFile.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -129,7 +125,6 @@ consul_sd_configs:
|
||||
`, consulSDServer1.URL, consulSDServer2.URL))
|
||||
|
||||
staticAndConsulSDFile, err := os.CreateTemp("", "")
|
||||
defer os.Remove(staticAndConsulSDFile.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -323,3 +318,47 @@ func TestMergeHTTPClientConfigs(t *testing.T) {
|
||||
t.Fatalf("expected BasicAuth tp be present")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLabels(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
target string
|
||||
cfg *Config
|
||||
expectedAddress string
|
||||
expectedErr bool
|
||||
}{
|
||||
{
|
||||
"invalid address",
|
||||
"invalid:*//url",
|
||||
&Config{},
|
||||
"",
|
||||
true,
|
||||
},
|
||||
{
|
||||
"use some default params",
|
||||
"alertmanager:9093",
|
||||
&Config{PathPrefix: "test"},
|
||||
"http://alertmanager:9093/test/api/v2/alerts",
|
||||
false,
|
||||
},
|
||||
{
|
||||
"use target address",
|
||||
"https://alertmanager:9093/api/v1/alerts",
|
||||
&Config{Scheme: "http", PathPrefix: "test"},
|
||||
"https://alertmanager:9093/api/v1/alerts",
|
||||
false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
address, _, err := parseLabels(tc.target, nil, tc.cfg)
|
||||
if err == nil == tc.expectedErr {
|
||||
t.Fatalf("unexpected error; got %t; want %t", err != nil, tc.expectedErr)
|
||||
}
|
||||
if address != tc.expectedAddress {
|
||||
t.Fatalf("unexpected address; got %q; want %q", address, tc.expectedAddress)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
59
app/vmalert/notifier/faker.go
Normal file
59
app/vmalert/notifier/faker.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FakeNotifier is a mock notifier
|
||||
type FakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []Alert
|
||||
// records number of received alerts in total
|
||||
counter int
|
||||
}
|
||||
|
||||
// Close does nothing
|
||||
func (*FakeNotifier) Close() {}
|
||||
|
||||
// Addr returns ""
|
||||
func (*FakeNotifier) Addr() string { return "" }
|
||||
|
||||
// Send sets alerts and increases counter
|
||||
func (fn *FakeNotifier) Send(_ context.Context, alerts []Alert, _ map[string]string) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.counter += len(alerts)
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetCounter returns received alerts count
|
||||
func (fn *FakeNotifier) GetCounter() int {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.counter
|
||||
}
|
||||
|
||||
// GetAlerts returns stored alerts
|
||||
func (fn *FakeNotifier) GetAlerts() []Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
// FaultyNotifier is a mock notifier that Send() will return failed response
|
||||
type FaultyNotifier struct {
|
||||
FakeNotifier
|
||||
}
|
||||
|
||||
// Send returns failed response
|
||||
func (fn *FaultyNotifier) Send(ctx context.Context, _ []Alert, _ map[string]string) error {
|
||||
d, ok := ctx.Deadline()
|
||||
if ok {
|
||||
time.Sleep(time.Until(d))
|
||||
}
|
||||
return fmt.Errorf("send failed")
|
||||
}
|
||||
@@ -19,6 +19,8 @@ var (
|
||||
|
||||
addrs = flagutil.NewArrayString("notifier.url", "Prometheus Alertmanager URL, e.g. http://127.0.0.1:9093. "+
|
||||
"List all Alertmanager URLs if it runs in the cluster mode to ensure high availability.")
|
||||
showNotifierURL = flag.Bool("notifier.showURL", false, "Whether to avoid stripping sensitive information such as passwords from URL in log messages or UI for -notifier.url. "+
|
||||
"It is hidden by default, since it can contain sensitive info such as auth key")
|
||||
blackHole = flag.Bool("notifier.blackhole", false, "Whether to blackhole alerting notifications. "+
|
||||
"Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (eg. alertmanager). "+
|
||||
"`-notifier.url`, `-notifier.config` and `-notifier.blackhole` are mutually exclusive.")
|
||||
@@ -88,7 +90,7 @@ func Init(gen AlertURLGenerator, extLabels map[string]string, extURL string) (fu
|
||||
externalLabels = extLabels
|
||||
eu, err := url.Parse(externalURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse external URL: %s", err)
|
||||
return nil, fmt.Errorf("failed to parse external URL: %w", err)
|
||||
}
|
||||
|
||||
templates.UpdateWithFuncs(templates.FuncsWithExternalURL(eu))
|
||||
@@ -114,7 +116,7 @@ func Init(gen AlertURLGenerator, extLabels map[string]string, extURL string) (fu
|
||||
if len(*addrs) > 0 {
|
||||
notifiers, err := notifiersFromFlags(gen)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create notifier from flag values: %s", err)
|
||||
return nil, fmt.Errorf("failed to create notifier from flag values: %w", err)
|
||||
}
|
||||
staticNotifiersFn = func() []Notifier {
|
||||
return notifiers
|
||||
@@ -124,11 +126,18 @@ func Init(gen AlertURLGenerator, extLabels map[string]string, extURL string) (fu
|
||||
|
||||
cw, err = newWatcher(*configPath, gen)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init config watcher: %s", err)
|
||||
return nil, fmt.Errorf("failed to init config watcher: %w", err)
|
||||
}
|
||||
return cw.notifiers, nil
|
||||
}
|
||||
|
||||
// InitSecretFlags must be called after flag.Parse and before any logging
|
||||
func InitSecretFlags() {
|
||||
if !*showNotifierURL {
|
||||
flagutil.RegisterSecretFlag("notifier.url")
|
||||
}
|
||||
}
|
||||
|
||||
func notifiersFromFlags(gen AlertURLGenerator) ([]Notifier, error) {
|
||||
var notifiers []Notifier
|
||||
for i, addr := range *addrs {
|
||||
|
||||
@@ -5,6 +5,7 @@ static_configs:
|
||||
- targets:
|
||||
- localhost:9093
|
||||
- localhost:9095
|
||||
- https://localhost:9093/test/api/v2/alerts
|
||||
basic_auth:
|
||||
username: foo
|
||||
password: bar
|
||||
|
||||
345
app/vmalert/remotewrite/client.go
Normal file
345
app/vmalert/remotewrite/client.go
Normal file
@@ -0,0 +1,345 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultConcurrency = 4
|
||||
defaultMaxBatchSize = 1e3
|
||||
defaultMaxQueueSize = 1e5
|
||||
defaultFlushInterval = 5 * time.Second
|
||||
defaultWriteTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
var (
|
||||
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
|
||||
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
|
||||
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
|
||||
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
|
||||
)
|
||||
|
||||
// Client is an asynchronous HTTP client for writing
|
||||
// timeseries via remote write protocol.
|
||||
type Client struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
authCfg *promauth.Config
|
||||
input chan prompbmarshal.TimeSeries
|
||||
flushInterval time.Duration
|
||||
maxBatchSize int
|
||||
maxQueueSize int
|
||||
|
||||
wg sync.WaitGroup
|
||||
doneCh chan struct{}
|
||||
}
|
||||
|
||||
// Config is config for remote write client.
|
||||
type Config struct {
|
||||
// Addr of remote storage
|
||||
Addr string
|
||||
AuthCfg *promauth.Config
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
// NewClient returns asynchronous client for
|
||||
// writing timeseries via remotewrite protocol.
|
||||
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
if cfg.Addr == "" {
|
||||
return nil, fmt.Errorf("config.Addr can't be empty")
|
||||
}
|
||||
if cfg.MaxBatchSize == 0 {
|
||||
cfg.MaxBatchSize = defaultMaxBatchSize
|
||||
}
|
||||
if cfg.MaxQueueSize == 0 {
|
||||
cfg.MaxQueueSize = defaultMaxQueueSize
|
||||
}
|
||||
if cfg.FlushInterval == 0 {
|
||||
cfg.FlushInterval = defaultFlushInterval
|
||||
}
|
||||
if cfg.Transport == nil {
|
||||
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
|
||||
}
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/"),
|
||||
authCfg: cfg.AuthCfg,
|
||||
flushInterval: cfg.FlushInterval,
|
||||
maxBatchSize: cfg.MaxBatchSize,
|
||||
maxQueueSize: cfg.MaxQueueSize,
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push adds timeseries into queue for writing into remote storage.
|
||||
// Push returns and error if client is stopped or if queue is full.
|
||||
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
rwTotal.Inc()
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
rwErrors.Inc()
|
||||
droppedRows.Add(len(s.Samples))
|
||||
droppedBytes.Add(s.Size())
|
||||
return fmt.Errorf("client is closed")
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
rwErrors.Inc()
|
||||
droppedRows.Add(len(s.Samples))
|
||||
droppedBytes.Add(s.Size())
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Close stops the client and waits for all goroutines
|
||||
// to exit.
|
||||
func (c *Client) Close() error {
|
||||
if c.doneCh == nil {
|
||||
return fmt.Errorf("client is already closed")
|
||||
}
|
||||
close(c.input)
|
||||
close(c.doneCh)
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) run(ctx context.Context) {
|
||||
ticker := time.NewTicker(c.flushInterval)
|
||||
wr := &prompbmarshal.WriteRequest{}
|
||||
shutdown := func() {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
shutdown()
|
||||
return
|
||||
case <-ctx.Done():
|
||||
shutdown()
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.flush(ctx, wr)
|
||||
case ts, ok := <-c.input:
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
if len(wr.Timeseries) >= c.maxBatchSize {
|
||||
c.flush(ctx, wr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var (
|
||||
rwErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||
rwTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
|
||||
|
||||
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
|
||||
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
|
||||
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
|
||||
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
|
||||
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
|
||||
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
|
||||
|
||||
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
|
||||
return float64(*concurrency)
|
||||
})
|
||||
)
|
||||
|
||||
// flush is a blocking function that marshals WriteRequest and sends
|
||||
// it to remote-write endpoint. Flush performs limited amount of retries
|
||||
// if request fails.
|
||||
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
defer prompbmarshal.ResetWriteRequest(wr)
|
||||
defer bufferFlushDuration.UpdateDuration(time.Now())
|
||||
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
logger.Errorf("failed to marshal WriteRequest: %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
b := snappy.Encode(nil, data)
|
||||
|
||||
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
|
||||
if retryInterval > maxRetryInterval {
|
||||
retryInterval = maxRetryInterval
|
||||
}
|
||||
timeStart := time.Now()
|
||||
defer func() {
|
||||
sendDuration.Add(time.Since(timeStart).Seconds())
|
||||
}()
|
||||
L:
|
||||
for attempts := 0; ; attempts++ {
|
||||
err := c.send(ctx, b)
|
||||
if errors.Is(err, io.EOF) {
|
||||
// Something in the middle between client and destination might be closing
|
||||
// the connection. So we do a one more attempt in hope request will succeed.
|
||||
err = c.send(ctx, b)
|
||||
}
|
||||
if err == nil {
|
||||
sentRows.Add(len(wr.Timeseries))
|
||||
sentBytes.Add(len(b))
|
||||
return
|
||||
}
|
||||
|
||||
_, isNotRetriable := err.(*nonRetriableError)
|
||||
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
|
||||
|
||||
if isNotRetriable {
|
||||
// exit fast if error isn't retriable
|
||||
break
|
||||
}
|
||||
|
||||
// check if request has been cancelled before backoff
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
|
||||
break L
|
||||
default:
|
||||
}
|
||||
|
||||
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
|
||||
if timeLeftForRetries < 0 {
|
||||
// the max retry time has passed, so we give up
|
||||
break
|
||||
}
|
||||
|
||||
if retryInterval > timeLeftForRetries {
|
||||
retryInterval = timeLeftForRetries
|
||||
}
|
||||
// sleeping to prevent remote db hammering
|
||||
time.Sleep(retryInterval)
|
||||
retryInterval *= 2
|
||||
|
||||
}
|
||||
|
||||
rwErrors.Inc()
|
||||
droppedRows.Add(len(wr.Timeseries))
|
||||
droppedBytes.Add(len(b))
|
||||
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
|
||||
len(wr.Timeseries))
|
||||
}
|
||||
|
||||
func (c *Client) send(ctx context.Context, data []byte) error {
|
||||
r := bytes.NewReader(data)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if c.authCfg != nil {
|
||||
err = c.authCfg.SetHeaders(req, true)
|
||||
if err != nil {
|
||||
return &nonRetriableError{
|
||||
err: err,
|
||||
}
|
||||
}
|
||||
}
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
|
||||
// according to https://prometheus.io/docs/concepts/remote_write_spec/
|
||||
// Prometheus remote Write compatible receivers MUST
|
||||
switch resp.StatusCode / 100 {
|
||||
case 2:
|
||||
// respond with HTTP 2xx status code when write is successful.
|
||||
return nil
|
||||
case 4:
|
||||
if resp.StatusCode != http.StatusTooManyRequests {
|
||||
// MUST NOT retry write requests on HTTP 4xx responses other than 429
|
||||
return &nonRetriableError{
|
||||
err: fmt.Errorf("unexpected response code %d for %s. Response body %q", resp.StatusCode, req.URL.Redacted(), body),
|
||||
}
|
||||
}
|
||||
fallthrough
|
||||
default:
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
||||
}
|
||||
|
||||
type nonRetriableError struct {
|
||||
err error
|
||||
}
|
||||
|
||||
func (e *nonRetriableError) Error() string {
|
||||
return e.err.Error()
|
||||
}
|
||||
97
app/vmalert/remotewrite/debug_client.go
Normal file
97
app/vmalert/remotewrite/debug_client.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// DebugClient won't push series periodically, but will write data to remote endpoint
|
||||
// immediately when Push() is called
|
||||
type DebugClient struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// NewDebugClient initiates and returns a new DebugClient
|
||||
func NewDebugClient() (*DebugClient, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &DebugClient{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: t,
|
||||
},
|
||||
addr: strings.TrimSuffix(*addr, "/"),
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push sends the given timeseries to the remote storage.
|
||||
func (c *DebugClient) Push(s prompbmarshal.TimeSeries) error {
|
||||
c.wg.Add(1)
|
||||
defer c.wg.Done()
|
||||
wr := &prompbmarshal.WriteRequest{Timeseries: []prompbmarshal.TimeSeries{s}}
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal the given time series: %w", err)
|
||||
}
|
||||
|
||||
return c.send(data)
|
||||
}
|
||||
|
||||
// Close stops the DebugClient
|
||||
func (c *DebugClient) Close() error {
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *DebugClient) send(data []byte) error {
|
||||
b := snappy.Encode(nil, data)
|
||||
r := bytes.NewReader(b)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode/100 == 2 {
|
||||
return nil
|
||||
}
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
||||
50
app/vmalert/remotewrite/debug_client_test.go
Normal file
50
app/vmalert/remotewrite/debug_client_test.go
Normal file
@@ -0,0 +1,50 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestDebugClient_Push(t *testing.T) {
|
||||
testSrv := newRWServer()
|
||||
oldAddr := *addr
|
||||
*addr = testSrv.URL
|
||||
defer func() {
|
||||
*addr = oldAddr
|
||||
}()
|
||||
|
||||
client, err := NewDebugClient()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create debug client: %s", err)
|
||||
}
|
||||
|
||||
const rowsN = 100
|
||||
var sent int
|
||||
for i := 0; i < rowsN; i++ {
|
||||
s := prompbmarshal.TimeSeries{
|
||||
Samples: []prompbmarshal.Sample{{
|
||||
Value: float64(i),
|
||||
Timestamp: time.Now().Unix(),
|
||||
}},
|
||||
}
|
||||
err := client.Push(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
if err == nil {
|
||||
sent++
|
||||
}
|
||||
}
|
||||
if sent == 0 {
|
||||
t.Fatalf("0 series sent")
|
||||
}
|
||||
if err := client.Close(); err != nil {
|
||||
t.Fatalf("failed to close client: %s", err)
|
||||
}
|
||||
got := testSrv.accepted()
|
||||
if got != sent {
|
||||
t.Fatalf("expected to have %d series; got %d", sent, got)
|
||||
}
|
||||
}
|
||||
@@ -30,7 +30,7 @@ var (
|
||||
|
||||
maxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
|
||||
maxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines max number of timeseries to be flushed at once")
|
||||
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
|
||||
concurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote write endpoint")
|
||||
flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
|
||||
|
||||
@@ -1,320 +1,13 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
|
||||
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
|
||||
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
|
||||
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
|
||||
)
|
||||
|
||||
// Client is an asynchronous HTTP client for writing
|
||||
// timeseries via remote write protocol.
|
||||
type Client struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
authCfg *promauth.Config
|
||||
input chan prompbmarshal.TimeSeries
|
||||
flushInterval time.Duration
|
||||
maxBatchSize int
|
||||
maxQueueSize int
|
||||
|
||||
wg sync.WaitGroup
|
||||
doneCh chan struct{}
|
||||
}
|
||||
|
||||
// Config is config for remote write.
|
||||
type Config struct {
|
||||
// Addr of remote storage
|
||||
Addr string
|
||||
AuthCfg *promauth.Config
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
const (
|
||||
defaultConcurrency = 4
|
||||
defaultMaxBatchSize = 1e3
|
||||
defaultMaxQueueSize = 1e5
|
||||
defaultFlushInterval = 5 * time.Second
|
||||
defaultWriteTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
// NewClient returns asynchronous client for
|
||||
// writing timeseries via remotewrite protocol.
|
||||
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
if cfg.Addr == "" {
|
||||
return nil, fmt.Errorf("config.Addr can't be empty")
|
||||
}
|
||||
if cfg.MaxBatchSize == 0 {
|
||||
cfg.MaxBatchSize = defaultMaxBatchSize
|
||||
}
|
||||
if cfg.MaxQueueSize == 0 {
|
||||
cfg.MaxQueueSize = defaultMaxQueueSize
|
||||
}
|
||||
if cfg.FlushInterval == 0 {
|
||||
cfg.FlushInterval = defaultFlushInterval
|
||||
}
|
||||
if cfg.Transport == nil {
|
||||
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
|
||||
}
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/"),
|
||||
authCfg: cfg.AuthCfg,
|
||||
flushInterval: cfg.FlushInterval,
|
||||
maxBatchSize: cfg.MaxBatchSize,
|
||||
maxQueueSize: cfg.MaxQueueSize,
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push adds timeseries into queue for writing into remote storage.
|
||||
// Push returns and error if client is stopped or if queue is full.
|
||||
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
return fmt.Errorf("client is closed")
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Close stops the client and waits for all goroutines
|
||||
// to exit.
|
||||
func (c *Client) Close() error {
|
||||
if c.doneCh == nil {
|
||||
return fmt.Errorf("client is already closed")
|
||||
}
|
||||
close(c.input)
|
||||
close(c.doneCh)
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) run(ctx context.Context) {
|
||||
ticker := time.NewTicker(c.flushInterval)
|
||||
wr := &prompbmarshal.WriteRequest{}
|
||||
shutdown := func() {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
shutdown()
|
||||
return
|
||||
case <-ctx.Done():
|
||||
shutdown()
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.flush(ctx, wr)
|
||||
case ts, ok := <-c.input:
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
if len(wr.Timeseries) >= c.maxBatchSize {
|
||||
c.flush(ctx, wr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var (
|
||||
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
|
||||
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
|
||||
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
|
||||
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
|
||||
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
|
||||
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
|
||||
|
||||
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
|
||||
return float64(*concurrency)
|
||||
})
|
||||
)
|
||||
|
||||
// flush is a blocking function that marshals WriteRequest and sends
|
||||
// it to remote-write endpoint. Flush performs limited amount of retries
|
||||
// if request fails.
|
||||
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
defer prompbmarshal.ResetWriteRequest(wr)
|
||||
defer bufferFlushDuration.UpdateDuration(time.Now())
|
||||
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
logger.Errorf("failed to marshal WriteRequest: %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
b := snappy.Encode(nil, data)
|
||||
|
||||
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
|
||||
if retryInterval > maxRetryInterval {
|
||||
retryInterval = maxRetryInterval
|
||||
}
|
||||
timeStart := time.Now()
|
||||
defer sendDuration.Add(time.Since(timeStart).Seconds())
|
||||
L:
|
||||
for attempts := 0; ; attempts++ {
|
||||
err := c.send(ctx, b)
|
||||
if err == nil {
|
||||
sentRows.Add(len(wr.Timeseries))
|
||||
sentBytes.Add(len(b))
|
||||
return
|
||||
}
|
||||
|
||||
_, isNotRetriable := err.(*nonRetriableError)
|
||||
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
|
||||
|
||||
if isNotRetriable {
|
||||
// exit fast if error isn't retriable
|
||||
break
|
||||
}
|
||||
|
||||
// check if request has been cancelled before backoff
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
|
||||
break L
|
||||
default:
|
||||
}
|
||||
|
||||
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
|
||||
if timeLeftForRetries < 0 {
|
||||
// the max retry time has passed, so we give up
|
||||
break
|
||||
}
|
||||
|
||||
if retryInterval > timeLeftForRetries {
|
||||
retryInterval = timeLeftForRetries
|
||||
}
|
||||
// sleeping to prevent remote db hammering
|
||||
time.Sleep(retryInterval)
|
||||
retryInterval *= 2
|
||||
|
||||
}
|
||||
|
||||
droppedRows.Add(len(wr.Timeseries))
|
||||
droppedBytes.Add(len(b))
|
||||
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
|
||||
len(wr.Timeseries))
|
||||
}
|
||||
|
||||
func (c *Client) send(ctx context.Context, data []byte) error {
|
||||
r := bytes.NewReader(data)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if c.authCfg != nil {
|
||||
c.authCfg.SetHeaders(req, true)
|
||||
}
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
|
||||
// according to https://prometheus.io/docs/concepts/remote_write_spec/
|
||||
// Prometheus remote Write compatible receivers MUST
|
||||
switch resp.StatusCode / 100 {
|
||||
case 2:
|
||||
// respond with a HTTP 2xx status code when the write is successful.
|
||||
return nil
|
||||
case 4:
|
||||
if resp.StatusCode != http.StatusTooManyRequests {
|
||||
// MUST NOT retry write requests on HTTP 4xx responses other than 429
|
||||
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)}
|
||||
}
|
||||
fallthrough
|
||||
default:
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
||||
}
|
||||
|
||||
type nonRetriableError struct {
|
||||
err error
|
||||
}
|
||||
|
||||
func (e *nonRetriableError) Error() string {
|
||||
return e.err.Error()
|
||||
// RWClient represents an HTTP client for pushing data via remote write protocol
|
||||
type RWClient interface {
|
||||
// Push pushes the give time series to remote storage
|
||||
Push(s prompbmarshal.TimeSeries) error
|
||||
// Close stops the client. Client can't be reused after Close call.
|
||||
Close() error
|
||||
}
|
||||
|
||||
@@ -1,19 +1,16 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -33,17 +30,17 @@ var (
|
||||
"Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode.")
|
||||
)
|
||||
|
||||
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error {
|
||||
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw remotewrite.RWClient) error {
|
||||
if *replayMaxDatapoints < 1 {
|
||||
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
|
||||
}
|
||||
tFrom, err := time.Parse(time.RFC3339, *replayFrom)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse %q: %s", *replayFrom, err)
|
||||
return fmt.Errorf("failed to parse %q: %w", *replayFrom, err)
|
||||
}
|
||||
tTo, err := time.Parse(time.RFC3339, *replayTo)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse %q: %s", *replayTo, err)
|
||||
return fmt.Errorf("failed to parse %q: %w", *replayTo, err)
|
||||
}
|
||||
if !tTo.After(tFrom) {
|
||||
return fmt.Errorf("replay.timeTo must be bigger than replay.timeFrom")
|
||||
@@ -68,8 +65,8 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
|
||||
|
||||
var total int
|
||||
for _, cfg := range groupsCfg {
|
||||
ng := newGroup(cfg, qb, *evaluationInterval, labels)
|
||||
total += ng.replay(tFrom, tTo, rw)
|
||||
ng := rule.NewGroup(cfg, qb, *evaluationInterval, labels)
|
||||
total += ng.Replay(tFrom, tTo, rw, *replayMaxDatapoints, *replayRuleRetryAttempts, *replayRulesDelay, *disableProgressBar)
|
||||
}
|
||||
logger.Infof("replay finished! Imported %d samples", total)
|
||||
if rw != nil {
|
||||
@@ -77,97 +74,3 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
|
||||
var total int
|
||||
step := g.Interval * time.Duration(*replayMaxDatapoints)
|
||||
ri := rangeIterator{start: start, end: end, step: step}
|
||||
iterations := int(end.Sub(start)/step) + 1
|
||||
fmt.Printf("\nGroup %q"+
|
||||
"\ninterval: \t%v"+
|
||||
"\nrequests to make: \t%d"+
|
||||
"\nmax range per request: \t%v\n",
|
||||
g.Name, g.Interval, iterations, step)
|
||||
if g.Limit > 0 {
|
||||
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
|
||||
g.Limit)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
|
||||
var bar *pb.ProgressBar
|
||||
if !*disableProgressBar {
|
||||
bar = pb.StartNew(iterations)
|
||||
}
|
||||
ri.reset()
|
||||
for ri.next() {
|
||||
n, err := replayRule(rule, ri.s, ri.e, rw)
|
||||
if err != nil {
|
||||
logger.Fatalf("rule %q: %s", rule, err)
|
||||
}
|
||||
total += n
|
||||
if bar != nil {
|
||||
bar.Increment()
|
||||
}
|
||||
}
|
||||
if bar != nil {
|
||||
bar.Finish()
|
||||
}
|
||||
// sleep to let remote storage to flush data on-disk
|
||||
// so chained rules could be calculated correctly
|
||||
time.Sleep(*replayRulesDelay)
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
|
||||
var err error
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for i := 0; i < *replayRuleRetryAttempts; i++ {
|
||||
tss, err = rule.ExecRange(context.Background(), start, end)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, rule, err)
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
if err != nil { // means all attempts failed
|
||||
return 0, err
|
||||
}
|
||||
if len(tss) < 1 {
|
||||
return 0, nil
|
||||
}
|
||||
var n int
|
||||
for _, ts := range tss {
|
||||
if err := rw.Push(ts); err != nil {
|
||||
return n, fmt.Errorf("remote write failure: %s", err)
|
||||
}
|
||||
n += len(ts.Samples)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
type rangeIterator struct {
|
||||
step time.Duration
|
||||
start, end time.Time
|
||||
|
||||
iter int
|
||||
s, e time.Time
|
||||
}
|
||||
|
||||
func (ri *rangeIterator) reset() {
|
||||
ri.iter = 0
|
||||
ri.s, ri.e = time.Time{}, time.Time{}
|
||||
}
|
||||
|
||||
func (ri *rangeIterator) next() bool {
|
||||
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
|
||||
if !ri.end.After(ri.s) {
|
||||
return false
|
||||
}
|
||||
ri.e = ri.s.Add(ri.step)
|
||||
if ri.e.After(ri.end) {
|
||||
ri.e = ri.end
|
||||
}
|
||||
ri.iter++
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ import (
|
||||
)
|
||||
|
||||
type fakeReplayQuerier struct {
|
||||
fakeQuerier
|
||||
datasource.FakeQuerier
|
||||
registry map[string]map[string]struct{}
|
||||
}
|
||||
|
||||
@@ -170,81 +170,3 @@ func TestReplay(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRangeIterator(t *testing.T) {
|
||||
testCases := []struct {
|
||||
ri rangeIterator
|
||||
result [][2]time.Time
|
||||
}{
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 5 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 45 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
|
||||
step: time.Second,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range testCases {
|
||||
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
|
||||
var j int
|
||||
for tc.ri.next() {
|
||||
if len(tc.result) < j+1 {
|
||||
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
|
||||
j, tc.ri.s, tc.ri.e)
|
||||
}
|
||||
s, e := tc.ri.s, tc.ri.e
|
||||
expS, expE := tc.result[j][0], tc.result[j][1]
|
||||
if s != expS {
|
||||
t.Fatalf("expected to get start=%v; got %v", expS, s)
|
||||
}
|
||||
if e != expE {
|
||||
t.Fatalf("expected to get end=%v; got %v", expE, e)
|
||||
}
|
||||
j++
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func parseTime(t *testing.T, s string) time.Time {
|
||||
t.Helper()
|
||||
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
return tt
|
||||
}
|
||||
|
||||
@@ -1,118 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// ID returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// Exec executes the rule with given context at the given timestamp and limit.
|
||||
// returns an err if number of resulting time series exceeds the limit.
|
||||
Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
|
||||
// ExecRange executes the rule on the given time range.
|
||||
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
// ToAPI converts Rule into APIRule
|
||||
ToAPI() APIRule
|
||||
// Close performs the shutdown procedures for rule
|
||||
// such as metrics unregister
|
||||
Close()
|
||||
}
|
||||
|
||||
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
|
||||
|
||||
type ruleState struct {
|
||||
sync.RWMutex
|
||||
entries []ruleStateEntry
|
||||
cur int
|
||||
}
|
||||
|
||||
type ruleStateEntry struct {
|
||||
// stores last moment of time rule.Exec was called
|
||||
time time.Time
|
||||
// stores the timesteamp with which rule.Exec was called
|
||||
at time.Time
|
||||
// stores the duration of the last rule.Exec call
|
||||
duration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health ruleState
|
||||
err error
|
||||
// stores the number of samples returned during
|
||||
// the last evaluation
|
||||
samples int
|
||||
// stores the number of time series fetched during
|
||||
// the last evaluation.
|
||||
// Is supported by VictoriaMetrics only, starting from v1.90.0
|
||||
// If seriesFetched == nil, then this attribute was missing in
|
||||
// datasource response (unsupported).
|
||||
seriesFetched *int
|
||||
// stores the curl command reflecting the HTTP request used during rule.Exec
|
||||
curl string
|
||||
}
|
||||
|
||||
func newRuleState(size int) *ruleState {
|
||||
if size < 1 {
|
||||
size = 1
|
||||
}
|
||||
return &ruleState{
|
||||
entries: make([]ruleStateEntry, size),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ruleState) getLast() ruleStateEntry {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
return s.entries[s.cur]
|
||||
}
|
||||
|
||||
func (s *ruleState) size() int {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
return len(s.entries)
|
||||
}
|
||||
|
||||
func (s *ruleState) getAll() []ruleStateEntry {
|
||||
entries := make([]ruleStateEntry, 0)
|
||||
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
|
||||
cur := s.cur
|
||||
for {
|
||||
e := s.entries[cur]
|
||||
if !e.time.IsZero() || !e.at.IsZero() {
|
||||
entries = append(entries, e)
|
||||
}
|
||||
cur--
|
||||
if cur < 0 {
|
||||
cur = cap(s.entries) - 1
|
||||
}
|
||||
if cur == s.cur {
|
||||
return entries
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ruleState) add(e ruleStateEntry) {
|
||||
s.Lock()
|
||||
defer s.Unlock()
|
||||
|
||||
s.cur++
|
||||
if s.cur > cap(s.entries)-1 {
|
||||
s.cur = 0
|
||||
}
|
||||
s.entries[s.cur] = e
|
||||
}
|
||||
@@ -1,11 +1,10 @@
|
||||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -55,7 +54,8 @@ type alertingRuleMetrics struct {
|
||||
seriesFetched *utils.Gauge
|
||||
}
|
||||
|
||||
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
||||
// NewAlertingRule creates a new AlertingRule
|
||||
func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
||||
ar := &AlertingRule{
|
||||
Type: group.Type,
|
||||
RuleID: cfg.ID,
|
||||
@@ -80,13 +80,18 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||
metrics: &alertingRuleMetrics{},
|
||||
}
|
||||
|
||||
entrySize := *ruleUpdateEntriesLimit
|
||||
if cfg.UpdateEntriesLimit != nil {
|
||||
ar.state = newRuleState(*cfg.UpdateEntriesLimit)
|
||||
} else {
|
||||
ar.state = newRuleState(*ruleUpdateEntriesLimit)
|
||||
entrySize = *cfg.UpdateEntriesLimit
|
||||
}
|
||||
if entrySize < 1 {
|
||||
entrySize = 1
|
||||
}
|
||||
ar.state = &ruleState{
|
||||
entries: make([]StateEntry, entrySize),
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
||||
labels := fmt.Sprintf(`alertname=%q, group=%q, file=%q, id="%d"`, ar.Name, group.Name, group.File, ar.ID())
|
||||
ar.metrics.pending = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.alertsMu.RLock()
|
||||
@@ -114,7 +119,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
e := ar.state.getLast()
|
||||
if e.err == nil {
|
||||
if e.Err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
@@ -122,28 +127,28 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
e := ar.state.getLast()
|
||||
return float64(e.samples)
|
||||
return float64(e.Samples)
|
||||
})
|
||||
ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels),
|
||||
func() float64 {
|
||||
e := ar.state.getLast()
|
||||
if e.seriesFetched == nil {
|
||||
if e.SeriesFetched == nil {
|
||||
// means seriesFetched is unsupported
|
||||
return -1
|
||||
}
|
||||
seriesFetched := float64(*e.seriesFetched)
|
||||
if seriesFetched == 0 && e.samples > 0 {
|
||||
seriesFetched := float64(*e.SeriesFetched)
|
||||
if seriesFetched == 0 && e.Samples > 0 {
|
||||
// `alert: 0.95` will fetch no series
|
||||
// but will get one time series in response.
|
||||
seriesFetched = float64(e.samples)
|
||||
seriesFetched = float64(e.Samples)
|
||||
}
|
||||
return seriesFetched
|
||||
})
|
||||
return ar
|
||||
}
|
||||
|
||||
// Close unregisters rule metrics
|
||||
func (ar *AlertingRule) Close() {
|
||||
// close unregisters rule metrics
|
||||
func (ar *AlertingRule) close() {
|
||||
ar.metrics.active.Unregister()
|
||||
ar.metrics.pending.Unregister()
|
||||
ar.metrics.errors.Unregister()
|
||||
@@ -162,6 +167,27 @@ func (ar *AlertingRule) ID() uint64 {
|
||||
return ar.RuleID
|
||||
}
|
||||
|
||||
// GetAlerts returns active alerts of rule
|
||||
func (ar *AlertingRule) GetAlerts() []*notifier.Alert {
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
var alerts []*notifier.Alert
|
||||
for _, a := range ar.alerts {
|
||||
alerts = append(alerts, a)
|
||||
}
|
||||
return alerts
|
||||
}
|
||||
|
||||
// GetAlert returns alert if id exists
|
||||
func (ar *AlertingRule) GetAlert(id uint64) *notifier.Alert {
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
if ar.alerts == nil {
|
||||
return nil
|
||||
}
|
||||
return ar.alerts[id]
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string, args ...interface{}) {
|
||||
if !ar.Debug {
|
||||
return
|
||||
@@ -188,6 +214,26 @@ func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string
|
||||
logger.Infof("%s", prefix+msg)
|
||||
}
|
||||
|
||||
// updateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (ar *AlertingRule) updateWith(r Rule) error {
|
||||
nr, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
||||
}
|
||||
ar.Expr = nr.Expr
|
||||
ar.For = nr.For
|
||||
ar.KeepFiringFor = nr.KeepFiringFor
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
ar.EvalInterval = nr.EvalInterval
|
||||
ar.Debug = nr.Debug
|
||||
ar.q = nr.q
|
||||
ar.state = nr.state
|
||||
return nil
|
||||
}
|
||||
|
||||
type labelSet struct {
|
||||
// origin labels extracted from received time series
|
||||
// plus extra labels (group labels, service labels like alertNameLabel).
|
||||
@@ -223,7 +269,7 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
|
||||
Expr: ar.Expr,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
return nil, fmt.Errorf("failed to expand labels: %w", err)
|
||||
}
|
||||
for k, v := range extraLabels {
|
||||
ls.processed[k] = v
|
||||
@@ -248,25 +294,34 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
|
||||
return ls, nil
|
||||
}
|
||||
|
||||
// ExecRange executes alerting rule on the given time range similarly to Exec.
|
||||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
// It returns ALERT and ALERT_FOR_STATE time series as result.
|
||||
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
// execRange executes alerting rule on the given time range similarly to exec.
|
||||
// When making consecutive calls make sure to respect time linearity for start and end params,
|
||||
// as this function modifies AlertingRule alerts state.
|
||||
// It is not thread safe.
|
||||
// It returns ALERT and ALERT_FOR_STATE time series as a result.
|
||||
func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var result []prompbmarshal.TimeSeries
|
||||
holdAlertState := make(map[uint64]*notifier.Alert)
|
||||
qFn := func(query string) ([]datasource.Metric, error) {
|
||||
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
|
||||
}
|
||||
for _, s := range res.Data {
|
||||
ls, err := ar.toLabels(s, qFn)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
}
|
||||
h := hash(ls.processed)
|
||||
a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create alert: %s", err)
|
||||
return nil, fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
if ar.For == 0 { // if alert is instant
|
||||
|
||||
// if alert is instant, For: 0
|
||||
if ar.For == 0 {
|
||||
a.State = notifier.StateFiring
|
||||
for i := range s.Values {
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
@@ -278,18 +333,32 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
|
||||
prevT := time.Time{}
|
||||
for i := range s.Values {
|
||||
at := time.Unix(s.Timestamps[i], 0)
|
||||
// try to restore alert's state on the first iteration
|
||||
if at.Equal(start) {
|
||||
if _, ok := ar.alerts[h]; ok {
|
||||
a = ar.alerts[h]
|
||||
prevT = at
|
||||
}
|
||||
}
|
||||
if at.Sub(prevT) > ar.EvalInterval {
|
||||
// reset to Pending if there are gaps > EvalInterval between DPs
|
||||
a.State = notifier.StatePending
|
||||
a.ActiveAt = at
|
||||
} else if at.Sub(a.ActiveAt) >= ar.For {
|
||||
a.Start = time.Time{}
|
||||
} else if at.Sub(a.ActiveAt) >= ar.For && a.State != notifier.StateFiring {
|
||||
a.State = notifier.StateFiring
|
||||
a.Start = at
|
||||
}
|
||||
prevT = at
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
|
||||
// save alert's state on last iteration, so it can be used on the next execRange call
|
||||
if at.Equal(end) {
|
||||
holdAlertState[h] = a
|
||||
}
|
||||
}
|
||||
}
|
||||
ar.alerts = holdAlertState
|
||||
return result, nil
|
||||
}
|
||||
|
||||
@@ -297,19 +366,19 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
|
||||
// is kept in memory state and consequently repeatedly sent to the AlertManager.
|
||||
const resolvedRetention = 15 * time.Minute
|
||||
|
||||
// Exec executes AlertingRule expression via the given Querier.
|
||||
// exec executes AlertingRule expression via the given Querier.
|
||||
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
||||
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
res, req, err := ar.q.Query(ctx, ar.Expr, ts)
|
||||
curState := ruleStateEntry{
|
||||
time: start,
|
||||
at: ts,
|
||||
duration: time.Since(start),
|
||||
samples: len(res.Data),
|
||||
seriesFetched: res.SeriesFetched,
|
||||
err: err,
|
||||
curl: requestToCurl(req),
|
||||
curState := StateEntry{
|
||||
Time: start,
|
||||
At: ts,
|
||||
Duration: time.Since(start),
|
||||
Samples: len(res.Data),
|
||||
SeriesFetched: res.SeriesFetched,
|
||||
Err: err,
|
||||
Curl: requestToCurl(req),
|
||||
}
|
||||
|
||||
defer func() {
|
||||
@@ -323,7 +392,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||
}
|
||||
|
||||
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration)
|
||||
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.Samples, curState.Duration)
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
@@ -342,15 +411,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||
for _, m := range res.Data {
|
||||
ls, err := ar.toLabels(m, qFn)
|
||||
if err != nil {
|
||||
curState.err = fmt.Errorf("failed to expand labels: %s", err)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("failed to expand labels: %w", err)
|
||||
return nil, curState.Err
|
||||
}
|
||||
h := hash(ls.processed)
|
||||
if _, ok := updated[h]; ok {
|
||||
// duplicate may be caused by extra labels
|
||||
// conflicting with the metric labels
|
||||
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, curState.Err
|
||||
}
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
@@ -373,8 +442,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||
}
|
||||
a, err := ar.newAlert(m, ls, start, qFn)
|
||||
if err != nil {
|
||||
curState.err = fmt.Errorf("failed to create alert: %w", err)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("failed to create alert: %w", err)
|
||||
return nil, curState.Err
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
@@ -423,8 +492,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||
}
|
||||
if limit > 0 && numActivePending > limit {
|
||||
ar.alerts = map[uint64]*notifier.Alert{}
|
||||
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
return nil, curState.Err
|
||||
}
|
||||
return ar.toTimeSeries(ts.Unix()), nil
|
||||
}
|
||||
@@ -441,26 +510,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries
|
||||
return tss
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (ar *AlertingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
||||
}
|
||||
ar.Expr = nr.Expr
|
||||
ar.For = nr.For
|
||||
ar.KeepFiringFor = nr.KeepFiringFor
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
ar.EvalInterval = nr.EvalInterval
|
||||
ar.Debug = nr.Debug
|
||||
ar.q = nr.q
|
||||
ar.state = nr.state
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(labels map[string]string) uint64 {
|
||||
hash := fnv.New64a()
|
||||
@@ -487,7 +536,7 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
|
||||
if ls == nil {
|
||||
ls, err = ar.toLabels(m, qFn)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
return nil, fmt.Errorf("failed to expand labels: %w", err)
|
||||
}
|
||||
}
|
||||
a := ¬ifier.Alert{
|
||||
@@ -503,102 +552,6 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
|
||||
return a, err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// ToAPI returns Rule representation in form of APIRule
|
||||
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
|
||||
func (ar *AlertingRule) ToAPI() APIRule {
|
||||
lastState := ar.state.getLast()
|
||||
r := APIRule{
|
||||
Type: "alerting",
|
||||
DatasourceType: ar.Type.String(),
|
||||
Name: ar.Name,
|
||||
Query: ar.Expr,
|
||||
Duration: ar.For.Seconds(),
|
||||
KeepFiringFor: ar.KeepFiringFor.Seconds(),
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
LastEvaluation: lastState.time,
|
||||
EvaluationTime: lastState.duration.Seconds(),
|
||||
Health: "ok",
|
||||
State: "inactive",
|
||||
Alerts: ar.AlertsToAPI(),
|
||||
LastSamples: lastState.samples,
|
||||
LastSeriesFetched: lastState.seriesFetched,
|
||||
MaxUpdates: ar.state.size(),
|
||||
Updates: ar.state.getAll(),
|
||||
Debug: ar.Debug,
|
||||
|
||||
// encode as strings to avoid rounding in JSON
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
}
|
||||
if lastState.err != nil {
|
||||
r.LastError = lastState.err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
// satisfy APIRule.State logic
|
||||
if len(r.Alerts) > 0 {
|
||||
r.State = notifier.StatePending.String()
|
||||
stateFiring := notifier.StateFiring.String()
|
||||
for _, a := range r.Alerts {
|
||||
if a.State == stateFiring {
|
||||
r.State = stateFiring
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// AlertsToAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.alertsMu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.alertsMu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
aa := &APIAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
RuleID: fmt.Sprintf("%d", ar.RuleID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: ar.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.ActiveAt,
|
||||
Restored: a.Restored,
|
||||
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
|
||||
}
|
||||
if alertURLGeneratorFn != nil {
|
||||
aa.SourceLink = alertURLGeneratorFn(a)
|
||||
}
|
||||
if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
|
||||
aa.Stabilizing = true
|
||||
}
|
||||
return aa
|
||||
}
|
||||
|
||||
const (
|
||||
// alertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
@@ -646,10 +599,10 @@ func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.Time
|
||||
return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels)
|
||||
}
|
||||
|
||||
// Restore restores the value of ActiveAt field for active alerts,
|
||||
// restore restores the value of ActiveAt field for active alerts,
|
||||
// based on previously written time series `alertForStateMetricName`.
|
||||
// Only rules with For > 0 can be restored.
|
||||
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
|
||||
func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
|
||||
if ar.For < 1 {
|
||||
return nil
|
||||
}
|
||||
@@ -661,44 +614,41 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts ti
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, a := range ar.alerts {
|
||||
nameStr := fmt.Sprintf("%s=%q", alertNameLabel, ar.Name)
|
||||
if !*disableAlertGroupLabel {
|
||||
nameStr = fmt.Sprintf("%s=%q,%s=%q", alertGroupNameLabel, ar.GroupName, alertNameLabel, ar.Name)
|
||||
}
|
||||
var labelsFilter string
|
||||
for k, v := range ar.Labels {
|
||||
labelsFilter += fmt.Sprintf(",%s=%q", k, v)
|
||||
}
|
||||
expr := fmt.Sprintf("last_over_time(%s{%s%s}[%ds])",
|
||||
alertForStateMetricName, nameStr, labelsFilter, int(lookback.Seconds()))
|
||||
|
||||
res, _, err := q.Query(ctx, expr, ts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to execute restore query %q: %w ", expr, err)
|
||||
}
|
||||
|
||||
if len(res.Data) < 1 {
|
||||
ar.logDebugf(ts, nil, "no response was received from restore query")
|
||||
return nil
|
||||
}
|
||||
for _, series := range res.Data {
|
||||
series.DelLabel("__name__")
|
||||
labelSet := make(map[string]string, len(series.Labels))
|
||||
for _, v := range series.Labels {
|
||||
labelSet[v.Name] = v.Value
|
||||
}
|
||||
id := hash(labelSet)
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if a.Restored || a.State != notifier.StatePending {
|
||||
continue
|
||||
}
|
||||
|
||||
var labelsFilter []string
|
||||
for k, v := range a.Labels {
|
||||
labelsFilter = append(labelsFilter, fmt.Sprintf("%s=%q", k, v))
|
||||
}
|
||||
sort.Strings(labelsFilter)
|
||||
expr := fmt.Sprintf("last_over_time(%s{%s}[%ds])",
|
||||
alertForStateMetricName, strings.Join(labelsFilter, ","), int(lookback.Seconds()))
|
||||
|
||||
ar.logDebugf(ts, nil, "restoring alert state via query %q", expr)
|
||||
|
||||
res, _, err := q.Query(ctx, expr, ts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
qMetrics := res.Data
|
||||
if len(qMetrics) < 1 {
|
||||
ar.logDebugf(ts, nil, "no response was received from restore query")
|
||||
continue
|
||||
}
|
||||
|
||||
// only one series expected in response
|
||||
m := qMetrics[0]
|
||||
// __name__ supposed to be alertForStateMetricName
|
||||
m.DelLabel("__name__")
|
||||
|
||||
// we assume that restore query contains all label matchers,
|
||||
// so all received labels will match anyway if their number is equal.
|
||||
if len(m.Labels) != len(a.Labels) {
|
||||
ar.logDebugf(ts, nil, "state restore query returned not expected label-set %v", m.Labels)
|
||||
continue
|
||||
}
|
||||
a.ActiveAt = time.Unix(int64(m.Values[0]), 0)
|
||||
a.ActiveAt = time.Unix(int64(series.Values[0]), 0)
|
||||
a.Restored = true
|
||||
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt)
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -303,13 +303,13 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
fakeGroup := Group{Name: "TestRule_Exec"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
tc.rule.q = fq
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
for i, step := range tc.steps {
|
||||
fq.reset()
|
||||
fq.add(step...)
|
||||
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
|
||||
fq.Reset()
|
||||
fq.Add(step...)
|
||||
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
// artificial delay between applying steps
|
||||
@@ -346,15 +346,18 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
fakeGroup := Group{Name: "TestRule_ExecRange"}
|
||||
testCases := []struct {
|
||||
rule *AlertingRule
|
||||
data []datasource.Metric
|
||||
expAlerts []*notifier.Alert
|
||||
rule *AlertingRule
|
||||
data []datasource.Metric
|
||||
expAlerts []*notifier.Alert
|
||||
expHoldAlertStateAlerts map[uint64]*notifier.Alert
|
||||
}{
|
||||
{
|
||||
newTestAlertingRule("empty", 0),
|
||||
[]datasource.Metric{},
|
||||
nil,
|
||||
nil,
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("empty labels", 0),
|
||||
@@ -364,6 +367,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StateFiring},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing", 0),
|
||||
@@ -376,6 +380,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
State: notifier.StateFiring,
|
||||
},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing-on-range", 0),
|
||||
@@ -387,6 +392,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
{State: notifier.StateFiring},
|
||||
{State: notifier.StateFiring},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending", time.Second),
|
||||
@@ -398,6 +404,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(3, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
|
||||
},
|
||||
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-pending"}): {
|
||||
GroupID: fakeGroup.ID(),
|
||||
Name: "for-pending",
|
||||
Labels: map[string]string{"alertname": "for-pending"},
|
||||
Annotations: map[string]string{},
|
||||
State: notifier.StatePending,
|
||||
ActiveAt: time.Unix(5, 0),
|
||||
Value: 1,
|
||||
For: time.Second,
|
||||
}},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-firing", 3*time.Second),
|
||||
@@ -409,6 +425,38 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
|
||||
},
|
||||
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-firing"}): {
|
||||
GroupID: fakeGroup.ID(),
|
||||
Name: "for-firing",
|
||||
Labels: map[string]string{"alertname": "for-firing"},
|
||||
Annotations: map[string]string{},
|
||||
State: notifier.StateFiring,
|
||||
ActiveAt: time.Unix(1, 0),
|
||||
Start: time.Unix(5, 0),
|
||||
Value: 1,
|
||||
For: 3 * time.Second,
|
||||
}},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-hold-pending", time.Second),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 2, 5}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
|
||||
},
|
||||
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-hold-pending"}): {
|
||||
GroupID: fakeGroup.ID(),
|
||||
Name: "for-hold-pending",
|
||||
Labels: map[string]string{"alertname": "for-hold-pending"},
|
||||
Annotations: map[string]string{},
|
||||
State: notifier.StatePending,
|
||||
ActiveAt: time.Unix(5, 0),
|
||||
Value: 1,
|
||||
For: time.Second,
|
||||
}},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
|
||||
@@ -422,12 +470,14 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(20, 0)},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("multi-series-for=>pending=>pending=>firing", 3*time.Second),
|
||||
newTestAlertingRule("multi-series", 3*time.Second),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
|
||||
{Values: []float64{1, 1}, Timestamps: []int64{1, 5},
|
||||
{
|
||||
Values: []float64{1, 1}, Timestamps: []int64{1, 5},
|
||||
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
|
||||
},
|
||||
},
|
||||
@@ -435,22 +485,49 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
|
||||
//
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
|
||||
{
|
||||
State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
}},
|
||||
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0),
|
||||
},
|
||||
},
|
||||
{
|
||||
State: notifier.StatePending, ActiveAt: time.Unix(5, 0),
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
}},
|
||||
},
|
||||
},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{"alertname": "multi-series"}): {
|
||||
GroupID: fakeGroup.ID(),
|
||||
Name: "multi-series",
|
||||
Labels: map[string]string{"alertname": "multi-series"},
|
||||
Annotations: map[string]string{},
|
||||
State: notifier.StateFiring,
|
||||
ActiveAt: time.Unix(1, 0),
|
||||
Start: time.Unix(5, 0),
|
||||
Value: 1,
|
||||
For: 3 * time.Second,
|
||||
},
|
||||
hash(map[string]string{"alertname": "multi-series", "foo": "bar"}): {
|
||||
GroupID: fakeGroup.ID(),
|
||||
Name: "multi-series",
|
||||
Labels: map[string]string{"alertname": "multi-series", "foo": "bar"},
|
||||
Annotations: map[string]string{},
|
||||
State: notifier.StatePending,
|
||||
ActiveAt: time.Unix(5, 0),
|
||||
Value: 1,
|
||||
For: 3 * time.Second,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRuleWithLabels("multi-series-firing", "source", "vm"),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1}, Timestamps: []int64{1, 100}},
|
||||
{Values: []float64{1, 1}, Timestamps: []int64{1, 5},
|
||||
{
|
||||
Values: []float64{1, 1}, Timestamps: []int64{1, 5},
|
||||
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
|
||||
},
|
||||
},
|
||||
@@ -471,16 +548,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
"source": "vm",
|
||||
}},
|
||||
},
|
||||
nil,
|
||||
},
|
||||
}
|
||||
fakeGroup := Group{Name: "TestRule_ExecRange"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
tc.rule.q = fq
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
fq.add(tc.data...)
|
||||
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
|
||||
fq.Add(tc.data...)
|
||||
gotTS, err := tc.rule.execRange(context.TODO(), time.Unix(1, 0), time.Unix(5, 0))
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
@@ -506,30 +583,35 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got)
|
||||
}
|
||||
}
|
||||
if tc.expHoldAlertStateAlerts != nil {
|
||||
if !reflect.DeepEqual(tc.expHoldAlertStateAlerts, tc.rule.alerts) {
|
||||
t.Fatalf("expected hold alerts state: \n%v but got \n%v", tc.expHoldAlertStateAlerts, tc.rule.alerts)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroup_Restore(t *testing.T) {
|
||||
defaultTS := time.Now()
|
||||
fqr := &fakeQuerierWithRegistry{}
|
||||
fqr := &datasource.FakeQuerierWithRegistry{}
|
||||
fn := func(rules []config.Rule, expAlerts map[uint64]*notifier.Alert) {
|
||||
t.Helper()
|
||||
defer fqr.reset()
|
||||
defer fqr.Reset()
|
||||
|
||||
for _, r := range rules {
|
||||
fqr.set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
|
||||
fqr.Set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
|
||||
}
|
||||
|
||||
fg := newGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
|
||||
fg := NewGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
nts := func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} }
|
||||
fg.start(context.Background(), nts, nil, fqr)
|
||||
nts := func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} }
|
||||
fg.Start(context.Background(), nts, nil, fqr)
|
||||
wg.Done()
|
||||
}()
|
||||
fg.close()
|
||||
fg.Close()
|
||||
wg.Wait()
|
||||
|
||||
gotAlerts := make(map[uint64]*notifier.Alert)
|
||||
@@ -558,6 +640,9 @@ func TestGroup_Restore(t *testing.T) {
|
||||
if got.ActiveAt != exp.ActiveAt {
|
||||
t.Fatalf("expected ActiveAt %v; got %v", exp.ActiveAt, got.ActiveAt)
|
||||
}
|
||||
if got.Name != exp.Name {
|
||||
t.Fatalf("expected alertname %q; got %q", exp.Name, got.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -573,44 +658,28 @@ func TestGroup_Restore(t *testing.T) {
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
|
||||
Name: "foo",
|
||||
ActiveAt: defaultTS,
|
||||
},
|
||||
})
|
||||
fqr.reset()
|
||||
fqr.Reset()
|
||||
|
||||
// one active alert with state restore
|
||||
ts := time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
|
||||
stateMetric("foo", ts))
|
||||
fn(
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
|
||||
ActiveAt: ts},
|
||||
Name: "foo",
|
||||
ActiveAt: ts,
|
||||
},
|
||||
})
|
||||
|
||||
// two rules, two active alerts, one with state restored
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
|
||||
stateMetric("foo", ts))
|
||||
fn(
|
||||
[]config.Rule{
|
||||
{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)},
|
||||
{Alert: "bar", Expr: "bar", For: promutils.NewDuration(time.Second)},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
|
||||
ActiveAt: defaultTS,
|
||||
},
|
||||
hash(map[string]string{alertNameLabel: "bar", alertGroupNameLabel: "TestRestore"}): {
|
||||
ActiveAt: ts},
|
||||
})
|
||||
|
||||
// two rules, two active alerts, two with state restored
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
|
||||
stateMetric("foo", ts))
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
|
||||
stateMetric("bar", ts))
|
||||
fn(
|
||||
[]config.Rule{
|
||||
@@ -619,74 +688,102 @@ func TestGroup_Restore(t *testing.T) {
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
|
||||
Name: "foo",
|
||||
ActiveAt: defaultTS,
|
||||
},
|
||||
hash(map[string]string{alertNameLabel: "bar", alertGroupNameLabel: "TestRestore"}): {
|
||||
Name: "bar",
|
||||
ActiveAt: ts,
|
||||
},
|
||||
})
|
||||
|
||||
// two rules, two active alerts, two with state restored
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
|
||||
stateMetric("foo", ts))
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
|
||||
stateMetric("bar", ts))
|
||||
fn(
|
||||
[]config.Rule{
|
||||
{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)},
|
||||
{Alert: "bar", Expr: "bar", For: promutils.NewDuration(time.Second)},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
|
||||
Name: "foo",
|
||||
ActiveAt: ts,
|
||||
},
|
||||
hash(map[string]string{alertNameLabel: "bar", alertGroupNameLabel: "TestRestore"}): {
|
||||
ActiveAt: ts},
|
||||
Name: "bar",
|
||||
ActiveAt: ts,
|
||||
},
|
||||
})
|
||||
|
||||
// one active alert but wrong state restore
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
|
||||
stateMetric("wrong alert", ts))
|
||||
fn(
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
|
||||
Name: "foo",
|
||||
ActiveAt: defaultTS,
|
||||
},
|
||||
})
|
||||
|
||||
// one active alert with labels
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
|
||||
stateMetric("foo", ts, "env", "dev"))
|
||||
fn(
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore", "env": "dev"}): {
|
||||
Name: "foo",
|
||||
ActiveAt: ts,
|
||||
},
|
||||
})
|
||||
|
||||
// one active alert with restore labels missmatch
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
|
||||
stateMetric("foo", ts, "env", "dev", "team", "foo"))
|
||||
fn(
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore", "env": "dev"}): {
|
||||
Name: "foo",
|
||||
ActiveAt: defaultTS,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func TestAlertingRule_Exec_Negative(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
ar := newTestAlertingRule("test", 0)
|
||||
ar.Labels = map[string]string{"job": "test"}
|
||||
ar.q = fq
|
||||
|
||||
// successful attempt
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
_, err := ar.Exec(context.TODO(), time.Now(), 0)
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
_, err := ar.exec(context.TODO(), time.Now(), 0)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// label `job` will collide with rule extra label and will make both time series equal
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
|
||||
_, err = ar.Exec(context.TODO(), time.Now(), 0)
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
|
||||
_, err = ar.exec(context.TODO(), time.Now(), 0)
|
||||
if !errors.Is(err, errDuplicate) {
|
||||
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
|
||||
}
|
||||
|
||||
fq.reset()
|
||||
fq.Reset()
|
||||
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
_, err = ar.Exec(context.TODO(), time.Now(), 0)
|
||||
fq.SetErr(errors.New(expErr))
|
||||
_, err = ar.exec(context.TODO(), time.Now(), 0)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
@@ -696,7 +793,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestAlertingRuleLimit(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
ar := newTestAlertingRule("test", 0)
|
||||
ar.Labels = map[string]string{"job": "test"}
|
||||
ar.q = fq
|
||||
@@ -728,15 +825,15 @@ func TestAlertingRuleLimit(t *testing.T) {
|
||||
err error
|
||||
timestamp = time.Now()
|
||||
)
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
|
||||
for _, testCase := range testCases {
|
||||
_, err = ar.Exec(context.TODO(), timestamp, testCase.limit)
|
||||
_, err = ar.exec(context.TODO(), timestamp, testCase.limit)
|
||||
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
fq.reset()
|
||||
fq.Reset()
|
||||
}
|
||||
|
||||
func TestAlertingRule_Template(t *testing.T) {
|
||||
@@ -844,7 +941,8 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
hash(map[string]string{
|
||||
alertNameLabel: "OriginLabels",
|
||||
alertGroupNameLabel: "Testing",
|
||||
"instance": "foo"}): {
|
||||
"instance": "foo",
|
||||
}): {
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "OriginLabels",
|
||||
alertGroupNameLabel: "Testing",
|
||||
@@ -860,19 +958,18 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
fakeGroup := Group{Name: "TestRule_Exec"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
tc.rule.q = fq
|
||||
tc.rule.state = newRuleState(10)
|
||||
fq.add(tc.metrics...)
|
||||
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
|
||||
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
|
||||
fq.Add(tc.metrics...)
|
||||
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
for hash, expAlert := range tc.expAlerts {
|
||||
gotAlert := tc.rule.alerts[hash]
|
||||
if gotAlert == nil {
|
||||
t.Fatalf("alert %d is missing; labels: %v; annotations: %v", hash, expAlert.Labels, expAlert.Annotations)
|
||||
break
|
||||
}
|
||||
if !reflect.DeepEqual(expAlert.Annotations, gotAlert.Annotations) {
|
||||
t.Fatalf("expected to have annotations %#v; got %#v", expAlert.Annotations, gotAlert.Annotations)
|
||||
@@ -980,7 +1077,7 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
||||
For: waitFor,
|
||||
EvalInterval: waitFor,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(10),
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
}
|
||||
return &rule
|
||||
}
|
||||
@@ -1,8 +1,10 @@
|
||||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"net/url"
|
||||
@@ -11,7 +13,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
@@ -21,16 +23,34 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
|
||||
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
|
||||
resendDelay = flag.Duration("rule.resendDelay", 0, "MiniMum amount of time to wait before resending an alert to notifier")
|
||||
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maxiMum duration for automatic alert expiration, "+
|
||||
"which by default is 4 times evaluationInterval of the parent group")
|
||||
evalDelay = flag.Duration("rule.evalDelay", 30*time.Second, "Adjustment of the `time` parameter for rule evaluation requests to compensate intentional data delay from the datasource."+
|
||||
"Normally, should be equal to `-search.latencyOffset` (cmd-line flag configured for VictoriaMetrics single-node or vmselect).")
|
||||
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
)
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
type Group struct {
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Type config.Type
|
||||
Interval time.Duration
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Type config.Type
|
||||
Interval time.Duration
|
||||
EvalOffset *time.Duration
|
||||
// EvalDelay will adjust timestamp for rule evaluation requests to compensate intentional query delay from datasource.
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155
|
||||
EvalDelay *time.Duration
|
||||
Limit int
|
||||
Concurrency int
|
||||
Checksum string
|
||||
@@ -51,6 +71,9 @@ type Group struct {
|
||||
evalCancel context.CancelFunc
|
||||
|
||||
metrics *groupMetrics
|
||||
// evalAlignment will make the timestamp of group query
|
||||
// requests be aligned with interval
|
||||
evalAlignment *bool
|
||||
}
|
||||
|
||||
type groupMetrics struct {
|
||||
@@ -92,7 +115,8 @@ func mergeLabels(groupName, ruleName string, set1, set2 map[string]string) map[s
|
||||
return r
|
||||
}
|
||||
|
||||
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
|
||||
// NewGroup returns a new group
|
||||
func NewGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
|
||||
g := &Group{
|
||||
Type: cfg.Type,
|
||||
Name: cfg.Name,
|
||||
@@ -105,6 +129,7 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
|
||||
Headers: make(map[string]string),
|
||||
NotifierHeaders: make(map[string]string),
|
||||
Labels: cfg.Labels,
|
||||
evalAlignment: cfg.EvalAlignment,
|
||||
|
||||
doneCh: make(chan struct{}),
|
||||
finishedCh: make(chan struct{}),
|
||||
@@ -116,6 +141,12 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
|
||||
if g.Concurrency < 1 {
|
||||
g.Concurrency = 1
|
||||
}
|
||||
if cfg.EvalOffset != nil {
|
||||
g.EvalOffset = &cfg.EvalOffset.D
|
||||
}
|
||||
if cfg.EvalDelay != nil {
|
||||
g.EvalDelay = &cfg.EvalDelay.D
|
||||
}
|
||||
for _, h := range cfg.Headers {
|
||||
g.Headers[h.Key] = h.Value
|
||||
}
|
||||
@@ -145,11 +176,11 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
|
||||
return g
|
||||
}
|
||||
|
||||
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
|
||||
if rule.Alert != "" {
|
||||
return newAlertingRule(qb, g, rule)
|
||||
func (g *Group) newRule(qb datasource.QuerierBuilder, r config.Rule) Rule {
|
||||
if r.Alert != "" {
|
||||
return NewAlertingRule(qb, g, r)
|
||||
}
|
||||
return newRecordingRule(qb, g, rule)
|
||||
return NewRecordingRule(qb, g, r)
|
||||
}
|
||||
|
||||
// ID return unique group ID that consists of
|
||||
@@ -163,11 +194,15 @@ func (g *Group) ID() uint64 {
|
||||
hash.Write([]byte("\xff"))
|
||||
hash.Write([]byte(g.Name))
|
||||
hash.Write([]byte(g.Type.Get()))
|
||||
hash.Write([]byte(g.Interval.String()))
|
||||
if g.EvalOffset != nil {
|
||||
hash.Write([]byte(g.EvalOffset.String()))
|
||||
}
|
||||
return hash.Sum64()
|
||||
}
|
||||
|
||||
// Restore restores alerts state for group rules
|
||||
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
|
||||
// restore restores alerts state for group rules
|
||||
func (g *Group) restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
|
||||
for _, rule := range g.Rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
@@ -183,7 +218,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
|
||||
Headers: g.Headers,
|
||||
Debug: ar.Debug,
|
||||
})
|
||||
if err := ar.Restore(ctx, q, ts, lookback); err != nil {
|
||||
if err := ar.restore(ctx, q, ts, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
|
||||
}
|
||||
}
|
||||
@@ -193,7 +228,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
|
||||
// updateWith updates existing group with
|
||||
// passed group object. This function ignores group
|
||||
// evaluation interval change. It supposed to be updated
|
||||
// in group.start function.
|
||||
// in group.Start function.
|
||||
// Not thread-safe.
|
||||
func (g *Group) updateWith(newGroup *Group) error {
|
||||
rulesRegistry := make(map[uint64]Rule)
|
||||
@@ -206,11 +241,11 @@ func (g *Group) updateWith(newGroup *Group) error {
|
||||
if !ok {
|
||||
// old rule is not present in the new list
|
||||
// so we mark it for removing
|
||||
g.Rules[i].Close()
|
||||
g.Rules[i].close()
|
||||
g.Rules[i] = nil
|
||||
continue
|
||||
}
|
||||
if err := or.UpdateWith(nr); err != nil {
|
||||
if err := or.updateWith(nr); err != nil {
|
||||
return err
|
||||
}
|
||||
delete(rulesRegistry, nr.ID())
|
||||
@@ -243,10 +278,10 @@ func (g *Group) updateWith(newGroup *Group) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// interruptEval interrupts in-flight rules evaluations
|
||||
// InterruptEval interrupts in-flight rules evaluations
|
||||
// within the group. It is expected that g.evalCancel
|
||||
// will be repopulated after the call.
|
||||
func (g *Group) interruptEval() {
|
||||
func (g *Group) InterruptEval() {
|
||||
g.mu.RLock()
|
||||
defer g.mu.RUnlock()
|
||||
|
||||
@@ -255,12 +290,13 @@ func (g *Group) interruptEval() {
|
||||
}
|
||||
}
|
||||
|
||||
func (g *Group) close() {
|
||||
// Close stops the group and it's rules, unregisters group metrics
|
||||
func (g *Group) Close() {
|
||||
if g.doneCh == nil {
|
||||
return
|
||||
}
|
||||
close(g.doneCh)
|
||||
g.interruptEval()
|
||||
g.InterruptEval()
|
||||
<-g.finishedCh
|
||||
|
||||
g.metrics.iterationDuration.Unregister()
|
||||
@@ -268,24 +304,25 @@ func (g *Group) close() {
|
||||
g.metrics.iterationMissed.Unregister()
|
||||
g.metrics.iterationInterval.Unregister()
|
||||
for _, rule := range g.Rules {
|
||||
rule.Close()
|
||||
rule.close()
|
||||
}
|
||||
}
|
||||
|
||||
var skipRandSleepOnGroupStart bool
|
||||
// SkipRandSleepOnGroupStart will skip random sleep delay in group first evaluation
|
||||
var SkipRandSleepOnGroupStart bool
|
||||
|
||||
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
|
||||
// Start starts group's evaluation
|
||||
func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
|
||||
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
||||
if !skipRandSleepOnGroupStart {
|
||||
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
|
||||
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
|
||||
if randSleep < sleepOffset {
|
||||
randSleep += uint64(g.Interval)
|
||||
}
|
||||
randSleep -= sleepOffset
|
||||
sleepTimer := time.NewTimer(time.Duration(randSleep))
|
||||
evalTS := time.Now()
|
||||
// sleep random duration to spread group rules evaluation
|
||||
// over time in order to reduce load on datasource.
|
||||
if !SkipRandSleepOnGroupStart {
|
||||
sleepBeforeStart := delayBeforeStart(evalTS, g.ID(), g.Interval, g.EvalOffset)
|
||||
g.infof("will start in %v", sleepBeforeStart)
|
||||
|
||||
sleepTimer := time.NewTimer(sleepBeforeStart)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
sleepTimer.Stop()
|
||||
@@ -295,18 +332,17 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
|
||||
return
|
||||
case <-sleepTimer.C:
|
||||
}
|
||||
evalTS = evalTS.Add(sleepBeforeStart)
|
||||
}
|
||||
|
||||
e := &executor{
|
||||
rw: rw,
|
||||
notifiers: nts,
|
||||
Rw: rw,
|
||||
Notifiers: nts,
|
||||
notifierHeaders: g.NotifierHeaders,
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
|
||||
evalTS := time.Now()
|
||||
|
||||
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
g.infof("started")
|
||||
|
||||
eval := func(ctx context.Context, ts time.Time) {
|
||||
g.metrics.iterationTotal.Inc()
|
||||
@@ -320,6 +356,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
|
||||
}
|
||||
|
||||
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
|
||||
ts = g.adjustReqTimestamp(ts)
|
||||
errs := e.execConcurrently(ctx, g.Rules, ts, g.Concurrency, resolveDuration, g.Limit)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
@@ -344,7 +381,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
|
||||
// restore the rules state after the first evaluation
|
||||
// so only active alerts can be restored.
|
||||
if rr != nil {
|
||||
err := g.Restore(ctx, rr, evalTS, *remoteReadLookBack)
|
||||
err := g.restore(ctx, rr, evalTS, *remoteReadLookBack)
|
||||
if err != nil {
|
||||
logger.Errorf("error while restoring ruleState for group %q: %s", g.Name, err)
|
||||
}
|
||||
@@ -375,19 +412,12 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
|
||||
continue
|
||||
}
|
||||
|
||||
// ensure that staleness is tracked or existing rules only
|
||||
// ensure that staleness is tracked for existing rules only
|
||||
e.purgeStaleSeries(g.Rules)
|
||||
|
||||
e.notifierHeaders = g.NotifierHeaders
|
||||
|
||||
if g.Interval != ng.Interval {
|
||||
g.Interval = ng.Interval
|
||||
t.Stop()
|
||||
t = time.NewTicker(g.Interval)
|
||||
evalTS = time.Now()
|
||||
}
|
||||
g.mu.Unlock()
|
||||
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
|
||||
g.infof("re-started")
|
||||
case <-t.C:
|
||||
missed := (time.Since(evalTS) / g.Interval) - 1
|
||||
if missed < 0 {
|
||||
@@ -405,6 +435,134 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateWith inserts new group to updateCh
|
||||
func (g *Group) UpdateWith(new *Group) {
|
||||
g.updateCh <- new
|
||||
}
|
||||
|
||||
// DeepCopy returns a deep copy of group
|
||||
func (g *Group) DeepCopy() *Group {
|
||||
g.mu.RLock()
|
||||
data, _ := json.Marshal(g)
|
||||
g.mu.RUnlock()
|
||||
newG := Group{}
|
||||
_ = json.Unmarshal(data, &newG)
|
||||
newG.Rules = g.Rules
|
||||
return &newG
|
||||
}
|
||||
|
||||
// delayBeforeStart returns a duration on the interval between [ts..ts+interval].
|
||||
// delayBeforeStart accounts for `offset`, so returned duration should be always
|
||||
// bigger than the `offset`.
|
||||
func delayBeforeStart(ts time.Time, key uint64, interval time.Duration, offset *time.Duration) time.Duration {
|
||||
var randSleep time.Duration
|
||||
randSleep = time.Duration(float64(interval) * (float64(key) / (1 << 64)))
|
||||
sleepOffset := time.Duration(ts.UnixNano() % interval.Nanoseconds())
|
||||
if randSleep < sleepOffset {
|
||||
randSleep += interval
|
||||
}
|
||||
randSleep -= sleepOffset
|
||||
// check if `ts` after randSleep is before `offset`,
|
||||
// if it is, add extra eval_offset to randSleep.
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3409.
|
||||
if offset != nil {
|
||||
tmpEvalTS := ts.Add(randSleep)
|
||||
if tmpEvalTS.Before(tmpEvalTS.Truncate(interval).Add(*offset)) {
|
||||
randSleep += *offset
|
||||
}
|
||||
}
|
||||
return randSleep
|
||||
}
|
||||
|
||||
func (g *Group) infof(format string, args ...interface{}) {
|
||||
msg := fmt.Sprintf(format, args...)
|
||||
logger.Infof("group %q %s; interval=%v; eval_offset=%v; concurrency=%d",
|
||||
g.Name, msg, g.Interval, g.EvalOffset, g.Concurrency)
|
||||
}
|
||||
|
||||
// Replay performs group replay
|
||||
func (g *Group) Replay(start, end time.Time, rw remotewrite.RWClient, maxDataPoint, replayRuleRetryAttempts int, replayDelay time.Duration, disableProgressBar bool) int {
|
||||
var total int
|
||||
step := g.Interval * time.Duration(maxDataPoint)
|
||||
ri := rangeIterator{start: start, end: end, step: step}
|
||||
iterations := int(end.Sub(start)/step) + 1
|
||||
fmt.Printf("\nGroup %q"+
|
||||
"\ninterval: \t%v"+
|
||||
"\nrequests to make: \t%d"+
|
||||
"\nmax range per request: \t%v\n",
|
||||
g.Name, g.Interval, iterations, step)
|
||||
if g.Limit > 0 {
|
||||
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
|
||||
g.Limit)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
|
||||
var bar *pb.ProgressBar
|
||||
if !disableProgressBar {
|
||||
bar = pb.StartNew(iterations)
|
||||
}
|
||||
ri.reset()
|
||||
for ri.next() {
|
||||
n, err := replayRule(rule, ri.s, ri.e, rw, replayRuleRetryAttempts)
|
||||
if err != nil {
|
||||
logger.Fatalf("rule %q: %s", rule, err)
|
||||
}
|
||||
total += n
|
||||
if bar != nil {
|
||||
bar.Increment()
|
||||
}
|
||||
}
|
||||
if bar != nil {
|
||||
bar.Finish()
|
||||
}
|
||||
// sleep to let remote storage to flush data on-disk
|
||||
// so chained rules could be calculated correctly
|
||||
time.Sleep(replayDelay)
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
// ExecOnce evaluates all the rules under group for once with given timestamp.
|
||||
func (g *Group) ExecOnce(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, evalTS time.Time) chan error {
|
||||
e := &executor{
|
||||
Rw: rw,
|
||||
Notifiers: nts,
|
||||
notifierHeaders: g.NotifierHeaders,
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
if len(g.Rules) < 1 {
|
||||
return nil
|
||||
}
|
||||
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
|
||||
return e.execConcurrently(ctx, g.Rules, evalTS, g.Concurrency, resolveDuration, g.Limit)
|
||||
}
|
||||
|
||||
type rangeIterator struct {
|
||||
step time.Duration
|
||||
start, end time.Time
|
||||
|
||||
iter int
|
||||
s, e time.Time
|
||||
}
|
||||
|
||||
func (ri *rangeIterator) reset() {
|
||||
ri.iter = 0
|
||||
ri.s, ri.e = time.Time{}, time.Time{}
|
||||
}
|
||||
|
||||
func (ri *rangeIterator) next() bool {
|
||||
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
|
||||
if !ri.end.After(ri.s) {
|
||||
return false
|
||||
}
|
||||
ri.e = ri.s.Add(ri.step)
|
||||
if ri.e.After(ri.end) {
|
||||
ri.e = ri.end
|
||||
}
|
||||
ri.iter++
|
||||
return true
|
||||
}
|
||||
|
||||
// getResolveDuration returns the duration after which firing alert
|
||||
// can be considered as resolved.
|
||||
func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Duration {
|
||||
@@ -418,11 +576,49 @@ func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Du
|
||||
return resolveDuration
|
||||
}
|
||||
|
||||
func (g *Group) adjustReqTimestamp(timestamp time.Time) time.Time {
|
||||
if g.EvalOffset != nil {
|
||||
// calculate the min timestamp on the evaluationInterval
|
||||
intervalStart := timestamp.Truncate(g.Interval)
|
||||
ts := intervalStart.Add(*g.EvalOffset)
|
||||
if timestamp.Before(ts) {
|
||||
// if passed timestamp is before the expected evaluation offset,
|
||||
// then we should adjust it to the previous evaluation round.
|
||||
// E.g. request with evaluationInterval=1h and evaluationOffset=30m
|
||||
// was evaluated at 11:20. Then the timestamp should be adjusted
|
||||
// to 10:30, to the previous evaluationInterval.
|
||||
return ts.Add(-g.Interval)
|
||||
}
|
||||
// when `eval_offset` is using, ts shouldn't be effect by `eval_alignment` and `eval_delay`
|
||||
// since it should be always aligned.
|
||||
return ts
|
||||
}
|
||||
|
||||
timestamp = timestamp.Add(-g.getEvalDelay())
|
||||
|
||||
// always apply the alignment as a last step
|
||||
if g.evalAlignment == nil || *g.evalAlignment {
|
||||
// align query time with interval to get similar result with grafana when plotting time series.
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049
|
||||
// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1232
|
||||
return timestamp.Truncate(g.Interval)
|
||||
}
|
||||
return timestamp
|
||||
}
|
||||
|
||||
func (g *Group) getEvalDelay() time.Duration {
|
||||
if g.EvalDelay != nil {
|
||||
return *g.EvalDelay
|
||||
}
|
||||
return *evalDelay
|
||||
}
|
||||
|
||||
// executor contains group's notify and rw configs
|
||||
type executor struct {
|
||||
notifiers func() []notifier.Notifier
|
||||
Notifiers func() []notifier.Notifier
|
||||
notifierHeaders map[string]string
|
||||
|
||||
rw *remotewrite.Client
|
||||
Rw remotewrite.RWClient
|
||||
|
||||
previouslySentSeriesToRWMu sync.Mutex
|
||||
// previouslySentSeriesToRW stores series sent to RW on previous iteration
|
||||
@@ -432,6 +628,7 @@ type executor struct {
|
||||
previouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
|
||||
}
|
||||
|
||||
// execConcurrently executes rules concurrently if concurrency>1
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.Time, concurrency int, resolveDuration time.Duration, limit int) chan error {
|
||||
res := make(chan error, len(rules))
|
||||
if concurrency == 1 {
|
||||
@@ -446,14 +643,14 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.T
|
||||
sem := make(chan struct{}, concurrency)
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
for _, rule := range rules {
|
||||
for _, r := range rules {
|
||||
sem <- struct{}{}
|
||||
wg.Add(1)
|
||||
go func(r Rule) {
|
||||
res <- e.exec(ctx, r, ts, resolveDuration, limit)
|
||||
<-sem
|
||||
wg.Done()
|
||||
}(rule)
|
||||
}(r)
|
||||
}
|
||||
wg.Wait()
|
||||
close(res)
|
||||
@@ -466,15 +663,12 @@ var (
|
||||
|
||||
execTotal = metrics.NewCounter(`vmalert_execution_total`)
|
||||
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
||||
|
||||
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||
remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
|
||||
)
|
||||
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
|
||||
func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
|
||||
execTotal.Inc()
|
||||
|
||||
tss, err := rule.Exec(ctx, ts, limit)
|
||||
tss, err := r.exec(ctx, ts, limit)
|
||||
if err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
// the context can be cancelled on graceful shutdown
|
||||
@@ -482,17 +676,15 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
|
||||
return nil
|
||||
}
|
||||
execErrors.Inc()
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", r, err)
|
||||
}
|
||||
|
||||
if e.rw != nil {
|
||||
if e.Rw != nil {
|
||||
pushToRW := func(tss []prompbmarshal.TimeSeries) error {
|
||||
var lastErr error
|
||||
for _, ts := range tss {
|
||||
remoteWriteTotal.Inc()
|
||||
if err := e.rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
lastErr = fmt.Errorf("rule %q: remote write failure: %w", rule, err)
|
||||
if err := e.Rw.Push(ts); err != nil {
|
||||
lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err)
|
||||
}
|
||||
}
|
||||
return lastErr
|
||||
@@ -501,13 +693,13 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
|
||||
return err
|
||||
}
|
||||
|
||||
staleSeries := e.getStaleSeries(rule, tss, ts)
|
||||
staleSeries := e.getStaleSeries(r, tss, ts)
|
||||
if err := pushToRW(staleSeries); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
ar, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
@@ -519,11 +711,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
errGr := new(utils.ErrGroup)
|
||||
for _, nt := range e.notifiers() {
|
||||
for _, nt := range e.Notifiers() {
|
||||
wg.Add(1)
|
||||
go func(nt notifier.Notifier) {
|
||||
if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil {
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", rule, nt.Addr(), err))
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", r, nt.Addr(), err))
|
||||
}
|
||||
wg.Done()
|
||||
}(nt)
|
||||
@@ -533,7 +725,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
|
||||
}
|
||||
|
||||
// getStaledSeries checks whether there are stale series from previously sent ones.
|
||||
func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
func (e *executor) getStaleSeries(r Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
ruleLabels := make(map[string][]prompbmarshal.Label, len(tss))
|
||||
for _, ts := range tss {
|
||||
// convert labels to strings so we can compare with previously sent series
|
||||
@@ -541,7 +733,7 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
|
||||
ruleLabels[key] = ts.Labels
|
||||
}
|
||||
|
||||
rID := rule.ID()
|
||||
rID := r.ID()
|
||||
var staleS []prompbmarshal.TimeSeries
|
||||
// check whether there are series which disappeared and need to be marked as stale
|
||||
e.previouslySentSeriesToRWMu.Lock()
|
||||
@@ -1,16 +1,22 @@
|
||||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
@@ -19,7 +25,15 @@ import (
|
||||
func init() {
|
||||
// Disable rand sleep on group start during tests in order to speed up test execution.
|
||||
// Rand sleep is needed only in prod code.
|
||||
skipRandSleepOnGroupStart = true
|
||||
SkipRandSleepOnGroupStart = true
|
||||
}
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
if err := templates.Load([]string{}, true); err != nil {
|
||||
fmt.Println("failed to load template for test")
|
||||
os.Exit(1)
|
||||
}
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestUpdateWith(t *testing.T) {
|
||||
@@ -35,18 +49,19 @@ func TestUpdateWith(t *testing.T) {
|
||||
},
|
||||
{
|
||||
"update alerting rule",
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 0",
|
||||
For: promutils.NewDuration(time.Second),
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
[]config.Rule{
|
||||
{
|
||||
Alert: "foo",
|
||||
Expr: "up > 0",
|
||||
For: promutils.NewDuration(time.Second),
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value|humanize }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": "{{ $value|humanize }}",
|
||||
"description": "{{$labels}}",
|
||||
},
|
||||
},
|
||||
{
|
||||
Alert: "bar",
|
||||
Expr: "up > 0",
|
||||
@@ -54,7 +69,8 @@ func TestUpdateWith(t *testing.T) {
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
[]config.Rule{
|
||||
{
|
||||
Alert: "foo",
|
||||
@@ -75,7 +91,8 @@ func TestUpdateWith(t *testing.T) {
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"update recording rule",
|
||||
@@ -134,7 +151,7 @@ func TestUpdateWith(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
g := &Group{Name: "test"}
|
||||
qb := &fakeQuerier{}
|
||||
qb := &datasource.FakeQuerier{}
|
||||
for _, r := range tc.currentRules {
|
||||
r.ID = config.HashRule(r)
|
||||
g.Rules = append(g.Rules, g.newRule(qb, r))
|
||||
@@ -166,7 +183,7 @@ func TestUpdateWith(t *testing.T) {
|
||||
if got.ID() != want.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want, got)
|
||||
}
|
||||
if err := compareRules(t, got, want); err != nil {
|
||||
if err := CompareRules(t, got, want); err != nil {
|
||||
t.Fatalf("comparison error: %s", err)
|
||||
}
|
||||
}
|
||||
@@ -175,17 +192,31 @@ func TestUpdateWith(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGroupStart(t *testing.T) {
|
||||
// TODO: make parsing from string instead of file
|
||||
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
|
||||
const (
|
||||
rules = `
|
||||
- name: groupTest
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 1ms
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
`
|
||||
)
|
||||
var groups []config.Group
|
||||
err := yaml.Unmarshal([]byte(rules), &groups)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
|
||||
fs := &fakeQuerier{}
|
||||
fn := &fakeNotifier{}
|
||||
fs := &datasource.FakeQuerier{}
|
||||
fn := ¬ifier.FakeNotifier{}
|
||||
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
|
||||
g := NewGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
|
||||
g.Concurrency = 2
|
||||
|
||||
const inst1, inst2, job = "foo", "bar", "baz"
|
||||
@@ -200,7 +231,7 @@ func TestGroupStart(t *testing.T) {
|
||||
alert1.State = notifier.StateFiring
|
||||
// add external label
|
||||
alert1.Labels["cluster"] = "east-1"
|
||||
// add rule labels - see config/testdata/rules1-good.rules
|
||||
// add rule labels
|
||||
alert1.Labels["label"] = "bar"
|
||||
alert1.Labels["host"] = inst1
|
||||
// add service labels
|
||||
@@ -215,7 +246,7 @@ func TestGroupStart(t *testing.T) {
|
||||
alert2.State = notifier.StateFiring
|
||||
// add external label
|
||||
alert2.Labels["cluster"] = "east-1"
|
||||
// add rule labels - see config/testdata/rules1-good.rules
|
||||
// add rule labels
|
||||
alert2.Labels["label"] = "bar"
|
||||
alert2.Labels["host"] = inst2
|
||||
// add service labels
|
||||
@@ -224,40 +255,40 @@ func TestGroupStart(t *testing.T) {
|
||||
alert2.ID = hash(alert2.Labels)
|
||||
|
||||
finished := make(chan struct{})
|
||||
fs.add(m1)
|
||||
fs.add(m2)
|
||||
fs.Add(m1)
|
||||
fs.Add(m2)
|
||||
go func() {
|
||||
g.start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
|
||||
g.Start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
// wait for multiple evals
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts := fn.getAlerts()
|
||||
gotAlerts := fn.GetAlerts()
|
||||
expectedAlerts := []notifier.Alert{*alert1, *alert2}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
gotAlertsNum := fn.getCounter()
|
||||
gotAlertsNum := fn.GetCounter()
|
||||
if gotAlertsNum < len(expectedAlerts)*2 {
|
||||
t.Fatalf("expected to receive at least %d alerts; got %d instead",
|
||||
len(expectedAlerts)*2, gotAlertsNum)
|
||||
}
|
||||
|
||||
// reset previous data
|
||||
fs.reset()
|
||||
fs.Reset()
|
||||
// and set only one datapoint for response
|
||||
fs.add(m1)
|
||||
fs.Add(m1)
|
||||
|
||||
// wait for multiple evals
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts = fn.getAlerts()
|
||||
gotAlerts = fn.GetAlerts()
|
||||
alert2.State = notifier.StateInactive
|
||||
expectedAlerts = []notifier.Alert{*alert1, *alert2}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
g.close()
|
||||
g.Close()
|
||||
<-finished
|
||||
}
|
||||
|
||||
@@ -292,13 +323,13 @@ func TestGetStaleSeries(t *testing.T) {
|
||||
e := &executor{
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
f := func(rule Rule, labels, expLabels [][]prompbmarshal.Label) {
|
||||
f := func(r Rule, labels, expLabels [][]prompbmarshal.Label) {
|
||||
t.Helper()
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, l := range labels {
|
||||
tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l))
|
||||
}
|
||||
staleS := e.getStaleSeries(rule, tss, ts)
|
||||
staleS := e.getStaleSeries(r, tss, ts)
|
||||
if staleS == nil && expLabels == nil {
|
||||
return
|
||||
}
|
||||
@@ -434,17 +465,17 @@ func TestPurgeStaleSeries(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestFaultyNotifier(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
|
||||
r := newTestAlertingRule("instant", 0)
|
||||
r.q = fq
|
||||
|
||||
fn := &fakeNotifier{}
|
||||
fn := ¬ifier.FakeNotifier{}
|
||||
e := &executor{
|
||||
notifiers: func() []notifier.Notifier {
|
||||
Notifiers: func() []notifier.Notifier {
|
||||
return []notifier.Notifier{
|
||||
&faultyNotifier{},
|
||||
¬ifier.FaultyNotifier{},
|
||||
fn,
|
||||
}
|
||||
},
|
||||
@@ -460,7 +491,7 @@ func TestFaultyNotifier(t *testing.T) {
|
||||
tn := time.Now()
|
||||
deadline := tn.Add(delay / 2)
|
||||
for {
|
||||
if fn.getCounter() > 0 {
|
||||
if fn.GetCounter() > 0 {
|
||||
return
|
||||
}
|
||||
if tn.After(deadline) {
|
||||
@@ -473,17 +504,17 @@ func TestFaultyNotifier(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestFaultyRW(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
|
||||
r := &RecordingRule{
|
||||
Name: "test",
|
||||
state: newRuleState(10),
|
||||
q: fq,
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
}
|
||||
|
||||
e := &executor{
|
||||
rw: &remotewrite.Client{},
|
||||
Rw: &remotewrite.Client{},
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
|
||||
@@ -494,23 +525,38 @@ func TestFaultyRW(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestCloseWithEvalInterruption(t *testing.T) {
|
||||
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
|
||||
const (
|
||||
rules = `
|
||||
- name: groupTest
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 1ms
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
`
|
||||
)
|
||||
var groups []config.Group
|
||||
err := yaml.Unmarshal([]byte(rules), &groups)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
|
||||
const delay = time.Second * 2
|
||||
fq := &fakeQuerierWithDelay{delay: delay}
|
||||
fq := &datasource.FakeQuerierWithDelay{Delay: delay}
|
||||
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], fq, evalInterval, nil)
|
||||
g := NewGroup(groups[0], fq, evalInterval, nil)
|
||||
|
||||
go g.start(context.Background(), nil, nil, nil)
|
||||
go g.Start(context.Background(), nil, nil, nil)
|
||||
|
||||
time.Sleep(evalInterval * 20)
|
||||
|
||||
go func() {
|
||||
g.close()
|
||||
g.Close()
|
||||
}()
|
||||
|
||||
deadline := time.Tick(delay / 2)
|
||||
@@ -520,3 +566,225 @@ func TestCloseWithEvalInterruption(t *testing.T) {
|
||||
case <-g.finishedCh:
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroupStartDelay(t *testing.T) {
|
||||
g := &Group{}
|
||||
// interval of 5min and key generate a static delay of 30s
|
||||
g.Interval = time.Minute * 5
|
||||
key := uint64(math.MaxUint64 / 10)
|
||||
|
||||
f := func(atS, expS string) {
|
||||
t.Helper()
|
||||
at, err := time.Parse(time.RFC3339Nano, atS)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
expTS, err := time.Parse(time.RFC3339Nano, expS)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
delay := delayBeforeStart(at, key, g.Interval, g.EvalOffset)
|
||||
gotStart := at.Add(delay)
|
||||
if expTS != gotStart {
|
||||
t.Errorf("expected to get %v; got %v instead", expTS, gotStart)
|
||||
}
|
||||
}
|
||||
|
||||
// test group without offset
|
||||
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:00:30.000+00:00")
|
||||
f("2023-01-01T00:00:00.999+00:00", "2023-01-01T00:00:30.000+00:00")
|
||||
f("2023-01-01T00:00:29.000+00:00", "2023-01-01T00:00:30.000+00:00")
|
||||
f("2023-01-01T00:00:31.000+00:00", "2023-01-01T00:05:30.000+00:00")
|
||||
|
||||
// test group with offset smaller than above fixed randSleep,
|
||||
// this way randSleep will always be enough
|
||||
offset := 20 * time.Second
|
||||
g.EvalOffset = &offset
|
||||
|
||||
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:00:30.000+00:00")
|
||||
f("2023-01-01T00:00:29.000+00:00", "2023-01-01T00:00:30.000+00:00")
|
||||
f("2023-01-01T00:00:31.000+00:00", "2023-01-01T00:05:30.000+00:00")
|
||||
|
||||
// test group with offset bigger than above fixed randSleep,
|
||||
// this way offset will be added to delay
|
||||
offset = 3 * time.Minute
|
||||
g.EvalOffset = &offset
|
||||
|
||||
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:03:30.000+00:00")
|
||||
f("2023-01-01T00:00:29.000+00:00", "2023-01-01T00:03:30.000+00:00")
|
||||
f("2023-01-01T00:01:00.000+00:00", "2023-01-01T00:08:30.000+00:00")
|
||||
f("2023-01-01T00:03:30.000+00:00", "2023-01-01T00:08:30.000+00:00")
|
||||
f("2023-01-01T00:07:30.000+00:00", "2023-01-01T00:13:30.000+00:00")
|
||||
|
||||
offset = 10 * time.Minute
|
||||
g.EvalOffset = &offset
|
||||
// interval of 1h and key generate a static delay of 6m
|
||||
g.Interval = time.Hour
|
||||
|
||||
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:16:00.000+00:00")
|
||||
f("2023-01-01T00:05:00.000+00:00", "2023-01-01T00:16:00.000+00:00")
|
||||
f("2023-01-01T00:30:00.000+00:00", "2023-01-01T01:16:00.000+00:00")
|
||||
}
|
||||
|
||||
func TestGetPrometheusReqTimestamp(t *testing.T) {
|
||||
offset := 30 * time.Minute
|
||||
evalDelay := 1 * time.Minute
|
||||
disableAlign := false
|
||||
testCases := []struct {
|
||||
name string
|
||||
g *Group
|
||||
originTS, expTS string
|
||||
}{
|
||||
{
|
||||
"with query align + default evalDelay",
|
||||
&Group{
|
||||
Interval: time.Hour,
|
||||
},
|
||||
"2023-08-28T11:11:00+00:00",
|
||||
"2023-08-28T11:00:00+00:00",
|
||||
},
|
||||
{
|
||||
"without query align + default evalDelay",
|
||||
&Group{
|
||||
Interval: time.Hour,
|
||||
evalAlignment: &disableAlign,
|
||||
},
|
||||
"2023-08-28T11:11:00+00:00",
|
||||
"2023-08-28T11:10:30+00:00",
|
||||
},
|
||||
{
|
||||
"with eval_offset, find previous offset point + default evalDelay",
|
||||
&Group{
|
||||
EvalOffset: &offset,
|
||||
Interval: time.Hour,
|
||||
},
|
||||
"2023-08-28T11:11:00+00:00",
|
||||
"2023-08-28T10:30:00+00:00",
|
||||
},
|
||||
{
|
||||
"with eval_offset + default evalDelay",
|
||||
&Group{
|
||||
EvalOffset: &offset,
|
||||
Interval: time.Hour,
|
||||
},
|
||||
"2023-08-28T11:41:00+00:00",
|
||||
"2023-08-28T11:30:00+00:00",
|
||||
},
|
||||
{
|
||||
"1h interval with eval_delay",
|
||||
&Group{
|
||||
EvalDelay: &evalDelay,
|
||||
Interval: time.Hour,
|
||||
},
|
||||
"2023-08-28T11:41:00+00:00",
|
||||
"2023-08-28T11:00:00+00:00",
|
||||
},
|
||||
{
|
||||
"1m interval with eval_delay",
|
||||
&Group{
|
||||
EvalDelay: &evalDelay,
|
||||
Interval: time.Minute,
|
||||
},
|
||||
"2023-08-28T11:41:13+00:00",
|
||||
"2023-08-28T11:40:00+00:00",
|
||||
},
|
||||
{
|
||||
"disable alignment with eval_delay",
|
||||
&Group{
|
||||
EvalDelay: &evalDelay,
|
||||
Interval: time.Hour,
|
||||
evalAlignment: &disableAlign,
|
||||
},
|
||||
"2023-08-28T11:41:00+00:00",
|
||||
"2023-08-28T11:40:00+00:00",
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
originT, _ := time.Parse(time.RFC3339, tc.originTS)
|
||||
expT, _ := time.Parse(time.RFC3339, tc.expTS)
|
||||
gotTS := tc.g.adjustReqTimestamp(originT)
|
||||
if !gotTS.Equal(expT) {
|
||||
t.Fatalf("get wrong prometheus request timestamp, expect %s, got %s", expT, gotTS)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRangeIterator(t *testing.T) {
|
||||
testCases := []struct {
|
||||
ri rangeIterator
|
||||
result [][2]time.Time
|
||||
}{
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 5 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 45 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
|
||||
step: time.Second,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range testCases {
|
||||
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
|
||||
var j int
|
||||
for tc.ri.next() {
|
||||
if len(tc.result) < j+1 {
|
||||
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
|
||||
j, tc.ri.s, tc.ri.e)
|
||||
}
|
||||
s, e := tc.ri.s, tc.ri.e
|
||||
expS, expE := tc.result[j][0], tc.result[j][1]
|
||||
if s != expS {
|
||||
t.Fatalf("expected to get start=%v; got %v", expS, s)
|
||||
}
|
||||
if e != expE {
|
||||
t.Fatalf("expected to get end=%v; got %v", expE, e)
|
||||
}
|
||||
j++
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func parseTime(t *testing.T, s string) time.Time {
|
||||
t.Helper()
|
||||
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
return tt
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -49,7 +49,8 @@ func (rr *RecordingRule) ID() uint64 {
|
||||
return rr.RuleID
|
||||
}
|
||||
|
||||
func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
|
||||
// NewRecordingRule creates a new RecordingRule
|
||||
func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
|
||||
rr := &RecordingRule{
|
||||
Type: group.Type,
|
||||
RuleID: cfg.ID,
|
||||
@@ -66,17 +67,22 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
||||
}),
|
||||
}
|
||||
|
||||
entrySize := *ruleUpdateEntriesLimit
|
||||
if cfg.UpdateEntriesLimit != nil {
|
||||
rr.state = newRuleState(*cfg.UpdateEntriesLimit)
|
||||
} else {
|
||||
rr.state = newRuleState(*ruleUpdateEntriesLimit)
|
||||
entrySize = *cfg.UpdateEntriesLimit
|
||||
}
|
||||
if entrySize < 1 {
|
||||
entrySize = 1
|
||||
}
|
||||
rr.state = &ruleState{
|
||||
entries: make([]StateEntry, entrySize),
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
|
||||
labels := fmt.Sprintf(`recording=%q, group=%q, file=%q, id="%d"`, rr.Name, group.Name, group.File, rr.ID())
|
||||
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
e := rr.state.getLast()
|
||||
if e.err == nil {
|
||||
if e.Err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
@@ -84,21 +90,21 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
||||
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
e := rr.state.getLast()
|
||||
return float64(e.samples)
|
||||
return float64(e.Samples)
|
||||
})
|
||||
return rr
|
||||
}
|
||||
|
||||
// Close unregisters rule metrics
|
||||
func (rr *RecordingRule) Close() {
|
||||
// close unregisters rule metrics
|
||||
func (rr *RecordingRule) close() {
|
||||
rr.metrics.errors.Unregister()
|
||||
rr.metrics.samples.Unregister()
|
||||
}
|
||||
|
||||
// ExecRange executes recording rule on the given time range similarly to Exec.
|
||||
// execRange executes recording rule on the given time range similarly to Exec.
|
||||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
func (rr *RecordingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
res, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -117,17 +123,17 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
|
||||
return tss, nil
|
||||
}
|
||||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
// exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
res, req, err := rr.q.Query(ctx, rr.Expr, ts)
|
||||
curState := ruleStateEntry{
|
||||
time: start,
|
||||
at: ts,
|
||||
duration: time.Since(start),
|
||||
samples: len(res.Data),
|
||||
seriesFetched: res.SeriesFetched,
|
||||
curl: requestToCurl(req),
|
||||
curState := StateEntry{
|
||||
Time: start,
|
||||
At: ts,
|
||||
Duration: time.Since(start),
|
||||
Samples: len(res.Data),
|
||||
SeriesFetched: res.SeriesFetched,
|
||||
Curl: requestToCurl(req),
|
||||
}
|
||||
|
||||
defer func() {
|
||||
@@ -135,15 +141,15 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
|
||||
}()
|
||||
|
||||
if err != nil {
|
||||
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
return nil, curState.Err
|
||||
}
|
||||
|
||||
qMetrics := res.Data
|
||||
numSeries := len(qMetrics)
|
||||
if limit > 0 && numSeries > limit {
|
||||
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
||||
return nil, curState.Err
|
||||
}
|
||||
|
||||
duplicates := make(map[string]struct{}, len(qMetrics))
|
||||
@@ -152,8 +158,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
|
||||
ts := rr.toTimeSeries(r)
|
||||
key := stringifyLabels(ts)
|
||||
if _, ok := duplicates[key]; ok {
|
||||
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
return nil, curState.Err
|
||||
}
|
||||
duplicates[key] = struct{}{}
|
||||
tss = append(tss, ts)
|
||||
@@ -193,8 +199,8 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
|
||||
return newTimeSeries(m.Values, m.Timestamps, labels)
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
// updateWith copies all significant fields.
|
||||
func (rr *RecordingRule) updateWith(r Rule) error {
|
||||
nr, ok := r.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
|
||||
@@ -204,32 +210,3 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
rr.q = nr.q
|
||||
return nil
|
||||
}
|
||||
|
||||
// ToAPI returns Rule's representation in form
|
||||
// of APIRule
|
||||
func (rr *RecordingRule) ToAPI() APIRule {
|
||||
lastState := rr.state.getLast()
|
||||
r := APIRule{
|
||||
Type: "recording",
|
||||
DatasourceType: rr.Type.String(),
|
||||
Name: rr.Name,
|
||||
Query: rr.Expr,
|
||||
Labels: rr.Labels,
|
||||
LastEvaluation: lastState.time,
|
||||
EvaluationTime: lastState.duration.Seconds(),
|
||||
Health: "ok",
|
||||
LastSamples: lastState.samples,
|
||||
LastSeriesFetched: lastState.seriesFetched,
|
||||
MaxUpdates: rr.state.size(),
|
||||
Updates: rr.state.getAll(),
|
||||
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
}
|
||||
if lastState.err != nil {
|
||||
r.LastError = lastState.err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
return r
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -56,10 +56,12 @@ func TestRecordingRule_Exec(t *testing.T) {
|
||||
Name: "job:foo",
|
||||
Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
},
|
||||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
@@ -76,11 +78,11 @@ func TestRecordingRule_Exec(t *testing.T) {
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(tc.metrics...)
|
||||
tc.rule.q = fq
|
||||
tc.rule.state = newRuleState(10)
|
||||
tss, err := tc.rule.Exec(context.TODO(), time.Now(), 0)
|
||||
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
|
||||
tss, err := tc.rule.exec(context.TODO(), time.Now(), 0)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
@@ -141,7 +143,8 @@ func TestRecordingRule_ExecRange(t *testing.T) {
|
||||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
@@ -158,10 +161,10 @@ func TestRecordingRule_ExecRange(t *testing.T) {
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(tc.metrics...)
|
||||
tc.rule.q = fq
|
||||
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
|
||||
tss, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
@@ -198,15 +201,15 @@ func TestRecordingRuleLimit(t *testing.T) {
|
||||
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
|
||||
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
|
||||
}
|
||||
rule := &RecordingRule{Name: "job:foo", state: newRuleState(10), Labels: map[string]string{
|
||||
rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{
|
||||
"source": "test_limit",
|
||||
}}
|
||||
var err error
|
||||
for _, testCase := range testCases {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(testMetrics...)
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(testMetrics...)
|
||||
rule.q = fq
|
||||
_, err = rule.Exec(context.TODO(), timestamp, testCase.limit)
|
||||
_, err = rule.exec(context.TODO(), timestamp, testCase.limit)
|
||||
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -215,18 +218,17 @@ func TestRecordingRuleLimit(t *testing.T) {
|
||||
|
||||
func TestRecordingRule_ExecNegative(t *testing.T) {
|
||||
rr := &RecordingRule{
|
||||
Name: "job:foo",
|
||||
state: newRuleState(10),
|
||||
Name: "job:foo",
|
||||
Labels: map[string]string{
|
||||
"job": "test",
|
||||
},
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
}
|
||||
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
fq.SetErr(errors.New(expErr))
|
||||
rr.q = fq
|
||||
_, err := rr.Exec(context.TODO(), time.Now(), 0)
|
||||
_, err := rr.exec(context.TODO(), time.Now(), 0)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
@@ -234,14 +236,14 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
|
||||
}
|
||||
|
||||
fq.reset()
|
||||
fq.Reset()
|
||||
|
||||
// add metrics which differs only by `job` label
|
||||
// which will be overridden by rule
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.Exec(context.TODO(), time.Now(), 0)
|
||||
_, err = rr.exec(context.TODO(), time.Now(), 0)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
174
app/vmalert/rule/rule.go
Normal file
174
app/vmalert/rule/rule.go
Normal file
@@ -0,0 +1,174 @@
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// ID returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// exec executes the rule with given context at the given timestamp and limit.
|
||||
// returns an err if number of resulting time series exceeds the limit.
|
||||
exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
|
||||
// execRange executes the rule on the given time range.
|
||||
execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
|
||||
// updateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
updateWith(Rule) error
|
||||
// close performs the shutdown procedures for rule
|
||||
// such as metrics unregister
|
||||
close()
|
||||
}
|
||||
|
||||
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
|
||||
|
||||
type ruleState struct {
|
||||
sync.RWMutex
|
||||
entries []StateEntry
|
||||
cur int
|
||||
}
|
||||
|
||||
// StateEntry stores rule's execution states
|
||||
type StateEntry struct {
|
||||
// stores last moment of time rule.Exec was called
|
||||
Time time.Time
|
||||
// stores the timesteamp with which rule.Exec was called
|
||||
At time.Time
|
||||
// stores the duration of the last rule.Exec call
|
||||
Duration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health ruleState
|
||||
Err error
|
||||
// stores the number of samples returned during
|
||||
// the last evaluation
|
||||
Samples int
|
||||
// stores the number of time series fetched during
|
||||
// the last evaluation.
|
||||
// Is supported by VictoriaMetrics only, starting from v1.90.0
|
||||
// If seriesFetched == nil, then this attribute was missing in
|
||||
// datasource response (unsupported).
|
||||
SeriesFetched *int
|
||||
// stores the curl command reflecting the HTTP request used during rule.Exec
|
||||
Curl string
|
||||
}
|
||||
|
||||
// GetLastEntry returns latest stateEntry of rule
|
||||
func GetLastEntry(r Rule) StateEntry {
|
||||
if rule, ok := r.(*AlertingRule); ok {
|
||||
return rule.state.getLast()
|
||||
}
|
||||
if rule, ok := r.(*RecordingRule); ok {
|
||||
return rule.state.getLast()
|
||||
}
|
||||
return StateEntry{}
|
||||
}
|
||||
|
||||
// GetRuleStateSize returns size of rule stateEntry
|
||||
func GetRuleStateSize(r Rule) int {
|
||||
if rule, ok := r.(*AlertingRule); ok {
|
||||
return rule.state.size()
|
||||
}
|
||||
if rule, ok := r.(*RecordingRule); ok {
|
||||
return rule.state.size()
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// GetAllRuleState returns rule entire stateEntries
|
||||
func GetAllRuleState(r Rule) []StateEntry {
|
||||
if rule, ok := r.(*AlertingRule); ok {
|
||||
return rule.state.getAll()
|
||||
}
|
||||
if rule, ok := r.(*RecordingRule); ok {
|
||||
return rule.state.getAll()
|
||||
}
|
||||
return []StateEntry{}
|
||||
}
|
||||
|
||||
func (s *ruleState) size() int {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
return len(s.entries)
|
||||
}
|
||||
|
||||
func (s *ruleState) getLast() StateEntry {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
if len(s.entries) == 0 {
|
||||
return StateEntry{}
|
||||
}
|
||||
return s.entries[s.cur]
|
||||
}
|
||||
|
||||
func (s *ruleState) getAll() []StateEntry {
|
||||
entries := make([]StateEntry, 0)
|
||||
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
|
||||
cur := s.cur
|
||||
for {
|
||||
e := s.entries[cur]
|
||||
if !e.Time.IsZero() || !e.At.IsZero() {
|
||||
entries = append(entries, e)
|
||||
}
|
||||
cur--
|
||||
if cur < 0 {
|
||||
cur = cap(s.entries) - 1
|
||||
}
|
||||
if cur == s.cur {
|
||||
return entries
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ruleState) add(e StateEntry) {
|
||||
s.Lock()
|
||||
defer s.Unlock()
|
||||
|
||||
s.cur++
|
||||
if s.cur > cap(s.entries)-1 {
|
||||
s.cur = 0
|
||||
}
|
||||
s.entries[s.cur] = e
|
||||
}
|
||||
|
||||
func replayRule(r Rule, start, end time.Time, rw remotewrite.RWClient, replayRuleRetryAttempts int) (int, error) {
|
||||
var err error
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for i := 0; i < replayRuleRetryAttempts; i++ {
|
||||
tss, err = r.execRange(context.Background(), start, end)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, r, err)
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
if err != nil { // means all attempts failed
|
||||
return 0, err
|
||||
}
|
||||
if len(tss) < 1 {
|
||||
return 0, nil
|
||||
}
|
||||
var n int
|
||||
for _, ts := range tss {
|
||||
if err := rw.Push(ts); err != nil {
|
||||
return n, fmt.Errorf("remote write failure: %w", err)
|
||||
}
|
||||
n += len(ts.Samples)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user