mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-07 19:06:17 +03:00
Compare commits
599 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
61cc13c16f | ||
|
|
b060d8bf53 | ||
|
|
d472b03e34 | ||
|
|
682662b2ae | ||
|
|
2b0e3efa5c | ||
|
|
8d99e94a52 | ||
|
|
8d0ec47be9 | ||
|
|
2df66dad7b | ||
|
|
5bc240bffe | ||
|
|
244d0fe5d7 | ||
|
|
a925d5a3e1 | ||
|
|
f9de546139 | ||
|
|
4f80b2f230 | ||
|
|
f3a5465ece | ||
|
|
bfba4c28a4 | ||
|
|
a684408e27 | ||
|
|
8ca2799478 | ||
|
|
f539772ca6 | ||
|
|
8c764e88f0 | ||
|
|
2be340a4c9 | ||
|
|
c5f0b454f0 | ||
|
|
e9e35a7d6a | ||
|
|
92b8380fdf | ||
|
|
1152de4321 | ||
|
|
7e77980608 | ||
|
|
6e0553c92e | ||
|
|
ed944313b0 | ||
|
|
766edbc421 | ||
|
|
c55a64ba08 | ||
|
|
e843bd7bd7 | ||
|
|
8b262d4ba7 | ||
|
|
2f54559c89 | ||
|
|
a7694092b8 | ||
|
|
8aa9bba9bd | ||
|
|
7c6d3981bf | ||
|
|
78c9174682 | ||
|
|
f71e4d1853 | ||
|
|
f3acf065c9 | ||
|
|
0020b9f904 | ||
|
|
75f12bfe78 | ||
|
|
4cf47163c1 | ||
|
|
4f0003f182 | ||
|
|
d0c830039d | ||
|
|
8f973e34fb | ||
|
|
43103be011 | ||
|
|
54b9e1d3cb | ||
|
|
bd6b8f7e31 | ||
|
|
888d62e40c | ||
|
|
6bef227388 | ||
|
|
9a83e9018d | ||
|
|
f6bb130898 | ||
|
|
7088f17494 | ||
|
|
6e406083f2 | ||
|
|
a93da746c0 | ||
|
|
1e52f11e87 | ||
|
|
5beb099ff9 | ||
|
|
158c50c0ee | ||
|
|
02ed4b8b46 | ||
|
|
c25b839078 | ||
|
|
ae485c2bfd | ||
|
|
00bbe1608b | ||
|
|
a38a6fe8ad | ||
|
|
c93cee8de8 | ||
|
|
fc12484734 | ||
|
|
9ce211a514 | ||
|
|
5506cff76e | ||
|
|
1b0501a09e | ||
|
|
3af2162085 | ||
|
|
008033a374 | ||
|
|
cb5453953f | ||
|
|
a69045e440 | ||
|
|
c85a5b7fcb | ||
|
|
434e33da9b | ||
|
|
b50e2ec88c | ||
|
|
4345c07777 | ||
|
|
b2fca1ab22 | ||
|
|
906fca9e88 | ||
|
|
3c3694f72a | ||
|
|
d25a161579 | ||
|
|
ca42410afd | ||
|
|
5d64ed73c5 | ||
|
|
cdfae0117a | ||
|
|
70e2852376 | ||
|
|
94f3e40ab3 | ||
|
|
c22114c6f0 | ||
|
|
e8a5bb92b7 | ||
|
|
ac54f34f9e | ||
|
|
755040a171 | ||
|
|
59e7755df9 | ||
|
|
e307bbb29a | ||
|
|
58a2989fe7 | ||
|
|
3bc83d3b17 | ||
|
|
da8c901fab | ||
|
|
a3262daac0 | ||
|
|
83a4db813e | ||
|
|
4cfedc5931 | ||
|
|
570f36b344 | ||
|
|
eb1af09a04 | ||
|
|
cbd9159a22 | ||
|
|
fb72a2133f | ||
|
|
d8ab409418 | ||
|
|
6c434b260e | ||
|
|
dcbc22552f | ||
|
|
aa9b56a046 | ||
|
|
12a83d25bf | ||
|
|
9eb3fc346f | ||
|
|
6d17a4e12d | ||
|
|
0a796f7c3a | ||
|
|
fb4f758715 | ||
|
|
6a8369f0fc | ||
|
|
84fb59b0ba | ||
|
|
e028ad241a | ||
|
|
af90c3c43b | ||
|
|
36d55bff66 | ||
|
|
7b283ee91c | ||
|
|
a90012ef26 | ||
|
|
05bc9667c1 | ||
|
|
729c4eeb9c | ||
|
|
b8526e88d3 | ||
|
|
06b8e7d148 | ||
|
|
48210130ac | ||
|
|
3e5b6bae66 | ||
|
|
3c4366806c | ||
|
|
8a519f1518 | ||
|
|
6d5a8c28cd | ||
|
|
2ef2e3d6f8 | ||
|
|
ed83558646 | ||
|
|
c4f3fbfa5d | ||
|
|
d979e14da2 | ||
|
|
7adfe878e1 | ||
|
|
69b1482bdb | ||
|
|
044ab46824 | ||
|
|
67b17cdd68 | ||
|
|
b4008c1e65 | ||
|
|
b83a51366e | ||
|
|
10d105ee25 | ||
|
|
d0dca62026 | ||
|
|
46d2c7d640 | ||
|
|
2422b5091f | ||
|
|
329b6cd146 | ||
|
|
db1c548cb4 | ||
|
|
f93a31b490 | ||
|
|
ab15bf8c90 | ||
|
|
2a259ef5e7 | ||
|
|
408fc90b40 | ||
|
|
5e9f3777bf | ||
|
|
c16edf8287 | ||
|
|
6b29b955c0 | ||
|
|
28c44ef065 | ||
|
|
668165f53d | ||
|
|
c0feafefc8 | ||
|
|
8a7e6ad5cc | ||
|
|
645e18dd88 | ||
|
|
96b691a0ab | ||
|
|
661d2668f8 | ||
|
|
78f83dc5ad | ||
|
|
2fe045e2a4 | ||
|
|
5ac25d2585 | ||
|
|
6f19bb23a1 | ||
|
|
f963f04d3d | ||
|
|
2d8bd41f8a | ||
|
|
d2d746c4fc | ||
|
|
ddc8022702 | ||
|
|
de8e4b6223 | ||
|
|
b22e380a34 | ||
|
|
ffa4cd6fa5 | ||
|
|
9315e3ded3 | ||
|
|
cecbdee00d | ||
|
|
634eeac804 | ||
|
|
a52a20659a | ||
|
|
d088923aef | ||
|
|
793fe39921 | ||
|
|
60341722d5 | ||
|
|
f0c21b6300 | ||
|
|
eda6ee40b6 | ||
|
|
2bbb1cc7c1 | ||
|
|
7ecaa2fe2c | ||
|
|
7f531e3a60 | ||
|
|
d210958fd0 | ||
|
|
2233d6ed8a | ||
|
|
fd264477bf | ||
|
|
c78d7dde92 | ||
|
|
08234aa7a0 | ||
|
|
a47d4927d2 | ||
|
|
b1e8d92577 | ||
|
|
8e7d1f8824 | ||
|
|
39ef1e7a51 | ||
|
|
4b01c9fb2e | ||
|
|
a4ff4b8e65 | ||
|
|
a46551245c | ||
|
|
eafed5335d | ||
|
|
93d81b486d | ||
|
|
f54133b200 | ||
|
|
24858820b5 | ||
|
|
04eb37a590 | ||
|
|
ec79abc382 | ||
|
|
84cc0513e1 | ||
|
|
78dddfb98f | ||
|
|
5b8176c68e | ||
|
|
e05dd475f0 | ||
|
|
8e2985b53d | ||
|
|
d173a9348c | ||
|
|
14ec2b9f26 | ||
|
|
c54bb73867 | ||
|
|
d626c5c2a9 | ||
|
|
060664e221 | ||
|
|
49ecbc765d | ||
|
|
362a49bdd1 | ||
|
|
4c7bb75fa2 | ||
|
|
0d3e78b9ee | ||
|
|
480087944a | ||
|
|
49c39ab388 | ||
|
|
6cc6a032cd | ||
|
|
d06aae9454 | ||
|
|
e394ff6466 | ||
|
|
ad73f226ff | ||
|
|
7e526effaa | ||
|
|
3cd8606abd | ||
|
|
98e425ee09 | ||
|
|
dcf8803bbd | ||
|
|
22585531ad | ||
|
|
0842bb9294 | ||
|
|
009e136d88 | ||
|
|
f7e73fbe8b | ||
|
|
eb8093ca6b | ||
|
|
e8a6c6927d | ||
|
|
f4719889da | ||
|
|
b30925738b | ||
|
|
0cf1fe19e6 | ||
|
|
bfb61de606 | ||
|
|
3166994244 | ||
|
|
66aba00549 | ||
|
|
3339ea41e7 | ||
|
|
6c944b86d8 | ||
|
|
ec6a284978 | ||
|
|
2eed410466 | ||
|
|
ede2ba5a45 | ||
|
|
55ed77760b | ||
|
|
2ec6f81b10 | ||
|
|
b96b19f040 | ||
|
|
b38edec7ee | ||
|
|
733706e6c6 | ||
|
|
a15145e597 | ||
|
|
24cbf8b66c | ||
|
|
10a47af631 | ||
|
|
e6ec442e96 | ||
|
|
70165a6758 | ||
|
|
a422165dc6 | ||
|
|
fc3519fa26 | ||
|
|
1f75ae6006 | ||
|
|
b4f5be8bd8 | ||
|
|
e6fda03e8f | ||
|
|
f6a641de62 | ||
|
|
c0ec541559 | ||
|
|
d7be2753c0 | ||
|
|
b50024812e | ||
|
|
a22a17dc66 | ||
|
|
beddc0c0d5 | ||
|
|
832651c6c2 | ||
|
|
8a0678678b | ||
|
|
cfd6aa28e1 | ||
|
|
afec68ad13 | ||
|
|
f2d5c4e2d0 | ||
|
|
33f7bacb01 | ||
|
|
3fb3ce2a6d | ||
|
|
8b65920a8b | ||
|
|
5b986c95dd | ||
|
|
c687536956 | ||
|
|
229d9d6dd7 | ||
|
|
5c448126dc | ||
|
|
3b0966c00c | ||
|
|
4247168a2d | ||
|
|
6ff19096be | ||
|
|
cbd0569ce2 | ||
|
|
120927ab65 | ||
|
|
75c2c813fc | ||
|
|
f8d50e9641 | ||
|
|
7aea5f58c4 | ||
|
|
12d733dd5d | ||
|
|
237e9f9fd7 | ||
|
|
d6439490f5 | ||
|
|
79ec35cef9 | ||
|
|
d9e3872b1c | ||
|
|
4bea1afc6d | ||
|
|
904bbffc7f | ||
|
|
9cdd4696fe | ||
|
|
cdd1b06473 | ||
|
|
76027093ca | ||
|
|
ce9e163e94 | ||
|
|
988c3b386f | ||
|
|
477369b62f | ||
|
|
9a930720b3 | ||
|
|
0969b446b3 | ||
|
|
fb097ff774 | ||
|
|
4e2ea844ca | ||
|
|
58d1e6eeea | ||
|
|
508f500477 | ||
|
|
86bdb5ea95 | ||
|
|
0f988e5a31 | ||
|
|
afca7b430c | ||
|
|
4394dc6cbb | ||
|
|
f3a048288e | ||
|
|
6fa5981e68 | ||
|
|
2ab1266593 | ||
|
|
25b8d71df5 | ||
|
|
4e5a88114a | ||
|
|
15609ee447 | ||
|
|
87179c6839 | ||
|
|
56b6b893ce | ||
|
|
2ec7d8b384 | ||
|
|
f89c1f7f49 | ||
|
|
60947fb2d5 | ||
|
|
87018650dd | ||
|
|
d6f44977a7 | ||
|
|
00deb69e28 | ||
|
|
bf2439f962 | ||
|
|
20b77abc17 | ||
|
|
e9898e1772 | ||
|
|
52b4b1605e | ||
|
|
fb37b853e9 | ||
|
|
908e35affd | ||
|
|
eddba29664 | ||
|
|
ae37cfd528 | ||
|
|
f7d7236b44 | ||
|
|
478c56f281 | ||
|
|
758de04440 | ||
|
|
4c4c383f30 | ||
|
|
bbebdf9ba1 | ||
|
|
1ab27582a3 | ||
|
|
1fbc04facd | ||
|
|
9de0fa3649 | ||
|
|
7c4e460513 | ||
|
|
6bc52fe41a | ||
|
|
5720b283fa | ||
|
|
9f9c99c8c0 | ||
|
|
187e3ec909 | ||
|
|
6a81a89b3d | ||
|
|
b955fe0038 | ||
|
|
cc94afeacc | ||
|
|
f80156d9df | ||
|
|
90bba22c25 | ||
|
|
b3d88610fd | ||
|
|
650c143f6d | ||
|
|
bfecd0fd55 | ||
|
|
cf726448f2 | ||
|
|
8d244c5f7f | ||
|
|
574b80f274 | ||
|
|
403a7b4a1f | ||
|
|
294c8b747b | ||
|
|
9b1999a5ff | ||
|
|
e5db518c0f | ||
|
|
e9ee2122df | ||
|
|
251747e253 | ||
|
|
663a91bb82 | ||
|
|
77be3e3a82 | ||
|
|
0b7e3510c8 | ||
|
|
3ed113b390 | ||
|
|
3d20fd20d0 | ||
|
|
7644efae01 | ||
|
|
a9d76b06a7 | ||
|
|
cd9786c2a7 | ||
|
|
162681e60d | ||
|
|
4ed8de62ac | ||
|
|
06e962f141 | ||
|
|
fac1e3810a | ||
|
|
edd1590ac7 | ||
|
|
3f0bcbe067 | ||
|
|
3eadee6cb7 | ||
|
|
f1a22b097a | ||
|
|
544821b719 | ||
|
|
d3fa0ccabd | ||
|
|
cb12a8f0a8 | ||
|
|
5a0938d807 | ||
|
|
1177dca3da | ||
|
|
75f309fbbf | ||
|
|
c6a8ebb11f | ||
|
|
df32d2836c | ||
|
|
59f9960992 | ||
|
|
3ec6639bbb | ||
|
|
7d23598b33 | ||
|
|
78d35d4f46 | ||
|
|
5f593b0ed3 | ||
|
|
4a0d06d1db | ||
|
|
b59164cf33 | ||
|
|
b46194472f | ||
|
|
a51d0ec6ec | ||
|
|
95dbebf512 | ||
|
|
f010d773d6 | ||
|
|
2c25b0322b | ||
|
|
a5c5b54c22 | ||
|
|
6742839fd6 | ||
|
|
9ff3ecb991 | ||
|
|
9a97941e2a | ||
|
|
fc2240fb22 | ||
|
|
a4c6a3b3e1 | ||
|
|
500e625e8c | ||
|
|
5153410ced | ||
|
|
4c56b1a6dd | ||
|
|
7a0b964e8d | ||
|
|
6f3080f9fb | ||
|
|
e1d1708fa2 | ||
|
|
43f9842b6f | ||
|
|
2256b79a89 | ||
|
|
b8d9d6c326 | ||
|
|
22949911e9 | ||
|
|
3055ab0115 | ||
|
|
25e19c75c7 | ||
|
|
3c1b39c978 | ||
|
|
0db901617d | ||
|
|
569e58dcdf | ||
|
|
b1d0028e79 | ||
|
|
b88feb631e | ||
|
|
df148f48b7 | ||
|
|
7f9c68cdcb | ||
|
|
d1dcbfd0f9 | ||
|
|
c79e4a2f90 | ||
|
|
5b08e6fb16 | ||
|
|
12e4785fe8 | ||
|
|
3967bd705a | ||
|
|
fdb8995642 | ||
|
|
9d237408c6 | ||
|
|
759c938870 | ||
|
|
dc9eafcd02 | ||
|
|
e1f699bb6c | ||
|
|
db963205cc | ||
|
|
48275d8c12 | ||
|
|
f888d194da | ||
|
|
33622c409c | ||
|
|
3be0e6b087 | ||
|
|
e7fdea5953 | ||
|
|
2af39a96c5 | ||
|
|
2d3082bb55 | ||
|
|
0fe8f11090 | ||
|
|
d58d5562f1 | ||
|
|
7962cf1af8 | ||
|
|
65c60cf413 | ||
|
|
75991277fa | ||
|
|
1db1a29ffa | ||
|
|
947b37ba8e | ||
|
|
e6fd1f7875 | ||
|
|
5c7c0b2bda | ||
|
|
5f4a9f782f | ||
|
|
602a3d99ae | ||
|
|
cfdb6762e6 | ||
|
|
b1e49bab52 | ||
|
|
b75c2ce659 | ||
|
|
2601cc0fb0 | ||
|
|
0c403bfd29 | ||
|
|
78188decf9 | ||
|
|
c54cb3e63c | ||
|
|
8fc8ef1aba | ||
|
|
1b7dc1e5a5 | ||
|
|
f39c84b21f | ||
|
|
9761ffd161 | ||
|
|
aa81039b42 | ||
|
|
50f790b5d7 | ||
|
|
136fc6217c | ||
|
|
5ec9e49103 | ||
|
|
88f6286df7 | ||
|
|
f6529f932a | ||
|
|
2065d11300 | ||
|
|
35fb9bdee1 | ||
|
|
a647144616 | ||
|
|
c3c3e51f17 | ||
|
|
0b2a66db30 | ||
|
|
6e855d4b82 | ||
|
|
d4aadba9fa | ||
|
|
edc0a94a3c | ||
|
|
3a3d2165f9 | ||
|
|
9c566f7db9 | ||
|
|
29f9ef9b7f | ||
|
|
331a6a2015 | ||
|
|
b521d1d4f2 | ||
|
|
3cfb3a3683 | ||
|
|
8e2afdf568 | ||
|
|
e17eb35147 | ||
|
|
65a61ff118 | ||
|
|
71b72304ae | ||
|
|
44a6cc5eca | ||
|
|
910092ca4d | ||
|
|
cef010d5f7 | ||
|
|
648d11b8e0 | ||
|
|
fb83e97170 | ||
|
|
b0c956a178 | ||
|
|
529d7be26b | ||
|
|
726f6ad804 | ||
|
|
6cee5338b2 | ||
|
|
e061a4fa19 | ||
|
|
4fb049bcba | ||
|
|
17d4a6e900 | ||
|
|
904dababcc | ||
|
|
45dabfac1b | ||
|
|
b1713e3fcd | ||
|
|
9666834045 | ||
|
|
20ac89c4e0 | ||
|
|
bd0c6a095e | ||
|
|
7bc728bf53 | ||
|
|
828669e4e1 | ||
|
|
ccfb0ae2d3 | ||
|
|
576a80b3d9 | ||
|
|
f104f3eb2a | ||
|
|
6378205415 | ||
|
|
18cfc4be7b | ||
|
|
0ce557951f | ||
|
|
35bb44d317 | ||
|
|
aba955fa16 | ||
|
|
fd86a7dc1d | ||
|
|
264d3432ac | ||
|
|
e36fbfae5b | ||
|
|
f0a4157f89 | ||
|
|
645cf6746c | ||
|
|
031d256810 | ||
|
|
dd7e82c34f | ||
|
|
37323c57c9 | ||
|
|
acf2a82d3c | ||
|
|
85a95bf60c | ||
|
|
7c37e9aea9 | ||
|
|
b1aa8c3d8f | ||
|
|
9c77e34ef9 | ||
|
|
923cdb0552 | ||
|
|
82aab87446 | ||
|
|
fb82c4b9fa | ||
|
|
eb103e1527 | ||
|
|
43504ebd14 | ||
|
|
fb935e6e2c | ||
|
|
3ccf7ea20c | ||
|
|
2dae0a2c47 | ||
|
|
b457739f87 | ||
|
|
6d91842c83 | ||
|
|
c14dafce43 | ||
|
|
7f6f350ee1 | ||
|
|
b88806ecbf | ||
|
|
83edbb7cab | ||
|
|
bf15d6a6a2 | ||
|
|
d409898515 | ||
|
|
7a16e8e3a2 | ||
|
|
2096c6e464 | ||
|
|
7c002023d7 | ||
|
|
43552fa8d3 | ||
|
|
def014eb75 | ||
|
|
ca4d5ce037 | ||
|
|
895d5d1355 | ||
|
|
a6a71ef861 | ||
|
|
fa448806a5 | ||
|
|
f3d712724c | ||
|
|
f669531506 | ||
|
|
133b288681 | ||
|
|
dc0cb54d41 | ||
|
|
c0ac740f93 | ||
|
|
bebcb8130c | ||
|
|
971e3d83f7 | ||
|
|
238fe7d4e8 | ||
|
|
e772d1c920 | ||
|
|
bc5c4add89 | ||
|
|
e02265cfa7 | ||
|
|
314f28fb38 | ||
|
|
45ede6ba98 | ||
|
|
09dc49e942 | ||
|
|
96dbe9bcbd | ||
|
|
d212f35ae8 | ||
|
|
48fb0d1c4b | ||
|
|
2728b25783 | ||
|
|
787242d7b0 | ||
|
|
36fd007247 | ||
|
|
ad34f42467 | ||
|
|
3fd8653b40 | ||
|
|
4b18a4f026 | ||
|
|
e44532b760 | ||
|
|
fe8b12fbad | ||
|
|
433fff0006 | ||
|
|
534944b671 | ||
|
|
f6878eac36 | ||
|
|
364fdf4a56 | ||
|
|
14a399dd06 | ||
|
|
345980f78f | ||
|
|
18fe0ff14b | ||
|
|
ab4f090c63 | ||
|
|
1187ee5e16 | ||
|
|
47ac2051bb | ||
|
|
17b87725ed | ||
|
|
c9a25e931b | ||
|
|
536d59914a | ||
|
|
68b8c48c86 | ||
|
|
444fca89f8 | ||
|
|
a14053ffa0 | ||
|
|
423cd981fb | ||
|
|
d962cdbc13 | ||
|
|
9a48c1b53d | ||
|
|
201b685b13 | ||
|
|
90d6e94e5b | ||
|
|
f391e5a3a0 | ||
|
|
3a68b94487 | ||
|
|
467cb9e3d1 | ||
|
|
621bf03745 | ||
|
|
4c3ef78c05 | ||
|
|
e09a245b2b | ||
|
|
e154f4a644 | ||
|
|
7906316741 | ||
|
|
f4f8f21875 |
31
.github/ISSUE_TEMPLATE/bug_report.md
vendored
31
.github/ISSUE_TEMPLATE/bug_report.md
vendored
@@ -4,14 +4,14 @@ about: Create a report to help us improve
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Describe the bug**
|
||||
A clear and concise description of what the bug is.
|
||||
It would be great [upgrading](https://victoriametrics.github.io/#how-to-upgrade) to [the latest avaialble release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
It would be great [upgrading](https://docs.victoriametrics.com/#how-to-upgrade)
|
||||
to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
and verifying whether the bug is reproducible there.
|
||||
It is also recommended reading [troubleshooting docs](https://victoriametrics.github.io/#troubleshooting).
|
||||
It is also recommended to read the [troubleshooting docs](https://docs.victoriametrics.com/#troubleshooting).
|
||||
|
||||
**To Reproduce**
|
||||
Steps to reproduce the behavior.
|
||||
@@ -19,9 +19,22 @@ Steps to reproduce the behavior.
|
||||
**Expected behavior**
|
||||
A clear and concise description of what you expected to happen.
|
||||
|
||||
**Logs**
|
||||
Check if any warnings or errors were logged by VictoriaMetrics components
|
||||
or components in communication with VictoriaMetrics (e.g. Prometheus, Grafana).
|
||||
|
||||
**Screenshots**
|
||||
If applicable, add screenshots to help explain your problem.
|
||||
|
||||
For VictoriaMetrics health-state issues please provide full-length screenshots
|
||||
of Grafana dashboards if possible:
|
||||
* [Grafana dashboard for single-node VictoriaMetrics](https://grafana.com/dashboards/10229)
|
||||
* [Grafana dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11176)
|
||||
|
||||
See how to setup monitoring here:
|
||||
* [monitoring for single-node VictoriaMetrics](https://docs.victoriametrics.com/#monitoring)
|
||||
* [monitoring for VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#monitoring)
|
||||
|
||||
**Version**
|
||||
The line returned when passing `--version` command line flag to binary. For example:
|
||||
```
|
||||
@@ -30,15 +43,5 @@ victoria-metrics-20190730-121249-heads-single-node-0-g671d9e55
|
||||
```
|
||||
|
||||
**Used command-line flags**
|
||||
Command-line flags are listed as `flag{name="httpListenAddr", value=":443"} 1` lines at `/metrics` page.
|
||||
See the following docs for details:
|
||||
Please provide applied command-line flags used for running VictoriaMetrics and its components.
|
||||
|
||||
* [monitoring for single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#monitoring)
|
||||
* [monitoring for VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#monitoring)
|
||||
|
||||
**Additional context**
|
||||
Add any other context about the problem here such as error logs from VictoriaMetrics and Prometheus,
|
||||
`/metrics` output, screenshots from the official Grafana dashboards for VictoriaMetrics:
|
||||
|
||||
* [Grafana dashboard for single-node VictoriaMetrics](https://grafana.com/dashboards/10229)
|
||||
* [Grafana dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11176)
|
||||
|
||||
6
.github/dependabot.yml
vendored
Normal file
6
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
23
.github/workflows/check-licenses.yml
vendored
Normal file
23
.github/workflows/check-licenses.yml
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
name: license-check
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'vendor'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'vendor'
|
||||
jobs:
|
||||
build:
|
||||
name: Build
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@main
|
||||
with:
|
||||
go-version: 1.16
|
||||
id: go
|
||||
- name: Code checkout
|
||||
uses: actions/checkout@master
|
||||
- name: Check License
|
||||
run: |
|
||||
make check-licenses
|
||||
30
.github/workflows/github-pages.yml
vendored
30
.github/workflows/github-pages.yml
vendored
@@ -1,30 +0,0 @@
|
||||
name: github-pages
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'docs/*'
|
||||
- 'README.md'
|
||||
branches:
|
||||
- master
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@master
|
||||
- name: publish
|
||||
shell: bash
|
||||
env:
|
||||
TOKEN: ${{secrets.CI_TOKEN}}
|
||||
run: |
|
||||
git clone https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.github.io.git gpages
|
||||
cp docs/* gpages
|
||||
cp README.md gpages
|
||||
cd gpages
|
||||
git config --local user.email "info@victoriametrics.com"
|
||||
git config --local user.name "Vika"
|
||||
git add .
|
||||
git commit -m "update github pages"
|
||||
remote_repo="https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.github.io.git"
|
||||
git push "${remote_repo}"
|
||||
cd ..
|
||||
rm -rf gpages
|
||||
3
.github/workflows/main.yml
vendored
3
.github/workflows/main.yml
vendored
@@ -60,8 +60,7 @@ jobs:
|
||||
GOOS=darwin go build -mod=vendor ./app/vmctl
|
||||
CGO_ENABLED=0 GOOS=windows go build -mod=vendor ./app/vmagent
|
||||
- name: Publish coverage
|
||||
uses: codecov/codecov-action@v1.0.6
|
||||
uses: codecov/codecov-action@v1.5.2
|
||||
with:
|
||||
token: ${{secrets.CODECOV_TOKEN}}
|
||||
file: ./coverage.txt
|
||||
|
||||
|
||||
2
.github/workflows/wiki.yml
vendored
2
.github/workflows/wiki.yml
vendored
@@ -16,7 +16,7 @@ jobs:
|
||||
TOKEN: ${{secrets.CI_TOKEN}}
|
||||
run: |
|
||||
git clone https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.wiki.git wiki
|
||||
cp docs/* wiki
|
||||
cp -r docs/* wiki
|
||||
cd wiki
|
||||
git config --local user.email "info@victoriametrics.com"
|
||||
git config --local user.name "Vika"
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -15,3 +15,4 @@
|
||||
/package/temp-rpm-*
|
||||
/package/*.deb
|
||||
/package/*.rpm
|
||||
.DS_store
|
||||
|
||||
5
.wwhrd.yml
Normal file
5
.wwhrd.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
allowlist:
|
||||
- Apache-2.0
|
||||
- MIT
|
||||
- BSD-3-Clause
|
||||
- BSD-2-Clause
|
||||
@@ -7,7 +7,7 @@ contributors and maintainers pledge to making participation in our project and
|
||||
our community a harassment-free experience for everyone, regardless of age, body
|
||||
size, disability, ethnicity, sex characteristics, gender identity and expression,
|
||||
level of experience, education, socio-economic status, nationality, personal
|
||||
appearance, race, religion, or sexual identity and orientation.
|
||||
appearance, race, religion or sexual identity and orientation.
|
||||
|
||||
## Our Standards
|
||||
|
||||
@@ -24,9 +24,9 @@ Examples of unacceptable behavior by participants include:
|
||||
|
||||
* The use of sexualized language or imagery and unwelcome sexual attention or
|
||||
advances
|
||||
* Trolling, insulting/derogatory comments, and personal or political attacks
|
||||
* Trolling, insulting/derogatory comments and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information, such as a physical or electronic
|
||||
* Publishing others' private information, such as physical or electronic
|
||||
address, without explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
@@ -38,26 +38,26 @@ behavior and are expected to take appropriate and fair corrective action in
|
||||
response to any instances of unacceptable behavior.
|
||||
|
||||
Project maintainers have the right and responsibility to remove, edit, or
|
||||
reject comments, commits, code, wiki edits, issues, and other contributions
|
||||
that are not aligned to this Code of Conduct, or to ban temporarily or
|
||||
reject comments, commits, code, wiki edits, issues and other contributions
|
||||
that are not aligned to this Code of Conduct or to ban temporarily or
|
||||
permanently any contributor for other behaviors that they deem inappropriate,
|
||||
threatening, offensive, or harmful.
|
||||
threatening, offensive or harmful.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies both within project spaces and in public spaces
|
||||
when an individual is representing the project or its community. Examples of
|
||||
representing a project or community include using an official project e-mail
|
||||
address, posting via an official social media account, or acting as an appointed
|
||||
address, posting via an official social media account or acting as an appointed
|
||||
representative at an online or offline event. Representation of a project may be
|
||||
further defined and clarified by project maintainers.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
Instances of abusive, harassing or otherwise unacceptable behavior may be
|
||||
reported by contacting the project team at info@victoriametrics.com. All
|
||||
complaints will be reviewed and investigated and will result in a response that
|
||||
is deemed necessary and appropriate to the circumstances. The project team is
|
||||
is deemed necessary and appropriate for the circumstances. The project team is
|
||||
obligated to maintain confidentiality with regard to the reporter of an incident.
|
||||
Further details of specific enforcement policies may be posted separately.
|
||||
|
||||
|
||||
102
Makefile
102
Makefile
@@ -1,5 +1,6 @@
|
||||
PKG_PREFIX := github.com/VictoriaMetrics/VictoriaMetrics
|
||||
|
||||
DATEINFO_TAG ?= $(shell date -u +'%Y%m%d-%H%M%S')
|
||||
BUILDINFO_TAG ?= $(shell echo $$(git describe --long --all | tr '/' '-')$$( \
|
||||
git diff-index --quiet HEAD -- || echo '-dirty-'$$(git diff-index -u HEAD | openssl sha1 | cut -c 10-17)))
|
||||
|
||||
@@ -8,7 +9,7 @@ ifeq ($(PKG_TAG),)
|
||||
PKG_TAG := $(BUILDINFO_TAG)
|
||||
endif
|
||||
|
||||
GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(shell date -u +'%Y%m%d-%H%M%S')-$(BUILDINFO_TAG)'
|
||||
GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(DATEINFO_TAG)-$(BUILDINFO_TAG)'
|
||||
|
||||
.PHONY: $(MAKECMDGOALS)
|
||||
|
||||
@@ -53,6 +54,14 @@ vmutils: \
|
||||
vmrestore \
|
||||
vmctl
|
||||
|
||||
vmutils-pure: \
|
||||
vmagent-pure \
|
||||
vmalert-pure \
|
||||
vmauth-pure \
|
||||
vmbackup-pure \
|
||||
vmrestore-pure \
|
||||
vmctl-pure
|
||||
|
||||
vmutils-arm64: \
|
||||
vmagent-arm64 \
|
||||
vmalert-arm64 \
|
||||
@@ -61,6 +70,20 @@ vmutils-arm64: \
|
||||
vmrestore-arm64 \
|
||||
vmctl-arm64
|
||||
|
||||
vmutils-arm: \
|
||||
vmagent-arm \
|
||||
vmalert-arm \
|
||||
vmauth-arm \
|
||||
vmbackup-arm \
|
||||
vmrestore-arm \
|
||||
vmctl-arm
|
||||
|
||||
vmutils-windows-amd64: \
|
||||
vmagent-windows-amd64 \
|
||||
vmalert-windows-amd64 \
|
||||
vmauth-windows-amd64 \
|
||||
vmctl-windows-amd64
|
||||
|
||||
release-snap:
|
||||
snapcraft
|
||||
snapcraft upload "victoriametrics_$(PKG_TAG)_multi.snap" --release beta,edge,candidate
|
||||
@@ -71,11 +94,15 @@ release: \
|
||||
|
||||
release-victoria-metrics: \
|
||||
release-victoria-metrics-amd64 \
|
||||
release-victoria-metrics-arm \
|
||||
release-victoria-metrics-arm64
|
||||
|
||||
release-victoria-metrics-amd64:
|
||||
GOARCH=amd64 $(MAKE) release-victoria-metrics-generic
|
||||
|
||||
release-victoria-metrics-arm:
|
||||
GOARCH=arm $(MAKE) release-victoria-metrics-generic
|
||||
|
||||
release-victoria-metrics-arm64:
|
||||
GOARCH=arm64 $(MAKE) release-victoria-metrics-generic
|
||||
|
||||
@@ -85,11 +112,13 @@ release-victoria-metrics-generic: victoria-metrics-$(GOARCH)-prod
|
||||
victoria-metrics-$(GOARCH)-prod \
|
||||
&& sha256sum victoria-metrics-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
victoria-metrics-$(GOARCH)-prod \
|
||||
| sed s/-$(GOARCH)// > victoria-metrics-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
| sed s/-$(GOARCH)-prod/-prod/ > victoria-metrics-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
|
||||
release-vmutils: \
|
||||
release-vmutils-amd64 \
|
||||
release-vmutils-arm64
|
||||
release-vmutils-arm64 \
|
||||
release-vmutils-arm \
|
||||
release-vmutils-windows-amd64
|
||||
|
||||
release-vmutils-amd64:
|
||||
GOARCH=amd64 $(MAKE) release-vmutils-generic
|
||||
@@ -97,6 +126,12 @@ release-vmutils-amd64:
|
||||
release-vmutils-arm64:
|
||||
GOARCH=arm64 $(MAKE) release-vmutils-generic
|
||||
|
||||
release-vmutils-arm:
|
||||
GOARCH=arm $(MAKE) release-vmutils-generic
|
||||
|
||||
release-vmutils-windows-amd64:
|
||||
GOARCH=amd64 $(MAKE) release-vmutils-windows-generic
|
||||
|
||||
release-vmutils-generic: \
|
||||
vmagent-$(GOARCH)-prod \
|
||||
vmalert-$(GOARCH)-prod \
|
||||
@@ -119,7 +154,25 @@ release-vmutils-generic: \
|
||||
vmbackup-$(GOARCH)-prod \
|
||||
vmrestore-$(GOARCH)-prod \
|
||||
vmctl-$(GOARCH)-prod \
|
||||
| sed s/-$(GOARCH)// > vmutils-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
| sed s/-$(GOARCH)-prod/-prod/ > vmutils-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
|
||||
release-vmutils-windows-generic: \
|
||||
vmagent-windows-$(GOARCH)-prod \
|
||||
vmalert-windows-$(GOARCH)-prod \
|
||||
vmauth-windows-$(GOARCH)-prod \
|
||||
vmctl-windows-$(GOARCH)-prod
|
||||
cd bin && \
|
||||
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
|
||||
vmagent-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-windows-$(GOARCH)-prod.exe \
|
||||
vmauth-windows-$(GOARCH)-prod.exe \
|
||||
vmctl-windows-$(GOARCH)-prod.exe \
|
||||
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
|
||||
vmagent-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-windows-$(GOARCH)-prod.exe \
|
||||
vmauth-windows-$(GOARCH)-prod.exe \
|
||||
vmctl-windows-$(GOARCH)-prod.exe \
|
||||
> vmutils-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
|
||||
pprof-cpu:
|
||||
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
|
||||
@@ -137,7 +190,7 @@ lint: install-golint
|
||||
golint app/...
|
||||
|
||||
install-golint:
|
||||
which golint || go install golang.org/x/lint/golint
|
||||
which golint || GO111MODULE=off go get golang.org/x/lint/golint
|
||||
|
||||
errcheck: install-errcheck
|
||||
errcheck -exclude=errcheck_excludes.txt ./lib/...
|
||||
@@ -152,7 +205,7 @@ errcheck: install-errcheck
|
||||
errcheck -exclude=errcheck_excludes.txt ./app/vmctl/...
|
||||
|
||||
install-errcheck:
|
||||
which errcheck || go install github.com/kisielk/errcheck
|
||||
which errcheck || GO111MODULE=off go get github.com/kisielk/errcheck
|
||||
|
||||
check-all: fmt vet lint errcheck golangci-lint
|
||||
|
||||
@@ -194,24 +247,43 @@ app-local-pure:
|
||||
app-local-with-goarch:
|
||||
GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-$(GOARCH)$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
|
||||
|
||||
app-local-windows-with-goarch:
|
||||
CGO_ENABLED=0 GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-windows-$(GOARCH)$(RACE).exe $(PKG_PREFIX)/app/$(APP_NAME)
|
||||
|
||||
quicktemplate-gen: install-qtc
|
||||
qtc
|
||||
|
||||
install-qtc:
|
||||
which qtc || go install github.com/valyala/quicktemplate/qtc
|
||||
which qtc || GO111MODULE=off go get github.com/valyala/quicktemplate/qtc
|
||||
|
||||
|
||||
golangci-lint: install-golangci-lint
|
||||
golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m
|
||||
|
||||
install-golangci-lint:
|
||||
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.29.0
|
||||
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.40.1
|
||||
|
||||
install-wwhrd:
|
||||
which wwhrd || GO111MODULE=off go get github.com/frapposelli/wwhrd
|
||||
|
||||
check-licenses: install-wwhrd
|
||||
wwhrd check -f .wwhrd.yml
|
||||
|
||||
copy-docs:
|
||||
echo "---\nsort: ${ORDER}\n---\n" > ${DST}
|
||||
cat ${SRC} >> ${DST}
|
||||
|
||||
# Copies docs for all components and adds the order tag.
|
||||
# Cluster docs are supposed to be ordered as 9th.
|
||||
# The rest of the docs are ordered manually.
|
||||
docs-sync:
|
||||
cp app/vmagent/README.md docs/vmagent.md
|
||||
cp app/vmalert/README.md docs/vmalert.md
|
||||
cp app/vmauth/README.md docs/vmauth.md
|
||||
cp app/vmbackup/README.md docs/vmbackup.md
|
||||
cp app/vmrestore/README.md docs/vmrestore.md
|
||||
cp app/vmctl/README.md docs/vmctl.md
|
||||
cp README.md docs/Single-server-VictoriaMetrics.md
|
||||
cp README.md docs/README.md
|
||||
SRC=README.md DST=docs/Single-server-VictoriaMetrics.md ORDER=1 $(MAKE) copy-docs
|
||||
SRC=app/vmagent/README.md DST=docs/vmagent.md ORDER=3 $(MAKE) copy-docs
|
||||
SRC=app/vmalert/README.md DST=docs/vmalert.md ORDER=4 $(MAKE) copy-docs
|
||||
SRC=app/vmauth/README.md DST=docs/vmauth.md ORDER=5 $(MAKE) copy-docs
|
||||
SRC=app/vmbackup/README.md DST=docs/vmbackup.md ORDER=6 $(MAKE) copy-docs
|
||||
SRC=app/vmrestore/README.md DST=docs/vmrestore.md ORDER=7 $(MAKE) copy-docs
|
||||
SRC=app/vmctl/README.md DST=docs/vmctl.md ORDER=8 $(MAKE) copy-docs
|
||||
SRC=app/vmgateway/README.md DST=docs/vmgateway.md ORDER=9 $(MAKE) copy-docs
|
||||
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md ORDER=10 $(MAKE) copy-docs
|
||||
|
||||
552
README.md
552
README.md
@@ -1,14 +1,14 @@
|
||||
# VictoriaMetrics
|
||||
|
||||
[](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
|
||||
[](https://hub.docker.com/r/victoriametrics/victoria-metrics)
|
||||
[](http://slack.victoriametrics.com/)
|
||||
[](https://slack.victoriametrics.com/)
|
||||
[](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/LICENSE)
|
||||
[](https://goreportcard.com/report/github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
[](https://github.com/VictoriaMetrics/VictoriaMetrics/actions)
|
||||
[](https://codecov.io/gh/VictoriaMetrics/VictoriaMetrics)
|
||||
|
||||

|
||||
|
||||
## VictoriaMetrics
|
||||
<img src="logo.png" width="300" alt="VictoriaMetrics logo">
|
||||
|
||||
VictoriaMetrics is a fast, cost-effective and scalable monitoring solution and time series database.
|
||||
|
||||
@@ -28,30 +28,35 @@ See [features available for enterprise customers](https://victoriametrics.com/en
|
||||
|
||||
## Case studies and talks
|
||||
|
||||
Alphabetically sorted links to case studies:
|
||||
Case studies:
|
||||
|
||||
* [adidas](https://victoriametrics.github.io/CaseStudies.html#adidas)
|
||||
* [Adsterra](https://victoriametrics.github.io/CaseStudies.html#adsterra)
|
||||
* [ARNES](https://victoriametrics.github.io/CaseStudies.html#arnes)
|
||||
* [Brandwatch](https://victoriametrics.github.io/CaseStudies.html#brandwatch)
|
||||
* [CERN](https://victoriametrics.github.io/CaseStudies.html#cern)
|
||||
* [COLOPL](https://victoriametrics.github.io/CaseStudies.html#colopl)
|
||||
* [Dreamteam](https://victoriametrics.github.io/CaseStudies.html#dreamteam)
|
||||
* [Idealo.de](https://victoriametrics.github.io/CaseStudies.html#idealode)
|
||||
* [MHI Vestas Offshore Wind](https://victoriametrics.github.io/CaseStudies.html#mhi-vestas-offshore-wind)
|
||||
* [Synthesio](https://victoriametrics.github.io/CaseStudies.html#synthesio)
|
||||
* [Wedos.com](https://victoriametrics.github.io/CaseStudies.html#wedoscom)
|
||||
* [Wix.com](https://victoriametrics.github.io/CaseStudies.html#wixcom)
|
||||
* [Zerodha](https://victoriametrics.github.io/CaseStudies.html#zerodha)
|
||||
* [zhihu](https://victoriametrics.github.io/CaseStudies.html#zhihu)
|
||||
* [adidas](https://docs.victoriametrics.com/CaseStudies.html#adidas)
|
||||
* [Adsterra](https://docs.victoriametrics.com/CaseStudies.html#adsterra)
|
||||
* [ARNES](https://docs.victoriametrics.com/CaseStudies.html#arnes)
|
||||
* [Brandwatch](https://docs.victoriametrics.com/CaseStudies.html#brandwatch)
|
||||
* [CERN](https://docs.victoriametrics.com/CaseStudies.html#cern)
|
||||
* [COLOPL](https://docs.victoriametrics.com/CaseStudies.html#colopl)
|
||||
* [Dreamteam](https://docs.victoriametrics.com/CaseStudies.html#dreamteam)
|
||||
* [German Research Center for Artificial Intelligence](https://docs.victoriametrics.com/CaseStudies.html#german-research-center-for-artificial-intelligence)
|
||||
* [Groove X](https://docs.victoriametrics.com/CaseStudies.html#groove-x)
|
||||
* [Idealo.de](https://docs.victoriametrics.com/CaseStudies.html#idealode)
|
||||
* [MHI Vestas Offshore Wind](https://docs.victoriametrics.com/CaseStudies.html#mhi-vestas-offshore-wind)
|
||||
* [Sensedia](https://docs.victoriametrics.com/CaseStudies.html#sensedia)
|
||||
* [Synthesio](https://docs.victoriametrics.com/CaseStudies.html#synthesio)
|
||||
* [Wedos.com](https://docs.victoriametrics.com/CaseStudies.html#wedoscom)
|
||||
* [Wix.com](https://docs.victoriametrics.com/CaseStudies.html#wixcom)
|
||||
* [Zerodha](https://docs.victoriametrics.com/CaseStudies.html#zerodha)
|
||||
* [zhihu](https://docs.victoriametrics.com/CaseStudies.html#zhihu)
|
||||
|
||||
See also [articles and slides about VictoriaMetrics from our users](https://docs.victoriametrics.com/Articles.html#third-party-articles-and-slides-about-victoriametrics)
|
||||
|
||||
|
||||
## Prominent features
|
||||
|
||||
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://victoriametrics.github.io/vmagent.html).
|
||||
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://docs.victoriametrics.com/vmagent.html).
|
||||
See [these docs](#prometheus-setup) for details.
|
||||
* VictoriaMetrics supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
|
||||
* VictoriaMetrics implements [MetricsQL](https://victoriametrics.github.io/MetricsQL.html) query language backwards compatible with PromQL.
|
||||
* VictoriaMetrics implements [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) query language backwards compatible with PromQL.
|
||||
* VictoriaMetrics provides global query view. Multiple Prometheus instances or any other data sources may ingest data into VictoriaMetrics.
|
||||
Later this data may be queried via a single query.
|
||||
* High performance and good scalability for both [inserts](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b)
|
||||
@@ -76,7 +81,7 @@ Alphabetically sorted links to case studies:
|
||||
* All the configuration is done via explicit command-line flags with reasonable defaults.
|
||||
* All the data is stored in a single directory pointed by `-storageDataPath` command-line flag.
|
||||
* Easy and fast backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282)
|
||||
to S3 or GCS with [vmbackup](https://victoriametrics.github.io/vmbackup.html) / [vmrestore](https://victoriametrics.github.io/vmrestore.html).
|
||||
to S3 or GCS with [vmbackup](https://docs.victoriametrics.com/vmbackup.html) / [vmrestore](https://docs.victoriametrics.com/vmrestore.html).
|
||||
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
|
||||
* Storage is protected from corruption on unclean shutdown (i.e. OOM, hardware reset or `kill -9`) thanks to [the storage architecture](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
* Supports metrics' scraping, ingestion and [backfilling](#backfilling) via the following protocols:
|
||||
@@ -93,9 +98,10 @@ Alphabetically sorted links to case studies:
|
||||
* [Prometheus exposition format](#how-to-import-data-in-prometheus-exposition-format).
|
||||
* [Arbitrary CSV data](#how-to-import-csv-data).
|
||||
* Supports metrics' relabeling. See [these docs](#relabeling) for details.
|
||||
* Ideally works with big amounts of time series data from Kubernetes, IoT sensors, connected cars, industrial telemetry, financial data and various Enterprise workloads.
|
||||
* Can deal with high cardinality and high churn rate issues using [series limiter](#cardinality-limiter).
|
||||
* Ideally works with big amounts of time series data from APM, Kubernetes, IoT sensors, connected cars, industrial telemetry, financial data and various Enterprise workloads.
|
||||
* Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
* See also technical [Articles about VictoriaMetrics](https://victoriametrics.github.io/Articles.html).
|
||||
* See also technical [Articles about VictoriaMetrics](https://docs.victoriametrics.com/Articles.html).
|
||||
|
||||
|
||||
## Operation
|
||||
@@ -117,6 +123,7 @@ Alphabetically sorted links to case studies:
|
||||
* [Prometheus querying API usage](#prometheus-querying-api-usage)
|
||||
* [Prometheus querying API enhancements](#prometheus-querying-api-enhancements)
|
||||
* [Graphite API usage](#graphite-api-usage)
|
||||
* [Graphite Render API usage](#graphite-render-api-usage)
|
||||
* [Graphite Metrics API usage](#graphite-metrics-api-usage)
|
||||
* [Graphite Tags API usage](#graphite-tags-api-usage)
|
||||
* [How to build from sources](#how-to-build-from-sources)
|
||||
@@ -153,6 +160,8 @@ Alphabetically sorted links to case studies:
|
||||
* [Security](#security)
|
||||
* [Tuning](#tuning)
|
||||
* [Monitoring](#monitoring)
|
||||
* [TSDB stats](#tsdb-stats)
|
||||
* [Cardinality limiter](#cardinality-limiter)
|
||||
* [Troubleshooting](#troubleshooting)
|
||||
* [Data migration](#data-migration)
|
||||
* [Backfilling](#backfilling)
|
||||
@@ -165,11 +174,12 @@ Alphabetically sorted links to case studies:
|
||||
* [Contacts](#contacts)
|
||||
* [Community and contributions](#community-and-contributions)
|
||||
* [Reporting bugs](#reporting-bugs)
|
||||
* [Victoria Metrics Logo](#victoria-metrics-logo)
|
||||
* [VictoriaMetrics Logo](#victoria-metrics-logo)
|
||||
* [Logo Usage Guidelines](#logo-usage-guidelines)
|
||||
* [Font used](#font-used)
|
||||
* [Color Palette](#color-palette)
|
||||
* [We kindly ask](#we-kindly-ask)
|
||||
* [List of command-line flags](#list-of-command-line-flags)
|
||||
|
||||
|
||||
## How to start VictoriaMetrics
|
||||
@@ -182,7 +192,7 @@ The following command-line flags are used the most:
|
||||
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in the current working directory.
|
||||
* `-retentionPeriod` - retention for stored data. Older data is automatically deleted. Default retention is 1 month. See [these docs](#retention) for more details.
|
||||
|
||||
Other flags have good enough default values, so set them only if you really need this. Pass `-help` to see all the available flags with description and default values.
|
||||
Other flags have good enough default values, so set them only if you really need this. Pass `-help` to see [all the available flags with description and default values](#list-of-command-line-flags).
|
||||
|
||||
See how to [ingest data to VictoriaMetrics](#how-to-import-time-series-data), how to [query VictoriaMetrics](#grafana-setup)
|
||||
and how to [handle alerts](#alerting).
|
||||
@@ -272,8 +282,8 @@ Read more about tuning remote write for Prometheus [here](https://prometheus.io/
|
||||
|
||||
It is recommended to upgrade Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases) or newer, since previous versions may have issues with `remote_write`.
|
||||
|
||||
Take a look also at [vmagent](https://victoriametrics.github.io/vmagent.html)
|
||||
and [vmalert](https://victoriametrics.github.io/vmalert.html),
|
||||
Take a look also at [vmagent](https://docs.victoriametrics.com/vmagent.html)
|
||||
and [vmalert](https://docs.victoriametrics.com/vmalert.html),
|
||||
which can be used as faster and less resource-hungry alternative to Prometheus.
|
||||
|
||||
|
||||
@@ -288,7 +298,7 @@ http://<victoriametrics-addr>:8428
|
||||
Substitute `<victoriametrics-addr>` with the hostname or IP address of VictoriaMetrics.
|
||||
|
||||
Then build graphs with the created datasource using [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/)
|
||||
or [MetricsQL](https://victoriametrics.github.io/MetricsQL.html). VictoriaMetrics supports [Prometheus querying API](#prometheus-querying-api-usage),
|
||||
or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html). VictoriaMetrics supports [Prometheus querying API](#prometheus-querying-api-usage),
|
||||
which is used by Grafana.
|
||||
|
||||
|
||||
@@ -336,8 +346,11 @@ Currently the following [scrape_config](https://prometheus.io/docs/prometheus/la
|
||||
* [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config)
|
||||
* [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config)
|
||||
* [openstack_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config)
|
||||
* [docker_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#docker_sd_config)
|
||||
* [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config)
|
||||
* [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config)
|
||||
* [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config)
|
||||
* [http_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config)
|
||||
|
||||
|
||||
Other `*_sd_config` types will be supported in the future.
|
||||
@@ -346,7 +359,7 @@ The file pointed by `-promscrape.config` may contain `%{ENV_VAR}` placeholders,
|
||||
|
||||
VictoriaMetrics also supports [importing data in Prometheus exposition format](#how-to-import-data-in-prometheus-exposition-format).
|
||||
|
||||
See also [vmagent](https://victoriametrics.github.io/vmagent.html), which can be used as drop-in replacement for Prometheus.
|
||||
See also [vmagent](https://docs.victoriametrics.com/vmagent.html), which can be used as drop-in replacement for Prometheus.
|
||||
|
||||
|
||||
## How to send data from InfluxDB-compatible agents such as [Telegraf](https://www.influxdata.com/time-series-platform/telegraf/)
|
||||
@@ -413,6 +426,10 @@ while VictoriaMetrics stores them with *milliseconds* precision.
|
||||
Extra labels may be added to all the written time series by passing `extra_label=name=value` query args.
|
||||
For example, `/write?extra_label=foo=bar` would add `{foo="bar"}` label to all the ingested metrics.
|
||||
|
||||
Some plugins for Telegraf such as [fluentd](https://github.com/fangli/fluent-plugin-influxdb), [Juniper/open-nti](https://github.com/Juniper/open-nti)
|
||||
or [Juniper/jtimon](https://github.com/Juniper/jtimon) send `SHOW DATABASES` query to `/query` and expect a particular database name in the response.
|
||||
Comma-separated list of expected databases can be passed to VictoriaMetrics via `-influx.databaseNames` command-line flag.
|
||||
|
||||
## How to send data from Graphite-compatible agents such as [StatsD](https://github.com/etsy/statsd)
|
||||
|
||||
Enable Graphite receiver in VictoriaMetrics by setting `-graphiteListenAddr` command line flag. For instance,
|
||||
@@ -450,11 +467,7 @@ The `/api/v1/export` endpoint should return the following response:
|
||||
Data sent to VictoriaMetrics via `Graphite plaintext protocol` may be read via the following APIs:
|
||||
|
||||
* [Graphite API](#graphite-api-usage)
|
||||
* [Prometheus querying API](#prometheus-querying-api-usage). Graphite metric names may special chars such as `-`, which may clash
|
||||
with [MetricsQL operations](https://victoriametrics.github.io/MetricsQL.html). Such metrics can be queries via `{__name__="foo-bar.baz"}`.
|
||||
VictoriaMetrics supports `__graphite__` pseudo-label for selecting time series with Graphite-compatible filters in [MetricsQL](https://victoriametrics.github.io/MetricsQL.html).
|
||||
For example, `{__graphite__="foo.*.bar"}` is equivalent to `{__name__=~"foo[.][^.]*[.]bar"}`, but it works faster
|
||||
and it is easier to use when migrating from Graphite to VictoriaMetrics.
|
||||
* [Prometheus querying API](#prometheus-querying-api-usage). VictoriaMetrics supports `__graphite__` pseudo-label for selecting time series with Graphite-compatible filters in [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html). For example, `{__graphite__="foo.*.bar"}` is equivalent to `{__name__=~"foo[.][^.]*[.]bar"}`, but it works faster and it is easier to use when migrating from Graphite to VictoriaMetrics.
|
||||
* [go-graphite/carbonapi](https://github.com/go-graphite/carbonapi/blob/main/cmd/carbonapi/carbonapi.example.victoriametrics.yaml)
|
||||
|
||||
## How to send data from OpenTSDB-compatible agents
|
||||
@@ -543,9 +556,7 @@ VictoriaMetrics supports the following handlers from [Prometheus querying API](h
|
||||
* [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers)
|
||||
* [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names)
|
||||
* [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values)
|
||||
* [/api/v1/status/tsdb](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats). VictoriaMetrics accepts optional `topN=N` and `date=YYYY-MM-DD`
|
||||
query args for this handler, where `N` is the number of top entries to return in the response and `YYYY-MM-DD` is the date for collecting the stats.
|
||||
By default top 10 entries are returned and the stats is collected for the current day.
|
||||
* [/api/v1/status/tsdb](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats). See [these docs](#tsdb-stats) for details.
|
||||
* [/api/v1/targets](https://prometheus.io/docs/prometheus/latest/querying/api/#targets) - see [these docs](#how-to-scrape-prometheus-exporters-such-as-node-exporter) for more details.
|
||||
|
||||
These handlers can be queried from Prometheus-compatible clients such as Grafana or curl.
|
||||
@@ -557,21 +568,18 @@ All the Prometheus querying API handlers can be prepended with `/prometheus` pre
|
||||
VictoriaMetrics accepts optional `extra_label=<label_name>=<label_value>` query arg, which can be used for enforcing additional label filters for queries. For example,
|
||||
`/api/v1/query_range?extra_label=user_id=123&query=<query>` would automatically add `{user_id="123"}` label filter to the given `<query>`. This functionality can be used
|
||||
for limiting the scope of time series visible to the given tenant. It is expected that the `extra_label` query arg is automatically set by auth proxy sitting
|
||||
in front of VictoriaMetrics. [Contact us](mailto:sales@victoriametrics.com) if you need assistance with such a proxy.
|
||||
in front of VictoriaMetrics. See [vmauth](https://docs.victoriametrics.com/vmauth.html) and [vmgateway](https://docs.victoriametrics.com/vmgateway.html) as examples of such proxies.
|
||||
|
||||
VictoriaMetrics accepts relative times in `time`, `start` and `end` query args additionally to unix timestamps and [RFC3339](https://www.ietf.org/rfc/rfc3339.txt).
|
||||
For example, the following query would return data for the last 30 minutes: `/api/v1/query_range?start=-30m&query=...`.
|
||||
|
||||
VictoriaMetrics accepts `round_digits` query arg for `/api/v1/query` and `/api/v1/query_range` handlers. It can be used for rounding response values to the given number of digits after the decimal point. For example, `/api/v1/query?query=avg_over_time(temperature[1h])&round_digits=2` would round response values to up to two digits after the decimal point.
|
||||
|
||||
By default, VictoriaMetrics returns time series for the last 5 minutes from `/api/v1/series`, while the Prometheus API defaults to all time. Use `start` and `end` to select a different time range.
|
||||
|
||||
VictoriaMetrics accepts additional args for `/api/v1/labels` and `/api/v1/label/.../values` handlers.
|
||||
See [this feature request](https://github.com/prometheus/prometheus/issues/6178) for details:
|
||||
|
||||
* Any number [time series selectors](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) via `match[]` query arg.
|
||||
* Optional `start` and `end` query args for limiting the time range for the selected labels or label values.
|
||||
|
||||
Additionally VictoriaMetrics provides the following handlers:
|
||||
|
||||
* `/vmui` - Basic Web UI
|
||||
* `/api/v1/series/count` - returns the total number of time series in the database. Some notes:
|
||||
* the handler scans the whole inverted index, so it can be slow if the database contains tens of millions of time series;
|
||||
* the handler may count [deleted time series](#how-to-delete-time-series) additionally to normal time series due to internal implementation restrictions;
|
||||
@@ -597,7 +605,11 @@ VictoriaMetrics supports the following Graphite APIs, which are needed for [Grap
|
||||
|
||||
All the Graphite handlers can be pre-pended with `/graphite` prefix. For example, both `/graphite/metrics/find` and `/metrics/find` should work.
|
||||
|
||||
VictoriaMetrics supports `__graphite__` pseudo-label for filtering time series with Graphite-compatible filters in [MetricsQL](https://victoriametrics.github.io/MetricsQL.html).
|
||||
VictoriaMetrics accepts optional `extra_label=<label_name>=<label_value>` query arg for all the Graphite APIs. This arg can be used for limiting the scope of time series
|
||||
visible to the given tenant. It is expected that the `extra_label` query arg is automatically set by auth proxy sitting in front of VictoriaMetrics.
|
||||
[Contact us](mailto:sales@victoriametrics.com) if you need assistance with such a proxy.
|
||||
|
||||
VictoriaMetrics supports `__graphite__` pseudo-label for filtering time series with Graphite-compatible filters in [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html).
|
||||
For example, `{__graphite__="foo.*.bar"}` is equivalent to `{__name__=~"foo[.][^.]*[.]bar"}`, but it works faster
|
||||
and it is easier to use when migrating from Graphite to VictoriaMetrics.
|
||||
|
||||
@@ -646,14 +658,14 @@ to your needs or when testing bugfixes.
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make victoria-metrics` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make victoria-metrics` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `victoria-metrics` binary and puts it into the `bin` folder.
|
||||
|
||||
### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make victoria-metrics-prod` from the root folder of the repository.
|
||||
2. Run `make victoria-metrics-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `victoria-metrics-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
### ARM build
|
||||
@@ -662,24 +674,22 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make victoria-metrics-arm` or `make victoria-metrics-arm64` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make victoria-metrics-arm` or `make victoria-metrics-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `victoria-metrics-arm` or `victoria-metrics-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
### Production ARM build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make victoria-metrics-arm-prod` or `make victoria-metrics-arm64-prod` from the root folder of the repository.
|
||||
2. Run `make victoria-metrics-arm-prod` or `make victoria-metrics-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `victoria-metrics-arm-prod` or `victoria-metrics-arm64-prod` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
### Pure Go build (CGO_ENABLED=0)
|
||||
|
||||
`Pure Go` mode builds only Go code without [cgo](https://golang.org/cmd/cgo/) dependencies.
|
||||
This is an experimental mode, which may result in a lower compression ratio and slower decompression performance.
|
||||
Use it with caution!
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make victoria-metrics-pure` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make victoria-metrics-pure` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `victoria-metrics-pure` binary and puts it into the `bin` folder.
|
||||
|
||||
### Building docker images
|
||||
@@ -699,7 +709,7 @@ ROOT_IMAGE=scratch make package-victoria-metrics
|
||||
## Start with docker-compose
|
||||
|
||||
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
|
||||
helps to spin up VictoriaMetrics, [vmagent](https://victoriametrics.github.io/vmagent.html) and Grafana with one command.
|
||||
helps to spin up VictoriaMetrics, [vmagent](https://docs.victoriametrics.com/vmagent.html) and Grafana with one command.
|
||||
More details may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#folder-contains-basic-images-and-tools-for-building-and-running-victoria-metrics-in-docker).
|
||||
|
||||
|
||||
@@ -722,7 +732,7 @@ The page will return the following JSON response:
|
||||
|
||||
Snapshots are created under `<-storageDataPath>/snapshots` directory, where `<-storageDataPath>`
|
||||
is the command-line flag value. Snapshots can be archived to backup storage at any time
|
||||
with [vmbackup](https://victoriametrics.github.io/vmbackup.html).
|
||||
with [vmbackup](https://docs.victoriametrics.com/vmbackup.html).
|
||||
|
||||
The `http://<victoriametrics-addr>:8428/snapshot/list` page contains the list of available snapshots.
|
||||
|
||||
@@ -734,7 +744,7 @@ Navigate to `http://<victoriametrics-addr>:8428/snapshot/delete_all` in order to
|
||||
Steps for restoring from a snapshot:
|
||||
|
||||
1. Stop VictoriaMetrics with `kill -INT`.
|
||||
2. Restore snapshot contents from backup with [vmrestore](https://victoriametrics.github.io/vmrestore.html)
|
||||
2. Restore snapshot contents from backup with [vmrestore](https://docs.victoriametrics.com/vmrestore.html)
|
||||
to the directory pointed by `-storageDataPath`.
|
||||
3. Start VictoriaMetrics.
|
||||
|
||||
@@ -814,6 +824,8 @@ Optional `start` and `end` args may be added to the request in order to limit th
|
||||
unix timestamp in seconds or [RFC3339](https://www.ietf.org/rfc/rfc3339.txt) values.
|
||||
|
||||
The exported data can be imported to VictoriaMetrics via [/api/v1/import/native](#how-to-import-data-in-native-format).
|
||||
The native export format may change in an incompatible way between VictoriaMetrics releases, so the data exported from the release X
|
||||
can fail to be imported into VictoriaMetrics release Y.
|
||||
|
||||
|
||||
### How to export data in JSON line format
|
||||
@@ -946,6 +958,8 @@ For example, `/api/v1/import?extra_label=foo=bar` would add `"foo":"bar"` label
|
||||
|
||||
Note that it could be required to flush response cache after importing historical data. See [these docs](#backfilling) for details.
|
||||
|
||||
VictoriaMetrics parses input JSON lines one-by-one. It loads the whole JSON line in memory, then parses it and then saves the parsed samples into persistent storage. This means that VictoriaMetrics can occupy big amounts of RAM when importing too long JSON lines. The solution is to split too long JSON lines into smaller lines. It is OK if samples for a single time series are split among multiple JSON lines.
|
||||
|
||||
|
||||
### How to import CSV data
|
||||
|
||||
@@ -1061,7 +1075,7 @@ VictoriaMetrics provides the following extra actions for relabeling rules:
|
||||
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
|
||||
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
|
||||
|
||||
See also [relabeling in vmagent](https://victoriametrics.github.io/vmagent.html#relabeling).
|
||||
See also [relabeling in vmagent](https://docs.victoriametrics.com/vmagent.html#relabeling).
|
||||
|
||||
|
||||
## Federation
|
||||
@@ -1075,51 +1089,34 @@ on the interval `[now - max_lookback ... now]` is scraped for each time series.
|
||||
For instance, `/federate?match[]=up&max_lookback=1h` would return last points on the `[now - 1h ... now]` interval. This may be useful for time series federation
|
||||
with scrape intervals exceeding `5m`.
|
||||
|
||||
|
||||
## Capacity planning
|
||||
|
||||
A rough estimation of the required resources for ingestion path:
|
||||
VictoriaMetrics uses lower amounts of CPU, RAM and storage space on production workloads compared to competing solutions (Prometheus, Thanos, Cortex, TimescaleDB, InfluxDB, QuestDB, M3DB) according to [our case studies](https://docs.victoriametrics.com/CaseStudies.html).
|
||||
|
||||
* RAM size: less than 1KB per active time series. So, ~1GB of RAM is required for 1M active time series.
|
||||
Time series is considered active if new data points have been added to it recently or if it has been recently queried.
|
||||
The number of active time series may be obtained from `vm_cache_entries{type="storage/hour_metric_ids"}` metric
|
||||
exported on the `/metrics` page.
|
||||
VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with `-memory.allowedPercent` or `-memory.allowedBytes` flags.
|
||||
VictoriaMetrics capacity scales linearly with the available resources. The needed amounts of CPU and RAM highly depend on the workload - the number of active time series, series churn rate, query types, query qps, etc. It is recommended to set up a test VictoriaMetrics for your production workload and iteratively scale CPU and RAM resources until it becomes stable according to [troubleshooting docs](#troubleshooting). A single-node VictoriaMetrics works perfectly with the following production workload according to [our case studies](https://docs.victoriametrics.com/CaseStudies.html):
|
||||
|
||||
* CPU cores: a CPU core per 300K inserted data points per second. So, ~4 CPU cores are required for processing
|
||||
the insert stream of 1M data points per second. The ingestion rate may be lower for high cardinality data or for time series with high number of labels.
|
||||
See [this article](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) for details.
|
||||
If you see lower numbers per CPU core, then it is likely active time series info doesn't fit caches,
|
||||
so you need more RAM for lowering CPU usage.
|
||||
* Ingestion rate: 1.5+ million samples per second
|
||||
* Active time series: 50+ million
|
||||
* Total time series: 5+ billion
|
||||
* Time series churn rate: 150+ million of new series per day
|
||||
* Total number of samples: 10+ trillion
|
||||
* Queries: 200+ qps
|
||||
* Query latency (99th percentile): 1 second
|
||||
|
||||
* Storage space: less than a byte per data point on average. So, ~260GB is required for storing a month-long insert stream
|
||||
of 100K data points per second.
|
||||
The actual storage size heavily depends on data randomness (entropy). Higher randomness means higher storage size requirements.
|
||||
Read [this article](https://medium.com/faun/victoriametrics-achieving-better-compression-for-time-series-data-than-gorilla-317bc1f95932)
|
||||
for details.
|
||||
The needed storage space for the given retention (the retention is set via `-retentionPeriod` command-line flag) can be extrapolated from disk space usage in a test run. For example, if `-storageDataPath` directory size becomes 10GB after a day-long test run on a production workload, then it will need at least `10GB*100=1TB` of disk space for `-retentionPeriod=100d` (100-days retention period).
|
||||
|
||||
* Network usage: outbound traffic is negligible. Ingress traffic is ~100 bytes per ingested data point via
|
||||
[Prometheus remote_write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write).
|
||||
The actual ingress bandwidth usage depends on the average number of labels per ingested metric and the average size
|
||||
of label values. The higher number of per-metric labels and longer label values mean the higher ingress bandwidth.
|
||||
It is recommended to leave the following amounts of spare resources:
|
||||
|
||||
The required resources for query path:
|
||||
|
||||
* RAM size: depends on the number of time series to scan in each query and the `step`
|
||||
argument passed to [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries).
|
||||
The higher number of scanned time series and lower `step` argument results in the higher RAM usage.
|
||||
|
||||
* CPU cores: a CPU core per 30 millions of scanned data points per second.
|
||||
This means that heavy queries that touch big number of time series (over 10K) and/or big number data points (over 100M)
|
||||
usually require more CPU resources than tiny queries that touch a few time series with small number of data points.
|
||||
|
||||
* Network usage: depends on the frequency and the type of incoming requests. Typical Grafana dashboards usually
|
||||
require negligible network bandwidth.
|
||||
* 50% of free RAM for reducing the probability of OOM (out of memory) crashes and slowdowns during temporary spikes in workload.
|
||||
* 50% of spare CPU for reducing the probability of slowdowns during temporary spikes in workload.
|
||||
* At least 30% of free storage space at the directory pointed by `-storageDataPath` command-line flag.
|
||||
|
||||
|
||||
## High availability
|
||||
|
||||
* Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
|
||||
* Pass addresses of these instances to [vmagent](https://victoriametrics.github.io/vmagent.html) via `-remoteWrite.url` command-line flag:
|
||||
* Pass addresses of these instances to [vmagent](https://docs.victoriametrics.com/vmagent.html) via `-remoteWrite.url` command-line flag:
|
||||
|
||||
```bash
|
||||
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
|
||||
@@ -1144,7 +1141,7 @@ remote_write:
|
||||
kill -HUP `pidof prometheus`
|
||||
```
|
||||
|
||||
It is recommended to use [vmagent](https://victoriametrics.github.io/vmagent.html) instead of Prometheus for highly loaded setups.
|
||||
It is recommended to use [vmagent](https://docs.victoriametrics.com/vmagent.html) instead of Prometheus for highly loaded setups.
|
||||
|
||||
* Now Prometheus should write data into all the configured `remote_write` urls in parallel.
|
||||
* Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
|
||||
@@ -1163,10 +1160,10 @@ VictoriaMetrics de-duplicates data points if `-dedup.minScrapeInterval` command-
|
||||
is set to positive duration. For example, `-dedup.minScrapeInterval=60s` would de-duplicate data points
|
||||
on the same time series if they fall within the same discrete 60s bucket. The earliest data point will be kept. In the case of equal timestamps, an arbitrary data point will be kept.
|
||||
|
||||
The recommended value for `-dedup.minScrapeInterval` must be equal to the `scrape_interval` config from Prometheus configs.
|
||||
The recommended value for `-dedup.minScrapeInterval` must be equal to the `scrape_interval` config from Prometheus configs. It is recommended to have a single `scrape_interval` across all the scrape targets. See [this article](https://www.robustperception.io/keep-it-simple-scrape_interval-id) for details.
|
||||
|
||||
The de-duplication reduces disk space usage if multiple identically configured Prometheus instances in HA pair
|
||||
write data to the same VictoriaMetrics instance. Note that these Prometheus instances must have identical
|
||||
The de-duplication reduces disk space usage if multiple identically configured [vmagent](https://docs.victoriametrics.com/vmagent.html) or Prometheus instances in HA pair
|
||||
write data to the same VictoriaMetrics instance. These vmagent or Prometheus instances must have identical
|
||||
`external_labels` section in their configs, so they write data to the same time series.
|
||||
|
||||
|
||||
@@ -1193,9 +1190,9 @@ Just start multiple VictoriaMetrics instances with distinct values for the follo
|
||||
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
|
||||
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
|
||||
|
||||
Then set up [vmauth](https://victoriametrics.github.io/vmauth.html) in front of VictoriaMetrics instances,
|
||||
Then set up [vmauth](https://docs.victoriametrics.com/vmauth.html) in front of VictoriaMetrics instances,
|
||||
so it could route requests from particular user to VictoriaMetrics with the desired retention.
|
||||
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html).
|
||||
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html).
|
||||
|
||||
|
||||
## Downsampling
|
||||
@@ -1233,7 +1230,7 @@ horizontally scalable long-term remote storage for really large Prometheus deplo
|
||||
|
||||
## Alerting
|
||||
|
||||
It is recommended using [vmalert](https://victoriametrics.github.io/vmalert.html) for alerting.
|
||||
It is recommended using [vmalert](https://docs.victoriametrics.com/vmalert.html) for alerting.
|
||||
|
||||
Additionally, alerting can be set up with the following tools:
|
||||
|
||||
@@ -1258,7 +1255,7 @@ Consider setting the following command-line flags:
|
||||
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
|
||||
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`.
|
||||
|
||||
Prefer authorizing all the incoming requests from untrusted networks with [vmauth](https://victoriametrics.github.io/vmauth.html)
|
||||
Prefer authorizing all the incoming requests from untrusted networks with [vmauth](https://docs.victoriametrics.com/vmauth.html)
|
||||
or similar auth proxy.
|
||||
|
||||
|
||||
@@ -1281,7 +1278,7 @@ mkfs.ext4 ... -O 64bit,huge_file,extent -T huge
|
||||
## Monitoring
|
||||
|
||||
VictoriaMetrics exports internal metrics in Prometheus format at `/metrics` page.
|
||||
These metrics may be collected by [vmagent](https://victoriametrics.github.io/vmagent.html)
|
||||
These metrics may be collected by [vmagent](https://docs.victoriametrics.com/vmagent.html)
|
||||
or Prometheus by adding the corresponding scrape config to it.
|
||||
Alternatively they can be self-scraped by setting `-selfScrapeInterval` command-line flag to duration greater than 0.
|
||||
For example, `-selfScrapeInterval=10s` would enable self-scraping of `/metrics` page with 10 seconds interval.
|
||||
@@ -1289,6 +1286,8 @@ For example, `-selfScrapeInterval=10s` would enable self-scraping of `/metrics`
|
||||
There are official Grafana dashboards for [single-node VictoriaMetrics](https://grafana.com/dashboards/10229) and [clustered VictoriaMetrics](https://grafana.com/grafana/dashboards/11176).
|
||||
There is also an [alternative dashboard for clustered VictoriaMetrics](https://grafana.com/grafana/dashboards/11831).
|
||||
|
||||
It is recommended setting up alerts in [vmalert](https://docs.victoriametrics.com/vmalert.html) or in Prometheus from [this config](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml).
|
||||
|
||||
The most interesting metrics are:
|
||||
|
||||
* `vm_cache_entries{type="storage/hour_metric_ids"}` - the number of time series with new data points during the last hour
|
||||
@@ -1309,15 +1308,53 @@ VictoriaMetrics also exposes currently running queries with their execution time
|
||||
|
||||
See the example of alerting rules for VM components [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml).
|
||||
|
||||
|
||||
## TSDB stats
|
||||
|
||||
VictoriaMetrics returns TSDB stats at `/api/v1/status/tsdb` page in the way similar to Prometheus - see [these Prometheus docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats). VictoriaMetrics accepts the following optional query args at `/api/v1/status/tsdb` page:
|
||||
* `topN=N` where `N` is the number of top entries to return in the response. By default top 10 entries are returned.
|
||||
* `date=YYYY-MM-DD` where `YYYY-MM-DD` is the date for collecting the stats. By default the stats are collected for the current day.
|
||||
* `match[]=SELECTOR` where `SELECTOR` is an arbitrary [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) for series to take into account during stats calculation. By default all the series are taken into account.
|
||||
* `extra_label=LABEL=VALUE`. See [these docs](#prometheus-querying-api-enhancements) for more details.
|
||||
|
||||
|
||||
## Cardinality limiter
|
||||
|
||||
By default VictoriaMetrics doesn't limit the number of stored time series. The limit can be enforced by setting the following command-line flags:
|
||||
|
||||
* `-storage.maxHourlySeries` - limits the number of time series that can be added during the last hour. Useful for limiting the number of active time series.
|
||||
* `-storage.maxDailySeries` - limits the number of time series that can be added during the last day. Useful for limiting daily churn rate.
|
||||
|
||||
Both limits can be set simultaneously. If any of these limits is reached, then incoming samples for new time series are dropped. A sample of dropped series is put in the log with `WARNING` level.
|
||||
|
||||
The exceeded limits can be [monitored](#monitoring) with the following metrics:
|
||||
|
||||
* `vm_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series.
|
||||
* `vm_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series.
|
||||
|
||||
These limits are approximate, so VictoriaMetrics can underflow/overflow the limit by a small percentage (usually less than 1%).
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
* It is recommended to use default command-line flag values (i.e. don't set them explicitly) until the need
|
||||
of tweaking these flag values arises.
|
||||
|
||||
* It is recommended inspecting logs during troubleshooting, since they may contain useful information.
|
||||
|
||||
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
|
||||
since the encountered issue could be already fixed there.
|
||||
|
||||
* It is recommended inspecting logs during troubleshooting, since they may contain useful information.
|
||||
* It is recommended to have at least 50% of spare resources for CPU, disk IO and RAM, so VictoriaMetrics could handle short spikes in the workload without performance issues.
|
||||
|
||||
* VictoriaMetrics requires free disk space for [merging data files to bigger ones](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
It may slow down when there is not enough free space left. So make sure `-storageDataPath` directory
|
||||
has at least 20% of free space. The remaining amount of free space
|
||||
can be [monitored](#monitoring) via `vm_free_disk_space_bytes` metric. The total size of data
|
||||
stored on the disk can be monitored via sum of `vm_data_size_bytes` metrics.
|
||||
See also `vm_merge_need_free_disk_space` metrics, which are set to values higher than 0
|
||||
if background merge cannot be initiated due to free disk space shortage. The value shows the number of per-month partitions,
|
||||
which would start background merge if they had more free disk space.
|
||||
|
||||
* VictoriaMetrics buffers incoming data in memory for up to a few seconds before flushing it to persistent storage.
|
||||
This may lead to the following "issues":
|
||||
@@ -1328,22 +1365,16 @@ See the example of alerting rules for VM components [here](https://github.com/Vi
|
||||
|
||||
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
|
||||
then it is likely you have too many active time series for the current amount of RAM.
|
||||
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
|
||||
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
|
||||
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics such as `vm_slow_row_inserts_total` and `vm_slow_metric_name_loads_total`, which could be used
|
||||
as an indicator of low amounts of RAM. It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
|
||||
ingestion and query performance in this case.
|
||||
|
||||
* If the order of labels for the same metrics can change over time (e.g. if `metric{k1="v1",k2="v2"}` may become `metric{k2="v2",k1="v1"}`),
|
||||
then it is recommended running VictoriaMetrics with `-sortLabels` command-line flag in order to reduce memory usage and CPU usage.
|
||||
|
||||
* VictoriaMetrics prioritizes data ingestion over data querying. So if it doesn't have enough resources for data ingestion,
|
||||
then data querying may slow down significantly.
|
||||
|
||||
* VictoriaMetrics requires free disk space for [merging data files to bigger ones](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
|
||||
It may slow down when there is not enough free space left. So make sure `-storageDataPath` directory
|
||||
has at least 20% of free space comparing to disk size. The remaining amount of free space
|
||||
can be [monitored](#monitoring) via `vm_free_disk_space_bytes` metric. The total size of data
|
||||
stored on the disk can be monitored via sum of `vm_data_size_bytes` metrics.
|
||||
See also `vm_merge_need_free_disk_space` metrics, which are set to values higher than 0
|
||||
if background merge cannot be initiated due to free disk space shortage. The value shows the number of per-month partitions,
|
||||
which would start background merge if they had more free disk space.
|
||||
|
||||
* If VictoriaMetrics doesn't work because certain parts are corrupted due to disk errors,
|
||||
then just remove directories with broken parts. It is safe removing subdirectories under `<-storageDataPath>/data/{big,small}/YYYY_MM` directories
|
||||
when VictoriaMetrics isn't running. This recovers VictoriaMetrics at the cost of losing the data stored in the deleted broken parts.
|
||||
@@ -1360,10 +1391,9 @@ See the example of alerting rules for VM components [here](https://github.com/Vi
|
||||
It may be needed in order to suppress default gap filling algorithm used by VictoriaMetrics - by default it assumes
|
||||
each time series is continuous instead of discrete, so it fills gaps between real samples with regular intervals.
|
||||
|
||||
* Metrics and labels leading to high cardinality or high churn rate can be determined at `/api/v1/status/tsdb` page.
|
||||
See [these docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats) for details.
|
||||
VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` args on this page. By default `date` equals to the current date,
|
||||
while `topN` equals to 10.
|
||||
* Metrics and labels leading to high cardinality or high churn rate can be determined at `/api/v1/status/tsdb` page. See [these docs](#tsdb-stats) for details.
|
||||
|
||||
* New time series can be logged if `-logNewSeries` command-line flag is passed to VictoriaMetrics.
|
||||
|
||||
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
|
||||
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
|
||||
@@ -1375,15 +1405,20 @@ See the example of alerting rules for VM components [here](https://github.com/Vi
|
||||
* VictoriaMetrics ignores `NaN` values during data ingestion.
|
||||
|
||||
|
||||
## Cache removal
|
||||
|
||||
VictoriaMetrics uses various internal caches. These caches are stored to `<-storageDataPath>/cache` directory during graceful shutdown (e.g. when VictoriaMetrics is stopped by sending `SIGINT` signal). The caches are read on the next VictoriaMetrics startup. Sometimes it is needed to remove such caches on the next startup. This can be performed by placing `reset_cache_on_startup` file inside the `<-storageDataPath>/cache` directory before the restart of VictoriaMetrics. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1447) for details.
|
||||
|
||||
|
||||
## Data migration
|
||||
|
||||
Use [vmctl](https://victoriametrics.github.io/vmctl.html) for data migration. It supports the following data migration types:
|
||||
Use [vmctl](https://docs.victoriametrics.com/vmctl.html) for data migration. It supports the following data migration types:
|
||||
|
||||
* From Prometheus to VictoriaMetrics
|
||||
* From InfluxDB to VictoriaMetrics
|
||||
* From VictoriaMetrics to VictoriaMetrics
|
||||
|
||||
See [vmctl docs](https://victoriametrics.github.io/vmctl.html) for more details.
|
||||
See [vmctl docs](https://docs.victoriametrics.com/vmctl.html) for more details.
|
||||
|
||||
|
||||
## Backfilling
|
||||
@@ -1414,7 +1449,7 @@ should be used only for one-off updates. It shouldn't be used for frequent updat
|
||||
## Replication
|
||||
|
||||
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
|
||||
See [these docs](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#replication-and-data-safety) for details.
|
||||
See [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#replication-and-data-safety) for details.
|
||||
|
||||
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
|
||||
|
||||
@@ -1423,8 +1458,8 @@ See also [high availability docs](#high-availability) and [backup docs](#backups
|
||||
|
||||
## Backups
|
||||
|
||||
VictoriaMetrics supports backups via [vmbackup](https://victoriametrics.github.io/vmbackup.html)
|
||||
and [vmrestore](https://victoriametrics.github.io/vmrestore.html) tools.
|
||||
VictoriaMetrics supports backups via [vmbackup](https://docs.victoriametrics.com/vmbackup.html)
|
||||
and [vmrestore](https://docs.victoriametrics.com/vmrestore.html) tools.
|
||||
We also provide `vmbackupmanager` tool for paid enterprise subscribers - see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for details.
|
||||
|
||||
|
||||
@@ -1460,7 +1495,7 @@ The collected profiles may be analyzed with [go tool pprof](https://github.com/g
|
||||
* [Ansible role for installing single-node VictoriaMetrics](https://github.com/dreamteam-gg/ansible-victoriametrics-role).
|
||||
* [Ansible role for installing cluster VictoriaMetrics](https://github.com/Slapper/ansible-victoriametrics-cluster-role).
|
||||
* [Snap package for VictoriaMetrics](https://snapcraft.io/victoriametrics).
|
||||
* [vmalert-cli](https://github.com/aorfanos/vmalert-cli) - a CLI application for managing [vmalert](https://victoriametrics.github.io/vmalert.html).
|
||||
* [vmalert-cli](https://github.com/aorfanos/vmalert-cli) - a CLI application for managing [vmalert](https://docs.victoriametrics.com/vmalert.html).
|
||||
|
||||
|
||||
## Third-party contributions
|
||||
@@ -1480,7 +1515,7 @@ Contact us with any questions regarding VictoriaMetrics at [info@victoriametrics
|
||||
|
||||
Feel free to ask any questions regarding VictoriaMetrics:
|
||||
|
||||
* [slack](http://slack.victoriametrics.com/)
|
||||
* [slack](https://slack.victoriametrics.com/)
|
||||
* [reddit](https://www.reddit.com/r/VictoriaMetrics/)
|
||||
* [telegram-en](https://t.me/VictoriaMetrics_en)
|
||||
* [telegram-ru](https://t.me/VictoriaMetrics_ru1)
|
||||
@@ -1508,7 +1543,7 @@ Adhering `KISS` principle simplifies the resulting code and architecture, so it
|
||||
Report bugs and propose new features [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues).
|
||||
|
||||
|
||||
## Victoria Metrics Logo
|
||||
## VictoriaMetrics Logo
|
||||
|
||||
[Zip](VM_logo.zip) contains three folders with different image orientations (main color and inverted version).
|
||||
|
||||
@@ -1536,3 +1571,268 @@ Files included in each folder:
|
||||
* There should be sufficient clear space around the logo.
|
||||
* Do not change spacing, alignment, or relative locations of the design elements.
|
||||
* Do not change the proportions of any of the design elements or the design itself. You may resize as needed but must retain all proportions.
|
||||
|
||||
|
||||
## List of command-line flags
|
||||
|
||||
Pass `-help` to VictoriaMetrics in order to see the list of supported command-line flags with their description:
|
||||
|
||||
```
|
||||
-bigMergeConcurrency int
|
||||
The maximum number of CPU cores to use for big merges. Default value is used if set to 0
|
||||
-csvTrimTimestamp duration
|
||||
Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
-dedup.minScrapeInterval duration
|
||||
Leave only the first sample in every time series per each discrete interval equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/#deduplication for details
|
||||
-deleteAuthKey string
|
||||
authKey for metrics' deletion via /api/v1/admin/tsdb/delete_series and /tags/delSeries
|
||||
-denyQueriesOutsideRetention
|
||||
Whether to deny queries outside of the configured -retentionPeriod. When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. This may be useful when multiple data sources with distinct retentions are hidden behind query-tee
|
||||
-dryRun
|
||||
Whether to check only -promscrape.config and then exit. Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-finalMergeDelay duration
|
||||
The delay before starting final merge for per-month partition after no new data is ingested into it. Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. Zero value disables final merge
|
||||
-forceFlushAuthKey string
|
||||
authKey, which must be passed in query string to /internal/force_flush pages
|
||||
-forceMergeAuthKey string
|
||||
authKey, which must be passed in query string to /internal/force_merge pages
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-graphiteListenAddr string
|
||||
TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty
|
||||
-graphiteTrimTimestamp duration
|
||||
Trim timestamps for Graphite data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
TCP address to listen for http connections (default ":8428")
|
||||
-import.maxLineLen size
|
||||
The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with 'max_rows_per_line' query arg passed to /api/v1/export
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 104857600)
|
||||
-influx.databaseNames array
|
||||
Comma-separated list of database names to return from /query and /influx/query API. This can be needed for accepting data from Telegraf plugins such as https://github.com/fangli/fluent-plugin-influxdb
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-influx.maxLineSize size
|
||||
The maximum size in bytes for a single Influx line during parsing
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 262144)
|
||||
-influxListenAddr string
|
||||
TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. This flag isn't needed when ingesting data over HTTP - just send it to http://<victoriametrics>:8428/write
|
||||
-influxMeasurementFieldSeparator string
|
||||
Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol (default "_")
|
||||
-influxSkipMeasurement
|
||||
Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'
|
||||
-influxSkipSingleField
|
||||
Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if Influx line contains only a single field
|
||||
-influxTrimTimestamp duration
|
||||
Trim timestamps for Influx line protocol data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
-insert.maxQueueDuration duration
|
||||
The maximum duration for waiting in the queue for insert requests due to -maxConcurrentInserts (default 1m0s)
|
||||
-logNewSeries
|
||||
Whether to log new series. This option is for debug purposes only. It can lead to performance issues when big number of new series are ingested into VictoriaMetrics
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxConcurrentInserts int
|
||||
The maximum number of concurrent inserts. Default value should work for most cases, since it minimizes the overhead for concurrent inserts. This option is tightly coupled with -insert.maxQueueDuration (default 16)
|
||||
-maxInsertRequestSize size
|
||||
The maximum size in bytes of a single Prometheus remote_write API request
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
-maxLabelsPerTimeseries int
|
||||
The maximum number of labels accepted per time series. Superfluous labels are dropped (default 30)
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-opentsdbHTTPListenAddr string
|
||||
TCP address to listen for OpenTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty
|
||||
-opentsdbListenAddr string
|
||||
TCP and UDP address to listen for OpenTSDB metrics. Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. Usually :4242 must be set. Doesn't work if empty
|
||||
-opentsdbTrimTimestamp duration
|
||||
Trim timestamps for OpenTSDB 'telnet put' data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
|
||||
-opentsdbhttp.maxInsertRequestSize size
|
||||
The maximum size of OpenTSDB HTTP put request
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
-opentsdbhttpTrimTimestamp duration
|
||||
Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It overrides httpAuth settings
|
||||
-precisionBits int
|
||||
The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss (default 64)
|
||||
-promscrape.cluster.memberNum int
|
||||
The number of this member in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster
|
||||
-promscrape.cluster.membersCount int
|
||||
The number of members in a cluster of scrapers. Each member must have an unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets
|
||||
-promscrape.cluster.replicationFactor int
|
||||
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
|
||||
-promscrape.config string
|
||||
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
|
||||
-promscrape.config.dryRun
|
||||
Checks -promscrape.config file for errors and unsupported fields and then exits. Returns non-zero exit code on parsing errors and emits these errors to stderr. See also -promscrape.config.strictParse command-line flag. Pass -loggerLevel=ERROR if you don't need to see info messages in the output.
|
||||
-promscrape.config.strictParse
|
||||
Whether to allow only supported fields in -promscrape.config . By default unsupported fields are silently skipped
|
||||
-promscrape.configCheckInterval duration
|
||||
Interval for checking for changes in '-promscrape.config' file. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
|
||||
-promscrape.consul.waitTime duration
|
||||
Wait time used by Consul service discovery. Default value is used if not set
|
||||
-promscrape.consulSDCheckInterval duration
|
||||
Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s)
|
||||
-promscrape.digitaloceanSDCheckInterval duration
|
||||
Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s)
|
||||
-promscrape.disableCompression
|
||||
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
|
||||
-promscrape.disableKeepAlive
|
||||
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets have no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
|
||||
-promscrape.discovery.concurrency int
|
||||
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
|
||||
-promscrape.discovery.concurrentWaitTime duration
|
||||
The maximum duration for waiting to perform API requests if more than -promscrape.discovery.concurrency requests are simultaneously performed (default 1m0s)
|
||||
-promscrape.dnsSDCheckInterval duration
|
||||
Interval for checking for changes in dns. This works only if dns_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config for details (default 30s)
|
||||
-promscrape.dockerSDCheckInterval duration
|
||||
Interval for checking for changes in docker. This works only if docker_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#docker_sd_config for details (default 30s)
|
||||
-promscrape.dockerswarmSDCheckInterval duration
|
||||
Interval for checking for changes in dockerswarm. This works only if dockerswarm_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config for details (default 30s)
|
||||
-promscrape.dropOriginalLabels
|
||||
Whether to drop original labels for scrape targets at /targets and /api/v1/targets pages. This may be needed for reducing memory usage when original labels for big number of scrape targets occupy big amounts of memory. Note that this reduces debuggability for improper per-target relabeling configs
|
||||
-promscrape.ec2SDCheckInterval duration
|
||||
Interval for checking for changes in ec2. This works only if ec2_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config for details (default 1m0s)
|
||||
-promscrape.eurekaSDCheckInterval duration
|
||||
Interval for checking for changes in eureka. This works only if eureka_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config for details (default 30s)
|
||||
-promscrape.fileSDCheckInterval duration
|
||||
Interval for checking for changes in 'file_sd_config'. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config for details (default 30s)
|
||||
-promscrape.gceSDCheckInterval duration
|
||||
Interval for checking for changes in gce. This works only if gce_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config for details (default 1m0s)
|
||||
-promscrape.httpSDCheckInterval duration
|
||||
Interval for checking for changes in http endpoint service discovery. This works only if http_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config for details (default 1m0s)
|
||||
-promscrape.kubernetes.apiServerTimeout duration
|
||||
How frequently to reload the full state from Kubernetes API server (default 30m0s)
|
||||
-promscrape.kubernetesSDCheckInterval duration
|
||||
Interval for checking for changes in Kubernetes API server. This works only if kubernetes_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config for details (default 30s)
|
||||
-promscrape.maxDroppedTargets int
|
||||
The maximum number of droppedTargets to show at /api/v1/targets page. Increase this value if your setup drops more scrape targets during relabeling and you need investigating labels for all the dropped targets. Note that the increased number of tracked dropped targets may result in increased memory usage (default 1000)
|
||||
-promscrape.maxScrapeSize size
|
||||
The maximum size of scrape response in bytes to process from Prometheus targets. Bigger responses are rejected
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16777216)
|
||||
-promscrape.openstackSDCheckInterval duration
|
||||
Interval for checking for changes in openstack API server. This works only if openstack_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config for details (default 30s)
|
||||
-promscrape.streamParse
|
||||
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is possible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
|
||||
-promscrape.suppressDuplicateScrapeTargetErrors
|
||||
Whether to suppress 'duplicate scrape target' errors; see https://docs.victoriametrics.com/vmagent.html#troubleshooting for details
|
||||
-promscrape.suppressScrapeErrors
|
||||
Whether to suppress scrape errors logging. The last error for each target is always available at '/targets' page even if scrape errors logging is suppressed
|
||||
-relabelConfig string
|
||||
Optional path to a file with relabeling rules, which are applied to all the ingested metrics. See https://docs.victoriametrics.com/#relabeling for details
|
||||
-relabelDebug
|
||||
Whether to log metrics before and after relabeling with -relabelConfig. If the -relabelDebug is enabled, then the metrics aren't sent to storage. This is useful for debugging the relabeling configs
|
||||
-retentionPeriod value
|
||||
Data with timestamps outside the retentionPeriod is automatically deleted
|
||||
The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 1)
|
||||
-search.cacheTimestampOffset duration
|
||||
The maximum duration since the current time for response data, which is always queried from the original raw data, without using the response cache. Increase this value if you see gaps in responses due to time synchronization issues between VictoriaMetrics and data sources (default 5m0s)
|
||||
-search.disableCache
|
||||
Whether to disable response caching. This may be useful during data backfilling
|
||||
-search.latencyOffset duration
|
||||
The time when data points become visible in query results after the collection. Too small value can result in incomplete last points for query results (default 30s)
|
||||
-search.logSlowQueryDuration duration
|
||||
Log queries with execution time exceeding this value. Zero disables slow query logging (default 5s)
|
||||
-search.maxConcurrentRequests int
|
||||
The maximum number of concurrent search requests. It shouldn't be high, since a single request can saturate all the CPU cores. See also -search.maxQueueDuration (default 8)
|
||||
-search.maxExportDuration duration
|
||||
The maximum duration for /api/v1/export call (default 720h0m0s)
|
||||
-search.maxLookback duration
|
||||
Synonym to -search.lookback-delta from Prometheus. The value is dynamically detected from interval between time series datapoints if not set. It can be overridden on per-query basis via max_lookback arg. See also '-search.maxStalenessInterval' flag, which has the same meaning due to historical reasons
|
||||
-search.maxPointsPerTimeseries int
|
||||
The maximum points per a single timeseries returned from /api/v1/query_range. This option doesn't limit the number of scanned raw samples in the database. The main purpose of this option is to limit the number of per-series points returned to graphing UI such as Grafana. There is no sense in setting this limit to values bigger than the horizontal resolution of the graph (default 30000)
|
||||
-search.maxQueryDuration duration
|
||||
The maximum duration for query execution (default 30s)
|
||||
-search.maxQueryLen size
|
||||
The maximum search query length in bytes
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
|
||||
-search.maxQueueDuration duration
|
||||
The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
|
||||
-search.maxStalenessInterval duration
|
||||
The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.maxLookback' flag, which has the same meaning due to historical reasons
|
||||
-search.maxStatusRequestDuration duration
|
||||
The maximum duration for /api/v1/status/* requests (default 5m0s)
|
||||
-search.maxStepForPointsAdjustment duration
|
||||
The maximum step when /api/v1/query_range handler adjusts points with timestamps closer than -search.latencyOffset to the current time. The adjustment is needed because such points may contain incomplete data (default 1m0s)
|
||||
-search.maxTagKeys int
|
||||
The maximum number of tag keys returned from /api/v1/labels (default 100000)
|
||||
-search.maxTagValueSuffixesPerSearch int
|
||||
The maximum number of tag value suffixes returned from /metrics/find (default 100000)
|
||||
-search.maxTagValues int
|
||||
The maximum number of tag values returned from /api/v1/label/<label_name>/values (default 100000)
|
||||
-search.maxUniqueTimeseries int
|
||||
The maximum number of unique time series each search can scan (default 300000)
|
||||
-search.minStalenessInterval duration
|
||||
The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval'
|
||||
-search.queryStats.lastQueriesCount int
|
||||
Query stats for /api/v1/status/top_queries is tracked on this number of last queries. Zero value disables query stats tracking (default 20000)
|
||||
-search.queryStats.minQueryDuration duration
|
||||
The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats
|
||||
-search.resetCacheAuthKey string
|
||||
Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call
|
||||
-search.treatDotsAsIsInRegexps
|
||||
Whether to treat dots as is in regexp label filters used in queries. For example, foo{bar=~"a.b.c"} will be automatically converted to foo{bar=~"a\\.b\\.c"}, i.e. all the dots in regexp filters will be automatically escaped in order to match only dot char instead of matching any char. Dots in ".+", ".*" and ".{n}" regexps aren't escaped. This option is DEPRECATED in favor of {__graphite__="a.*.c"} syntax for selecting metrics matching the given Graphite metrics filter
|
||||
-selfScrapeInstance string
|
||||
Value for 'instance' label, which is added to self-scraped metrics (default "self")
|
||||
-selfScrapeInterval duration
|
||||
Interval for self-scraping own metrics at /metrics page
|
||||
-selfScrapeJob string
|
||||
Value for 'job' label, which is added to self-scraped metrics (default "victoria-metrics")
|
||||
-smallMergeConcurrency int
|
||||
The maximum number of CPU cores to use for small merges. Default value is used if set to 0
|
||||
-snapshotAuthKey string
|
||||
authKey, which must be passed in query string to /snapshot* pages
|
||||
-sortLabels
|
||||
Whether to sort labels for incoming samples before writing them to storage. This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit
|
||||
-storage.maxDailySeries int
|
||||
The maximum number of unique series that can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries
|
||||
-storage.maxHourlySeries int
|
||||
The maximum number of unique series that can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries
|
||||
-storageDataPath string
|
||||
Path to storage data (default "victoria-metrics-data")
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
BIN
VM_logo.zip
BIN
VM_logo.zip
Binary file not shown.
@@ -3,10 +3,8 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
|
||||
@@ -26,9 +24,8 @@ import (
|
||||
|
||||
var (
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8428", "TCP address to listen for http connections")
|
||||
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Remove superflouos samples from time series if they are located closer to each other than this duration. "+
|
||||
"This may be useful for reducing overhead when multiple identically configured Prometheus instances write data to the same VictoriaMetrics. "+
|
||||
"Deduplication is disabled if the -dedup.minScrapeInterval is 0")
|
||||
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Leave only the first sample in every time series per each discrete interval "+
|
||||
"equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/#deduplication for details")
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only -promscrape.config and then exit. "+
|
||||
"Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse")
|
||||
)
|
||||
@@ -86,14 +83,20 @@ func main() {
|
||||
|
||||
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
if r.URL.Path == "/" {
|
||||
fmt.Fprintf(w, "<h2>Single-node VictoriaMetrics.</h2></br>")
|
||||
fmt.Fprintf(w, "See docs at <a href='https://victoriametrics.github.io/'>https://victoriametrics.github.io/</a></br>")
|
||||
fmt.Fprintf(w, "Useful endpoints: </br>")
|
||||
writeAPIHelp(w, [][]string{
|
||||
if r.Method != "GET" {
|
||||
return false
|
||||
}
|
||||
fmt.Fprintf(w, "<h2>Single-node VictoriaMetrics</h2></br>")
|
||||
fmt.Fprintf(w, "See docs at <a href='https://docs.victoriametrics.com/'>https://docs.victoriametrics.com/</a></br>")
|
||||
fmt.Fprintf(w, "Useful endpoints:</br>")
|
||||
httpserver.WriteAPIHelp(w, [][2]string{
|
||||
{"/vmui", "Web UI"},
|
||||
{"/targets", "discovered targets list"},
|
||||
{"/api/v1/targets", "advanced information about discovered targets in JSON format"},
|
||||
{"/metrics", "available service metrics"},
|
||||
{"/api/v1/status/tsdb", "tsdb status page"},
|
||||
{"/api/v1/status/top_queries", "top queries"},
|
||||
{"/api/v1/status/active_queries", "active queries"},
|
||||
})
|
||||
return true
|
||||
}
|
||||
@@ -109,20 +112,11 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func writeAPIHelp(w io.Writer, pathList [][]string) {
|
||||
pathPrefix := httpserver.GetPathPrefix()
|
||||
for _, p := range pathList {
|
||||
p, doc := p[0], p[1]
|
||||
p = path.Join(pathPrefix, p)
|
||||
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
|
||||
}
|
||||
}
|
||||
|
||||
func usage() {
|
||||
const s = `
|
||||
victoria-metrics is a time series database and monitoring solution.
|
||||
|
||||
See the docs at https://victoriametrics.github.io/
|
||||
See the docs at https://docs.victoriametrics.com/
|
||||
`
|
||||
flagutil.Usage(s)
|
||||
}
|
||||
|
||||
@@ -22,7 +22,6 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
@@ -149,7 +148,7 @@ func setUp() {
|
||||
}
|
||||
|
||||
func processFlags() {
|
||||
envflag.Parse()
|
||||
flag.Parse()
|
||||
for _, fv := range []struct {
|
||||
flag string
|
||||
value string
|
||||
|
||||
@@ -78,3 +78,9 @@ vmagent-local-with-goarch:
|
||||
|
||||
vmagent-pure:
|
||||
APP_NAME=vmagent $(MAKE) app-local-pure
|
||||
|
||||
vmagent-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmagent $(MAKE) app-local-windows-with-goarch
|
||||
|
||||
vmagent-windows-amd64-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
## vmagent
|
||||
# vmagent
|
||||
|
||||
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
|
||||
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
|
||||
`vmagent` is a tiny but mighty agent which helps you collect metrics from various sources
|
||||
and store them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
|
||||
or any other Prometheus-compatible storage systems that support the `remote_write` protocol.
|
||||
|
||||
<img alt="vmagent" src="vmagent.png">
|
||||
|
||||
@@ -10,40 +10,42 @@ or any other Prometheus-compatible storage system that supports the `remote_writ
|
||||
## Motivation
|
||||
|
||||
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
|
||||
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
|
||||
Also, we found that users’ infrastructure are snowflakes - no two are alike, and we decided to add more flexibility
|
||||
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
|
||||
and RAM friendly to scrape metrics from Prometheus-compatible exporters into VictoriaMetrics.
|
||||
Also, we found that our users' infrastructures are like snowflakes in that no two are alike. Therefore we decided to add more flexibility
|
||||
to `vmagent` such as the ability to push metrics instead of pulling them. We did our best and will continue to improve vmagent.
|
||||
|
||||
|
||||
## Features
|
||||
|
||||
* Can be used as drop-in replacement for Prometheus for scraping targets such as [node_exporter](https://github.com/prometheus/node_exporter).
|
||||
* Can be used as a drop-in replacement for Prometheus for scraping targets such as [node_exporter](https://github.com/prometheus/node_exporter).
|
||||
See [Quick Start](#quick-start) for details.
|
||||
* Can add, remove and modify labels (aka tags) via Prometheus relabeling. Can filter data before sending it to remote storage. See [these docs](#relabeling) for details.
|
||||
* Accepts data via all the ingestion protocols supported by VictoriaMetrics:
|
||||
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
|
||||
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
|
||||
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-opentsdb-compatible-agents).
|
||||
* Accepts data via all ingestion protocols supported by VictoriaMetrics:
|
||||
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
|
||||
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
|
||||
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-opentsdb-compatible-agents).
|
||||
* Prometheus remote write protocol via `http://<vmagent>:8429/api/v1/write`.
|
||||
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-json-line-format).
|
||||
* Native data import protocol via `http://<vmagent>:8429/api/v1/import/native`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-native-format).
|
||||
* Data in Prometheus exposition format. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-prometheus-exposition-format) for details.
|
||||
* Arbitrary CSV data via `http://<vmagent>:8429/api/v1/import/csv`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-csv-data).
|
||||
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-json-line-format).
|
||||
* Native data import protocol via `http://<vmagent>:8429/api/v1/import/native`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-native-format).
|
||||
* Data in Prometheus exposition format. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-prometheus-exposition-format) for details.
|
||||
* Arbitrary CSV data via `http://<vmagent>:8429/api/v1/import/csv`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-csv-data).
|
||||
* Can replicate collected metrics simultaneously to multiple remote storage systems.
|
||||
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
|
||||
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
|
||||
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
|
||||
* Scrape targets can be spread among multiple `vmagent` instances when big number of targets must be scraped. See [these docs](#scraping-big-number-of-targets) for details.
|
||||
* Works smoothly in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
|
||||
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as the connection
|
||||
to the remote storage is repaired. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
|
||||
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared with Prometheus.
|
||||
* Scrape targets can be spread among multiple `vmagent` instances when a big number of targets must be scraped. See [these docs](#scraping-big-number-of-targets).
|
||||
* Can efficiently scrape targets that expose millions of time series such as [/federate endpoint in Prometheus](https://prometheus.io/docs/prometheus/latest/federation/). See [these docs](#stream-parsing-mode).
|
||||
* Can deal with high cardinality and high churn rate issues by limiting the number of unique time series sent to remote storage systems. See [these docs](#cardinality-limiter).
|
||||
|
||||
|
||||
## Quick Start
|
||||
|
||||
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
|
||||
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
|
||||
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
|
||||
and configure the following flags to the `vmagent` binary in order to start scraping Prometheus targets:
|
||||
|
||||
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary number of remote storage systems.
|
||||
* `-promscrape.config` with the path to Prometheus config file (usually located at `/etc/prometheus/prometheus.yml`)
|
||||
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times to replicate data concurrently to an arbitrary number of remote storage systems.
|
||||
|
||||
Example command line:
|
||||
|
||||
@@ -51,17 +53,17 @@ Example command line:
|
||||
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
```
|
||||
|
||||
If you only need to collect Influx data, then the following is sufficient:
|
||||
If you only need to collect Influx data, then the following command is sufficient:
|
||||
|
||||
```
|
||||
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
|
||||
```
|
||||
|
||||
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
|
||||
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
|
||||
|
||||
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags).
|
||||
|
||||
Pass `-help` to `vmagent` in order to see the full list of supported command-line flags with their descriptions.
|
||||
Pass `-help` to `vmagent` in order to see [the full list of supported command-line flags with their descriptions](#advanced-usage).
|
||||
|
||||
|
||||
## Configuration update
|
||||
@@ -85,140 +87,150 @@ There is also `-promscrape.configCheckInterval` command-line option, which can b
|
||||
|
||||
### IoT and Edge monitoring
|
||||
|
||||
`vmagent` can run and collect metrics in IoT and industrial networks with unreliable or scheduled connections to the remote storage.
|
||||
`vmagent` can run and collect metrics in IoT and industrial networks with unreliable or scheduled connections to their remote storage.
|
||||
It buffers the collected data in local files until the connection to remote storage becomes available and then sends the buffered
|
||||
data to the remote storage. It re-tries sending the data to remote storage on any errors.
|
||||
data to the remote storage. It re-tries sending the data to remote storage until any errors are resolved.
|
||||
The maximum buffer size can be limited with `-remoteWrite.maxDiskUsagePerURL`.
|
||||
|
||||
`vmagent` works on various architectures from IoT world - 32-bit arm, 64-bit arm, ppc64, 386, amd64.
|
||||
`vmagent` works on various architectures from the IoT world - 32-bit arm, 64-bit arm, ppc64, 386, amd64.
|
||||
See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/Makefile) for details.
|
||||
|
||||
|
||||
### Drop-in replacement for Prometheus
|
||||
|
||||
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
|
||||
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
|
||||
If you use Prometheus only for scraping metrics from various targets and forwarding those metrics to remote storage
|
||||
then `vmagent` can replace Prometheus. Typically, `vmagent` requires lower amounts of RAM, CPU and network bandwidth compared with Prometheus.
|
||||
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
|
||||
|
||||
|
||||
### Replication and high availability
|
||||
|
||||
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
|
||||
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instances.
|
||||
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
|
||||
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
|
||||
If a single remote storage instance is temporarily out of service, then the collected data remains available in another remote storage instance.
|
||||
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again and then it sends the buffered data to the remote storage in order to prevent data gaps.
|
||||
|
||||
|
||||
### Relabeling and filtering
|
||||
|
||||
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
|
||||
`vmagent` can add, remove or update labels on the collected data before sending it to the remote storage. Additionally,
|
||||
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
|
||||
See [these docs](#relabeling) for details.
|
||||
Please see [these docs](#relabeling) for details.
|
||||
|
||||
|
||||
### Splitting data streams among multiple systems
|
||||
|
||||
`vmagent` supports splitting the collected data between multiple destinations with the help of `-remoteWrite.urlRelabelConfig`,
|
||||
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
|
||||
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
|
||||
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
|
||||
which is applied independently for each configured `-remoteWrite.url` destination. For example, it is possible to replicate or split
|
||||
data among long-term remote storage, short-term remote storage and a real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
|
||||
Note that each destination can receive its own subset of the collected data due to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
|
||||
|
||||
|
||||
### Prometheus remote_write proxy
|
||||
|
||||
`vmagent` may be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via `remote_write` API
|
||||
at `/api/v1/write` endpoint, apply relabeling and filtering and then proxy it to another `remote_write` systems.
|
||||
`vmagent` can be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via the `remote_write` API
|
||||
at the `/api/v1/write` endpoint, then apply relabeling and filtering and proxy it to another `remote_write` system.
|
||||
The `vmagent` can be configured to encrypt the incoming `remote_write` requests with `-tls*` command-line flags.
|
||||
Additionally, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
|
||||
Also, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
|
||||
|
||||
|
||||
### remote_write for clustered version
|
||||
|
||||
Despite `vmagent` can accept data in several supported protocols (OpenTSDB, Influx, Prometheus, Graphite) and scrape data from various targets, writes always peformed in Promethes remote_write protocol. Therefore for clustered version `-remoteWrite.url` command-line flag should be configured as `<schema>://<vminsert-host>:8480/insert/<customer-id>/prometheus/api/v1/write`
|
||||
While `vmagent` can accept data in several supported protocols (OpenTSDB, Influx, Prometheus, Graphite) and scrape data from various targets, writes are always performed in Prometheus remote_write protocol. Therefore for the [clustered version](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html), the `-remoteWrite.url` command-line flag should be configured as `<schema>://<vminsert-host>:8480/insert/<customer-id>/prometheus/api/v1/write`
|
||||
|
||||
|
||||
## How to collect metrics in Prometheus format
|
||||
|
||||
Pass the path to `prometheus.yml` to `-promscrape.config` command-line flag. `vmagent` takes into account the following
|
||||
Specify the path to `prometheus.yml` file via `-promscrape.config` command-line flag. `vmagent` takes into account the following
|
||||
sections from [Prometheus config file](https://prometheus.io/docs/prometheus/latest/configuration/configuration/):
|
||||
|
||||
* `global`
|
||||
* `scrape_configs`
|
||||
|
||||
All the other sections are ignored, including [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section.
|
||||
Use `-remoteWrite.*` command-line flags instead for configuring remote write settings.
|
||||
All other sections are ignored, including the [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section.
|
||||
Use `-remoteWrite.*` command-line flags instead for configuring remote write settings.
|
||||
|
||||
The following scrape types in [scrape_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) section are supported:
|
||||
|
||||
* `static_configs` - for scraping statically defined targets. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config) for details.
|
||||
* `file_sd_configs` - for scraping targets defined in external files aka file-based service discover.
|
||||
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details.
|
||||
* `static_configs` - is for scraping statically defined targets. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config) for details.
|
||||
* `file_sd_configs` - is for scraping targets defined in external files (aka file-based service discovery).
|
||||
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details
|
||||
* `kubernetes_sd_configs` - for scraping targets in Kubernetes (k8s).
|
||||
See [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config) for details.
|
||||
* `ec2_sd_configs` - for scraping targets in Amazon EC2.
|
||||
* `ec2_sd_configs` - is for scraping targets in Amazon EC2.
|
||||
See [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) for details.
|
||||
`vmagent` doesn't support `profile` config param and aws credentials file yet.
|
||||
* `gce_sd_configs` - for scraping targets in Google Compute Engine (GCE).
|
||||
`vmagent` doesn't support the `profile` config param yet.
|
||||
* `gce_sd_configs` - is for scraping targets in Google Compute Engine (GCE).
|
||||
See [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) for details.
|
||||
`vmagent` provides the following additional functionality for `gce_sd_config`:
|
||||
* if `project` arg is missing, then `vmagent` uses the project for the instance where it runs;
|
||||
* if `zone` arg is missing, then `vmagent` uses the zone for the instance where it runs;
|
||||
* if `zone` arg equals to `"*"`, then `vmagent` discovers all the zones for the given project;
|
||||
* `zone` may contain arbitrary number of zones, i.e. `zone: [us-east1-a, us-east1-b]`.
|
||||
* `consul_sd_configs` - for scraping targets registered in Consul.
|
||||
* if `project` arg is missing then `vmagent` uses the project for the instance where it runs;
|
||||
* if `zone` arg is missing then `vmagent` uses the zone for the instance where it runs;
|
||||
* if `zone` arg is equal to `"*"`, then `vmagent` discovers all the zones for the given project;
|
||||
* `zone` may contain an arbitrary number of zones, i.e. `zone: [us-east1-a, us-east1-b]`.
|
||||
* `consul_sd_configs` - is for scraping the targets registered in Consul.
|
||||
See [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config) for details.
|
||||
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
|
||||
* `dns_sd_configs` - is for scraping targets discovered from DNS records (SRV, A and AAAA).
|
||||
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
|
||||
* `openstack_sd_configs` - for scraping OpenStack targets.
|
||||
* `openstack_sd_configs` - is for scraping OpenStack targets.
|
||||
See [openstack_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config) for details.
|
||||
[OpenStack identity API v3](https://docs.openstack.org/api-ref/identity/v3/) is supported only.
|
||||
* `dockerswarm_sd_configs` - for scraping Docker Swarm targets.
|
||||
* `docker_sd_configs` - is for scraping Docker targets.
|
||||
See [docker_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#docker_sd_config) for details.
|
||||
* `dockerswarm_sd_configs` - is for scraping Docker Swarm targets.
|
||||
See [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config) for details.
|
||||
* `eureka_sd_configs` - for scraping targets registered in [Netflix Eureka](https://github.com/Netflix/eureka).
|
||||
* `eureka_sd_configs` - is for scraping targets registered in [Netflix Eureka](https://github.com/Netflix/eureka).
|
||||
See [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config) for details.
|
||||
* `digitalocean_sd_configs` is for scraping targets registered in [DigitalOcean](https://www.digitalocean.com/)
|
||||
See [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config) for details.
|
||||
* `http_sd_configs` is for scraping targets registered in http service discovery.
|
||||
See [http_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config) for details.
|
||||
|
||||
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
|
||||
Please file feature requests to [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
|
||||
|
||||
`vmagent` also support the following additional options in `scrape_config` section:
|
||||
`vmagent` also supports the following additional options in the `scrape_configs` section:
|
||||
|
||||
* `disable_compression: true` - for disabling response compression on a per-job basis. By default `vmagent` requests compressed responses from scrape targets
|
||||
in order to save network bandwidth.
|
||||
* `disable_keepalive: true` - for disabling [HTTP keep-alive connections](https://en.wikipedia.org/wiki/HTTP_persistent_connection) on a per-job basis.
|
||||
By default `vmagent` uses keep-alive connections to scrape targets in order to reduce overhead on connection re-establishing.
|
||||
* `disable_compression: true` - to disable response compression on a per-job basis. By default `vmagent` requests compressed responses from scrape targets
|
||||
to save network bandwidth.
|
||||
* `disable_keepalive: true` - to disable [HTTP keep-alive connections](https://en.wikipedia.org/wiki/HTTP_persistent_connection) on a per-job basis.
|
||||
By default, `vmagent` uses keep-alive connections to scrape targets to reduce overhead on connection re-establishing.
|
||||
* `stream_parse: true` - for scraping targets in a streaming manner. This may be useful for targets exporting big number of metrics. See [these docs](#stream-parsing-mode).
|
||||
|
||||
Note that `vmagent` doesn't support `refresh_interval` option these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
|
||||
Note that `vmagent` doesn't support `refresh_interval` option for these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
|
||||
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
|
||||
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
|
||||
entries to 60s. Run `vmagent -help` in order to see default values for the `-promscrape.*CheckInterval` flags.
|
||||
|
||||
The file pointed by `-promscrape.config` may contain `%{ENV_VAR}` placeholders, which are substituted by the corresponding `ENV_VAR` environment variable values.
|
||||
The file pointed to by `-promscrape.config` may contain `%{ENV_VAR}` placeholders which are substituted by the corresponding `ENV_VAR` environment variable values.
|
||||
|
||||
|
||||
## Adding labels to metrics
|
||||
|
||||
Labels can be added to metrics via the following mechanisms:
|
||||
Labels can be added to metrics by the following mechanisms:
|
||||
|
||||
* Via `global -> external_labels` section in `-promscrape.config` file. These labels are added only to metrics scraped from targets configured in `-promscrape.config` file.
|
||||
* Via `-remoteWrite.label` command-line flag. These labels are added to all the collected metrics before sending them to `-remoteWrite.url`.
|
||||
* The `global -> external_labels` section in `-promscrape.config` file. These labels are added only to metrics scraped from targets configured in the `-promscrape.config` file. They aren't added to metrics collected via other [data ingestion protocols](https://docs.victoriametrics.com/#how-to-import-time-series-data).
|
||||
* The `-remoteWrite.label` command-line flag. These labels are added to all the collected metrics before sending them to `-remoteWrite.url`. For example, the following command will start `vmagent`, which will add `{datacenter="foobar"}` label to all the metrics pushed to all the configured remote storage systems (all the `-remoteWrite.url` flag values):
|
||||
|
||||
```
|
||||
/path/to/vmagent -remoteWrite.label=datacenter=foobar ...
|
||||
```
|
||||
|
||||
|
||||
## Relabeling
|
||||
|
||||
`vmagent` supports [Prometheus relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config).
|
||||
Additionally it provides the following extra actions:
|
||||
and also provides the following actions:
|
||||
|
||||
* `replace_all`: replaces all the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
|
||||
* `labelmap_all`: replaces all the occurences of `regex` in all the label names with the `replacement`.
|
||||
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
|
||||
* `replace_all`: replaces all of the occurrences of `regex` in the values of `source_labels` with the `replacement` and stores the results in the `target_label`.
|
||||
* `labelmap_all`: replaces all of the occurrences of `regex` in all the label names with the `replacement`.
|
||||
* `keep_if_equal`: keeps the entry if all the label values from `source_labels` are equal.
|
||||
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
|
||||
|
||||
The relabeling can be defined in the following places:
|
||||
|
||||
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels.
|
||||
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to all the scraped metrics in the given `scrape_config`.
|
||||
* At `-remoteWrite.relabelConfig` file. This relabeling is aplied to all the collected metrics before sending them to remote storage.
|
||||
* At `-remoteWrite.urlRelabelConfig` files. This relabeling is applied to metrics before sending them to the corresponding `-remoteWrite.url`.
|
||||
* At the `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels. This relabeling can be debugged by passing `relabel_debug: true` option to the corresponding `scrape_config` section. In this case `vmagent` logs target labels before and after the relabeling and then drops the logged target.
|
||||
* At the `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to all the scraped metrics in the given `scrape_config`. This relabeling can be debugged by passing `metric_relabel_debug: true` option to the corresponding `scrape_config` section. In this case `vmagent` logs metrics before and after the relabeling and then drops the logged metrics.
|
||||
* At the `-remoteWrite.relabelConfig` file. This relabeling is applied to all the collected metrics before sending them to remote storage. This relabeling can be debugged by passing `-remoteWrite.relabelDebug` command-line option to `vmagent`. In this case `vmagent` logs metrics before and after the relabeling and then drops all the logged metrics instead of sending them to remote storage.
|
||||
* At the `-remoteWrite.urlRelabelConfig` files. This relabeling is applied to metrics before sending them to the corresponding `-remoteWrite.url`. This relabeling can be debugged by passing `-remoteWrite.urlRelabelDebug` command-line options to `vmagent`. In this case `vmagent` logs metrics before and after the relabeling and then drops all the logged metrics instead of sending them to the corresponding `-remoteWrite.url`.
|
||||
|
||||
Read more about relabeling in the following articles:
|
||||
You can read more about relabeling in the following articles:
|
||||
|
||||
* [How to use Relabeling in Prometheus and VictoriaMetrics](https://valyala.medium.com/how-to-use-relabeling-in-prometheus-and-victoriametrics-8b90fc22c4b2)
|
||||
* [Life of a label](https://www.robustperception.io/life-of-a-label)
|
||||
@@ -228,60 +240,9 @@ Read more about relabeling in the following articles:
|
||||
* [relabel_configs vs metric_relabel_configs](https://www.robustperception.io/relabel_configs-vs-metric_relabel_configs)
|
||||
|
||||
|
||||
## Scraping big number of targets
|
||||
## Stream parsing mode
|
||||
|
||||
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
|
||||
In this case scrape targets can be split among multiple `vmagent` instances (aka `vmagent` clustering).
|
||||
Each `vmagent` instance in the cluster must use identical `-promscrape.config` files with distinct `-promscrape.cluster.memberNum` values.
|
||||
The flag value must be in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster.
|
||||
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag. For example, the following commands
|
||||
spread scrape targets among a cluster of two `vmagent` instances:
|
||||
|
||||
```
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
|
||||
```
|
||||
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. It is recommended setting up regular scraping of this page
|
||||
either via `vmagent` itself or via Prometheus, so the exported metrics could be analyzed later.
|
||||
Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/12683) for `vmagent` state overview.
|
||||
If you have suggestions, improvements or found a bug - feel free to open an issue on github or add review to the dashboard.
|
||||
|
||||
`vmagent` also exports target statuses at the following handlers:
|
||||
|
||||
* `http://vmagent-host:8429/targets`. This handler returns human-readable plaintext status for every active target.
|
||||
This page is convenient to query from command line with `wget`, `curl` or similar tools.
|
||||
It accepts optional `show_original_labels=1` query arg, which shows the original labels per each target before applying relabeling.
|
||||
This information may be useful for debugging target relabeling.
|
||||
* `http://vmagent-host:8429/api/v1/targets`. This handler returns data compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
|
||||
|
||||
* `http://vmagent-host:8429/ready`. This handler returns http 200 status code when `vmagent` finishes initialization for all service_discovery configs.
|
||||
It may be useful for performing `vmagent` rolling update without scrape loss.
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
* It is recommended [setting up the official Grafana dashboard](#monitoring) in order to monitor `vmagent` state.
|
||||
|
||||
* It is recommended increasing the maximum number of open files in the system (`ulimit -n`) when scraping big number of targets,
|
||||
since `vmagent` establishes at least a single TCP connection per each target.
|
||||
|
||||
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
|
||||
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`
|
||||
and `http://vmagent-host:8429/api/v1/targets`.
|
||||
|
||||
* The `/api/v1/targets` page could be useful for debugging relabeling process for scrape targets.
|
||||
This page contains original labels for targets dropped during relabeling (see "droppedTargets" section in the page output). By default up to `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling, then increase `-promscrape.maxDroppedTargets` command-line flag value in order to see all the dropped targets. Note that tracking each dropped target requires up to 10Kb of RAM, so big values for `-promscrape.maxDroppedTargets` may result in increased memory usage if big number of scrape targets are dropped during relabeling.
|
||||
|
||||
* If `vmagent` scrapes big number of targets, then `-promscrape.dropOriginalLabels` command-line option may be passed to `vmagent` in order to reduce memory usage.
|
||||
This option drops `"discoveredLabels"` and `"droppedTargets"` lists at `/api/v1/targets` page, which may result in reduced debuggability for improperly configured per-target relabeling.
|
||||
|
||||
* If `vmagent` scrapes targets with millions of metrics per each target (for instance, when scraping [federation endpoints](https://prometheus.io/docs/prometheus/latest/federation/)),
|
||||
then it is recommended enabling `stream parsing mode` in order to reduce memory usage during scraping. This mode may be enabled either globally for all the scrape targets
|
||||
by passing `-promscrape.streamParse` command-line flag or on a per-scrape target basis with `stream_parse: true` option. For example:
|
||||
By default `vmagent` reads the full response from scrape target into memory, then parses it, applies [relabeling](#relabeling) and then pushes the resulting metrics to the configured `-remoteWrite.url`. This mode works well for the majority of cases when the scrape target exposes a small number of metrics (e.g. less than 10 thousand). But this mode may take large amounts of memory when the scrape target exposes a large number of metrics. In this case it is recommended to enable stream parsing mode. When this mode is enabled, then `vmagent` reads response from scrape target in chunks, then immediately processes every chunk and pushes the processed metrics to remote storage. This allows saving memory when scraping targets that expose millions of metrics. Stream parsing mode may be enabled either globally for all of the scrape targets by passing `-promscrape.streamParse` command-line flag or on a per-scrape target basis with `stream_parse: true` option. For example:
|
||||
|
||||
```yml
|
||||
scrape_configs:
|
||||
@@ -297,43 +258,176 @@ It may be useful for performing `vmagent` rolling update without scrape loss.
|
||||
'match[]': ['{__name__!=""}']
|
||||
```
|
||||
|
||||
Note that `sample_limit` option doesn't work if stream parsing is enabled, since the parsed data is pushed to remote storage as soon as it is parsed. So `sample_limit` option
|
||||
has no sense during stream parsing.
|
||||
Note that `sample_limit` option doesn't prevent from data push to remote storage if stream parsing is enabled because the parsed data is pushed to remote storage as soon as it is parsed.
|
||||
|
||||
* It is recommended to increase `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
|
||||
|
||||
* If you see gaps on the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, then try increasing `-remoteWrite.queues`.
|
||||
Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage, so it starts dropping the buffered data
|
||||
## Scraping big number of targets
|
||||
|
||||
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
|
||||
In this case scrape targets can be split among multiple `vmagent` instances (aka `vmagent` horizontal scaling, sharding and clustering).
|
||||
Each `vmagent` instance in the cluster must use identical `-promscrape.config` files with distinct `-promscrape.cluster.memberNum` values.
|
||||
The flag value must be in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster.
|
||||
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag. For example, the following commands
|
||||
spread scrape targets among a cluster of two `vmagent` instances:
|
||||
|
||||
```
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
|
||||
```
|
||||
|
||||
By default each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances,
|
||||
then `-promscrape.cluster.replicationFactor` command-line flag must be set to the desired number of replicas. For example, the following commands
|
||||
start a cluster of three `vmagent` instances, where each target is scraped by two `vmagent` instances:
|
||||
|
||||
```
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
|
||||
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=2 -promscrape.config=/path/to/config.yml ...
|
||||
```
|
||||
|
||||
If each target is scraped by multiple `vmagent` instances, then data deduplication must be enabled at remote storage pointed by `-remoteWrite.url`.
|
||||
See [these docs](https://docs.victoriametrics.com/#deduplication) for details.
|
||||
|
||||
|
||||
## Scraping targets via a proxy
|
||||
|
||||
`vmagent` supports scraping targets via http, https and socks5 proxies. Proxy address must be specified in `proxy_url` option. For example, the following scrape config instructs
|
||||
target scraping via https proxy at `https://proxy-addr:1234`:
|
||||
|
||||
```yml
|
||||
scrape_configs:
|
||||
- job_name: foo
|
||||
proxy_url: https://proxy-addr:1234
|
||||
```
|
||||
|
||||
Proxy can be configured with the following optional settings:
|
||||
|
||||
* `proxy_authorization` for generic token authorization. See [Prometheus docs for details on authorization section](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config)
|
||||
* `proxy_bearer_token` and `proxy_bearer_token_file` for Bearer token authorization
|
||||
* `proxy_basic_auth` for Basic authorization. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config).
|
||||
* `proxy_tls_config` for TLS config. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tls_config).
|
||||
|
||||
For example:
|
||||
|
||||
```yml
|
||||
scrape_configs:
|
||||
- job_name: foo
|
||||
proxy_url: https://proxy-addr:1234
|
||||
proxy_basic_auth:
|
||||
username: foobar
|
||||
password: secret
|
||||
proxy_tls_config:
|
||||
insecure_skip_verify: true
|
||||
cert_file: /path/to/cert
|
||||
key_file: /path/to/key
|
||||
ca_file: /path/to/ca
|
||||
server_name: real-server-name
|
||||
```
|
||||
|
||||
## Cardinality limiter
|
||||
|
||||
By default `vmagent` doesn't limit the number of time series written to remote storage systems specified at `-remoteWrite.url`. The limit can be enforced by setting the following command-line flags:
|
||||
|
||||
* `-remoteWrite.maxHourlySeries` - limits the number of unique time series `vmagent` can write to remote storage systems during the last hour. Useful for limiting the number of active time series.
|
||||
* `-remoteWrite.maxDailySeries` - limits the number of unique time series `vmagent` can write to remote storage systems during the last day. Useful for limiting daily churn rate.
|
||||
|
||||
Both limits can be set simultaneously. If any of these limits is reached, then samples for new time series are dropped instead of sending them to remote storage systems. A sample of dropped series is put in the log with `WARNING` level.
|
||||
|
||||
The exceeded limits can be [monitored](#monitoring) with the following metrics:
|
||||
|
||||
* `vmagent_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series.
|
||||
* `vmagent_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series.
|
||||
|
||||
These limits are approximate, so `vmagent` can underflow/overflow the limit by a small percentage (usually less than 1%).
|
||||
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. We recommend setting up regular scraping of this page
|
||||
either through `vmagent` itself or by Prometheus so that the exported metrics may be analyzed later.
|
||||
Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/12683) for `vmagent` state overview.
|
||||
If you have suggestions for improvements or have found a bug - please open an issue on github or add a review to the dashboard.
|
||||
|
||||
`vmagent` also exports the status for various targets at the following handlers:
|
||||
|
||||
* `http://vmagent-host:8429/targets`. This handler returns human-readable status for every active target.
|
||||
This page is easy to query from the command line with `wget`, `curl` or similar tools.
|
||||
It accepts optional `show_original_labels=1` query arg which shows the original labels per each target before applying the relabeling.
|
||||
This information may be useful for debugging target relabeling.
|
||||
* `http://vmagent-host:8429/api/v1/targets`. This handler returns data compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
|
||||
|
||||
* `http://vmagent-host:8429/ready`. This handler returns http 200 status code when `vmagent` finishes its initialization for all service_discovery configs.
|
||||
It may be useful to perform `vmagent` rolling update without any scrape loss.
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
* We recommend you [set up the official Grafana dashboard](#monitoring) in order to monitor the state of `vmagent`.
|
||||
|
||||
* We recommend you increase the maximum number of open files in the system (`ulimit -n`) when scraping a big number of targets,
|
||||
as `vmagent` establishes at least a single TCP connection per target.
|
||||
|
||||
* If `vmagent` uses too big amounts of memory, then the following options can help:
|
||||
* Enabling stream parsing. See [these docs](#stream-parsing-mode).
|
||||
* Reducing the number of output queues with `-remoteWrite.queues` command-line option.
|
||||
* Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line option. Another option is to reduce memory limits in Docker and/or Kubernetes if `vmagent` runs under these systems.
|
||||
* Reducing the number of CPU cores vmagent can use by passing `GOMAXPROCS=N` environment variable to `vmagent`, where `N` is the desired limit on CPU cores. Another option is to reduce CPU limits in Docker or Kubernetes if `vmagent` runs under these systems.
|
||||
|
||||
* When `vmagent` scrapes many unreliable targets, it can flood the error log with scrape errors. These errors can be suppressed
|
||||
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`
|
||||
and `http://vmagent-host:8429/api/v1/targets`.
|
||||
|
||||
* The `/api/v1/targets` page could be useful for debugging relabeling process for scrape targets.
|
||||
This page contains original labels for targets dropped during relabeling (see "droppedTargets" section in the page output). By default the `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling, then increase `-promscrape.maxDroppedTargets` command-line flag value to see all the dropped targets. Note that tracking each dropped target requires up to 10Kb of RAM. Therefore big values for `-promscrape.maxDroppedTargets` may result in increased memory usage if a big number of scrape targets are dropped during relabeling.
|
||||
|
||||
* If `vmagent` scrapes a big number of targets then the `-promscrape.dropOriginalLabels` command-line option may be passed to `vmagent` in order to reduce memory usage.
|
||||
This option drops `"discoveredLabels"` and `"droppedTargets"` lists at `/api/v1/targets` page, which may result in reduced debuggability for improperly configured per-target relabeling.
|
||||
|
||||
* If `vmagent` scrapes targets with millions of metrics per target (for example, when scraping [federation endpoints](https://prometheus.io/docs/prometheus/latest/federation/)),
|
||||
we recommend enabling [stream parsing mode](#stream-parsing-mode) in order to reduce memory usage during scraping.
|
||||
|
||||
* We recommend you increase `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page grows constantly.
|
||||
|
||||
* If you see gaps in the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, try increasing `-remoteWrite.queues`.
|
||||
Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage. Therefore it starts dropping the buffered data
|
||||
if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`.
|
||||
|
||||
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
|
||||
* `vmagent` drops data blocks if remote storage replies with `400 Bad Request` and `409 Conflict` HTTP responses. The number of dropped blocks can be monitored via `vmagent_remotewrite_packets_dropped_total` metric exported at [/metrics page](#monitoring).
|
||||
|
||||
* Use `-remoteWrite.queues=1` when `-remoteWrite.url` points to remote storage, which doesn't accept out-of-order samples (aka data backfilling). Such storage systems include Prometheus, Cortex and Thanos.
|
||||
|
||||
* `vmagent` buffers scraped data at the `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
|
||||
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
|
||||
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
|
||||
If you don't want to send all the data from the directory to remote storage then simply stop `vmagent` and delete the directory.
|
||||
|
||||
* By default `vmagent` masks `-remoteWrite.url` with `secret-url` values in logs and at `/metrics` page because
|
||||
the url may contain sensitive information such as auth tokens or passwords.
|
||||
Pass `-remoteWrite.showURL` command-line flag when starting `vmagent` in order to see all the valid urls.
|
||||
|
||||
* If scrapes must be aligned in time (for instance, if they must be performed at the beginning of every hour), then set `scrape_align_interval` option
|
||||
in the corresponding scrape config. For example, the following config aligns hourly scrapes to the nearest 10 minutes:
|
||||
* By default `vmagent` evenly spreads scrape load in time. If a particular scrape target must be scraped at the beginning of some interval,
|
||||
then `scrape_align_interval` option must be used. For example, the following config aligns hourly scrapes to the beginning of hour:
|
||||
|
||||
```yml
|
||||
scrape_configs:
|
||||
- job_name: foo
|
||||
scrape_interval: 1h
|
||||
scrape_align_interval: 10m
|
||||
scrape_align_interval: 1h
|
||||
```
|
||||
|
||||
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports
|
||||
or they use init container. These errors can be either fixed or suppressed with `-promscrape.suppressDuplicateScrapeTargetErrors` command-line flag.
|
||||
See available options below if you prefer fixing the root cause of the error:
|
||||
* By default `vmagent` evenly spreads scrape load in time. If a particular scrape target must be scraped at specific offset, then `scrape_offset` option must be used.
|
||||
For example, the following config instructs `vmagent` to scrape the target at 10 seconds of every minute:
|
||||
|
||||
The following `relabel_configs` section may help determining `__meta_*` labels resulting in duplicate targets:
|
||||
```yml
|
||||
- action: labelmap
|
||||
regex: __meta_(.*)
|
||||
scrape_configs:
|
||||
- job_name: foo
|
||||
scrape_interval: 1m
|
||||
scrape_offset: 10s
|
||||
```
|
||||
|
||||
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen to multiple ports
|
||||
or they use an init container. These errors can either be fixed or suppressed with the `-promscrape.suppressDuplicateScrapeTargetErrors` command-line flag.
|
||||
See the available options below if you prefer fixing the root cause of the error:
|
||||
|
||||
The following relabeling rule may be added to `relabel_configs` section in order to filter out pods with unneeded ports:
|
||||
```yml
|
||||
- action: keep_if_equal
|
||||
@@ -350,25 +444,25 @@ It may be useful for performing `vmagent` rolling update without scrape loss.
|
||||
|
||||
## How to build from sources
|
||||
|
||||
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in `vmutils-*` archives there.
|
||||
We recommend using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in the `vmutils-*` archives.
|
||||
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmagent` from the root folder of the repository.
|
||||
It builds `vmagent` binary and puts it into the `bin` folder.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make vmagent` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds the `vmagent` binary and puts it into the `bin` folder.
|
||||
|
||||
### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmagent-prod` from the root folder of the repository.
|
||||
2. Run `make vmagent-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmagent-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
### Building docker images
|
||||
|
||||
Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker image locally.
|
||||
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
|
||||
`<PKG_TAG>` is an auto-generated image tag, which depends on source code in [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
|
||||
|
||||
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
|
||||
@@ -384,14 +478,14 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmagent-arm` or `make vmagent-arm64` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make vmagent-arm` or `make vmagent-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmagent-arm` or `vmagent-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
### Production ARM build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmagent-arm-prod` or `make vmagent-arm64-prod` from the root folder of the repository.
|
||||
2. Run `make vmagent-arm-prod` or `make vmagent-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmagent-arm-prod` or `vmagent-arm64-prod` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
|
||||
@@ -399,13 +493,13 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
`vmagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
|
||||
|
||||
* Memory profile. It can be collected with the following command:
|
||||
* Memory profile can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmagent-host>:8429/debug/pprof/heap > mem.pprof
|
||||
```
|
||||
|
||||
* CPU profile. It can be collected with the following command:
|
||||
* CPU profile can be collected with the following command:
|
||||
|
||||
```bash
|
||||
curl -s http://<vmagent-host>:8429/debug/pprof/profile > cpu.pprof
|
||||
@@ -423,16 +517,16 @@ The collected profiles may be analyzed with [go tool pprof](https://github.com/g
|
||||
```
|
||||
./vmagent -help
|
||||
|
||||
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
|
||||
vmagent collects metrics data via popular data ingestion protocols and routes them to VictoriaMetrics.
|
||||
|
||||
See the docs at https://victoriametrics.github.io/vmagent.html .
|
||||
See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
|
||||
-csvTrimTimestamp duration
|
||||
Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
-dryRun
|
||||
Whether to check only config files without running vmagent. The following files are checked: -promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP are used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
|
||||
-envflag.prefix string
|
||||
@@ -444,30 +538,33 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
|
||||
-graphiteTrimTimestamp duration
|
||||
Trim timestamps for Graphite data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this dealy the servier returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
TCP address to listen for http connections. Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. Note that /targets and /metrics pages aren't available if -httpListenAddr='' (default ":8429")
|
||||
-import.maxLineLen max_rows_per_line
|
||||
The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with max_rows_per_line query arg passed to /api/v1/export
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 104857600)
|
||||
-influx.maxLineSize value
|
||||
-import.maxLineLen size
|
||||
The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with 'max_rows_per_line' query arg passed to /api/v1/export
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 104857600)
|
||||
-influx.databaseNames array
|
||||
Comma-separated list of database names to return from /query and /influx/query API. This can be needed for accepting data from Telegraf plugins such as https://github.com/fangli/fluent-plugin-influxdb
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-influx.maxLineSize size
|
||||
The maximum size in bytes for a single Influx line during parsing
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 262144)
|
||||
-influxListenAddr http://<vmagent>:8429/write
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 262144)
|
||||
-influxListenAddr string
|
||||
TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. This flag isn't needed when ingesting data over HTTP - just send it to http://<vmagent>:8429/write
|
||||
-influxMeasurementFieldSeparator string
|
||||
Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol (default "_")
|
||||
@@ -482,7 +579,7 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
@@ -492,17 +589,17 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxConcurrentInserts int
|
||||
The maximum number of concurrent inserts. Default value should work for most cases, since it minimizes the overhead for concurrent inserts. This option is tightly coupled with -insert.maxQueueDuration (default 16)
|
||||
-maxInsertRequestSize value
|
||||
-maxInsertRequestSize size
|
||||
The maximum size in bytes of a single Prometheus remote_write API request
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
-memory.allowedBytes value
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-opentsdbHTTPListenAddr string
|
||||
@@ -511,9 +608,9 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
|
||||
TCP and UDP address to listen for OpenTSDB metrics. Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. Usually :4242 must be set. Doesn't work if empty
|
||||
-opentsdbTrimTimestamp duration
|
||||
Trim timestamps for OpenTSDB 'telnet put' data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
|
||||
-opentsdbhttp.maxInsertRequestSize value
|
||||
-opentsdbhttp.maxInsertRequestSize size
|
||||
The maximum size of OpenTSDB HTTP put request
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
|
||||
-opentsdbhttpTrimTimestamp duration
|
||||
Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
|
||||
-pprofAuthKey string
|
||||
@@ -522,85 +619,120 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
|
||||
The number of the member in the cluster of scrapers. It must be a unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster
|
||||
-promscrape.cluster.membersCount int
|
||||
The number of members in a cluster of scrapers. Each member must have a unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets
|
||||
-promscrape.cluster.replicationFactor int
|
||||
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
|
||||
-promscrape.config string
|
||||
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. See https://victoriametrics.github.io/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
|
||||
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
|
||||
-promscrape.config.dryRun
|
||||
Checks -promscrape.config file for errors and unsupported fields and then exits. Returns non-zero exit code on parsing errors and emits these errors to stderr. See also -promscrape.config.strictParse command-line flag. Pass -loggerLevel=ERROR if you don't need to see info messages in the output.
|
||||
-promscrape.config.strictParse
|
||||
Whether to allow only supported fields in -promscrape.config . By default unsupported fields are silently skipped
|
||||
-promscrape.configCheckInterval duration
|
||||
Interval for checking for changes in '-promscrape.config' file. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
|
||||
-promscrape.consulSDCheckInterval consul_sd_configs
|
||||
-promscrape.consul.waitTime duration
|
||||
Wait time used by Consul service discovery. Default value is used if not set
|
||||
-promscrape.consulSDCheckInterval duration
|
||||
Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s)
|
||||
-promscrape.digitaloceanSDCheckInterval duration
|
||||
Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s)
|
||||
-promscrape.disableCompression
|
||||
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
|
||||
-promscrape.disableKeepAlive disable_keepalive: true
|
||||
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set disable_keepalive: true individually per each 'scrape_config` section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
|
||||
-promscrape.disableKeepAlive
|
||||
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets have no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
|
||||
-promscrape.discovery.concurrency int
|
||||
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
|
||||
-promscrape.discovery.concurrentWaitTime duration
|
||||
The maximum duration for waiting to perform API requests if more than -promscrape.discovery.concurrency requests are simultaneously performed (default 1m0s)
|
||||
-promscrape.dnsSDCheckInterval dns_sd_configs
|
||||
-promscrape.dnsSDCheckInterval duration
|
||||
Interval for checking for changes in dns. This works only if dns_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config for details (default 30s)
|
||||
-promscrape.dockerswarmSDCheckInterval dockerswarm_sd_configs
|
||||
-promscrape.dockerswarmSDCheckInterval duration
|
||||
Interval for checking for changes in dockerswarm. This works only if dockerswarm_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config for details (default 30s)
|
||||
-promscrape.dropOriginalLabels
|
||||
Whether to drop original labels for scrape targets at /targets and /api/v1/targets pages. This may be needed for reducing memory usage when original labels for big number of scrape targets occupy big amounts of memory. Note that this reduces debuggability for improper per-target relabeling configs
|
||||
-promscrape.ec2SDCheckInterval ec2_sd_configs
|
||||
-promscrape.ec2SDCheckInterval duration
|
||||
Interval for checking for changes in ec2. This works only if ec2_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config for details (default 1m0s)
|
||||
-promscrape.eurekaSDCheckInterval eureka_sd_configs
|
||||
-promscrape.eurekaSDCheckInterval duration
|
||||
Interval for checking for changes in eureka. This works only if eureka_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config for details (default 30s)
|
||||
-promscrape.fileSDCheckInterval duration
|
||||
Interval for checking for changes in 'file_sd_config'. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config for details (default 30s)
|
||||
-promscrape.gceSDCheckInterval gce_sd_configs
|
||||
-promscrape.gceSDCheckInterval duration
|
||||
Interval for checking for changes in gce. This works only if gce_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config for details (default 1m0s)
|
||||
-promscrape.httpSDCheckInterval duration
|
||||
Interval for checking for changes in http service discovery. This works only if http_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config for details (default 1m0s)
|
||||
-promscrape.kubernetes.apiServerTimeout duration
|
||||
How frequently to reload the full state from Kubernetes API server (default 10m0s)
|
||||
-promscrape.kubernetesSDCheckInterval kubernetes_sd_configs
|
||||
How frequently to reload the full state from Kubernetes API server (default 30m0s)
|
||||
-promscrape.kubernetesSDCheckInterval duration
|
||||
Interval for checking for changes in Kubernetes API server. This works only if kubernetes_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config for details (default 30s)
|
||||
-promscrape.maxDroppedTargets droppedTargets
|
||||
The maximum number of droppedTargets shown at /api/v1/targets page. Increase this value if your setup drops more scrape targets during relabeling and you need investigating labels for all the dropped targets. Note that the increased number of tracked dropped targets may result in increased memory usage (default 1000)
|
||||
-promscrape.maxScrapeSize value
|
||||
-promscrape.maxDroppedTargets int
|
||||
The maximum number of droppedTargets to show at /api/v1/targets page. Increase this value if your setup drops more scrape targets during relabeling and you need to investigate labels for all the dropped targets. Note that the increased number of tracked dropped targets may result in increased memory usage (default 1000)
|
||||
-promscrape.maxScrapeSize size
|
||||
The maximum size of scrape response in bytes to process from Prometheus targets. Bigger responses are rejected
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 16777216)
|
||||
-promscrape.openstackSDCheckInterval openstack_sd_configs
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16777216)
|
||||
-promscrape.openstackSDCheckInterval duration
|
||||
Interval for checking for changes in openstack API server. This works only if openstack_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config for details (default 30s)
|
||||
-promscrape.streamParse stream_parse: true
|
||||
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is posible to set stream_parse: true individually per each `scrape_config` section in `-promscrape.config` for fine grained control
|
||||
-promscrape.suppressDuplicateScrapeTargetErrors duplicate scrape target
|
||||
Whether to suppress duplicate scrape target errors; see https://victoriametrics.github.io/vmagent.html#troubleshooting for details
|
||||
-promscrape.streamParse
|
||||
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is possible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
|
||||
-promscrape.suppressDuplicateScrapeTargetErrors
|
||||
Whether to suppress 'duplicate scrape target' errors; see https://docs.victoriametrics.com/vmagent.html#troubleshooting for details
|
||||
-promscrape.suppressScrapeErrors
|
||||
Whether to suppress scrape errors logging. The last error for each target is always available at '/targets' page even if scrape errors logging is suppressed
|
||||
-remoteWrite.basicAuth.password array
|
||||
Optional basic auth password to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.basicAuth.passwordFile array
|
||||
Optional path to basic auth password to use for -remoteWrite.url. The file is re-read every second. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.basicAuth.username array
|
||||
Optional basic auth username to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.bearerToken array
|
||||
Optional bearer auth token to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.bearerTokenFile array
|
||||
Optional path to bearer token file to use for -remoteWrite.url. The token is re-read from the file every second. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.flushInterval duration
|
||||
Interval for flushing the data to remote storage. This option takes effect only when less than 10K data points per second are pushed to -remoteWrite.url (default 1s)
|
||||
-remoteWrite.label array
|
||||
Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.maxBlockSize value
|
||||
Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. Pass multiple -remoteWrite.label flags in order to add multiple labels to metrics before sending them to remote storage
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.maxBlockSize size
|
||||
The maximum size in bytes of unpacked request to send to remote storage. It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 8388608)
|
||||
-remoteWrite.maxDiskUsagePerURL value
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 8388608)
|
||||
-remoteWrite.maxDailySeries int
|
||||
The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -remoteWrite.maxHourlySeries
|
||||
-remoteWrite.maxDiskUsagePerURL size
|
||||
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500000000. Disk usage is unlimited if the value is set to 0
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-remoteWrite.maxHourlySeries int
|
||||
The maximum number of unique series vmagent can send to remote storage systems during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -remoteWrite.maxDailySeries
|
||||
-remoteWrite.oauth2.clientID array
|
||||
Optional OAuth2 clientID to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.clientSecret array
|
||||
Optional OAuth2 clientSecret to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.clientSecretFile array
|
||||
Optional OAuth2 clientSecretFile to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.scopes array
|
||||
Optional OAuth2 scopes to use for -remoteWrite.url. Scopes must be delimited by ';'. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.oauth2.tokenUrl array
|
||||
Optional OAuth2 tokenURL to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.proxyURL array
|
||||
Optional proxy URL for writing data to -remoteWrite.url. Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.queues int
|
||||
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage (default 4)
|
||||
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage (default 2 * numberOfAvailableCPUs)
|
||||
-remoteWrite.rateLimit array
|
||||
Optional rate limit in bytes per second for data sent to -remoteWrite.url. By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data are sent after temporary unavailability of the remote storage
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.relabelConfig string
|
||||
Optional path to file with relabel_config entries. These entries are applied to all the metrics before sending them to -remoteWrite.url. See https://victoriametrics.github.io/vmagent.html#relabeling for details
|
||||
Optional path to file with relabel_config entries. These entries are applied to all the metrics before sending them to -remoteWrite.url. See https://docs.victoriametrics.com/vmagent.html#relabeling for details
|
||||
-remoteWrite.relabelDebug
|
||||
Whether to log metrics before and after relabeling with -remoteWrite.relabelConfig. If the -remoteWrite.relabelDebug is enabled, then the metrics aren't sent to remote storage. This is useful for debugging the relabeling configs
|
||||
-remoteWrite.roundDigits array
|
||||
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
@@ -614,31 +746,36 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsCAFile array
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsCertFile array
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsInsecureSkipVerify array
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsKeyFile array
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tlsServerName array
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.tmpDataPath string
|
||||
Path to directory where temporary data for remote write component is stored (default "vmagent-remotewrite-data")
|
||||
Path to directory where temporary data for remote write component is stored. See also -remoteWrite.maxDiskUsagePerURL (default "vmagent-remotewrite-data")
|
||||
-remoteWrite.url array
|
||||
Remote storage URL to write data to. It must support Prometheus remote_write API. It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.urlRelabelConfig array
|
||||
Optional path to relabel config for the corresponding -remoteWrite.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-remoteWrite.urlRelabelDebug array
|
||||
Whether to log metrics before and after relabeling with -remoteWrite.urlRelabelConfig. If the -remoteWrite.urlRelabelDebug is enabled, then the metrics aren't sent to the corresponding -remoteWrite.url. This is useful for debugging the relabeling configs
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-sortLabels
|
||||
Whether to sort labels for incoming samples before writing them to all the configured remote storage systems. This may be needed for reducing memory usage at remote storage when the order of labels in incoming samples is random. For example, m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
-version
|
||||
|
||||
@@ -96,7 +96,8 @@ func insertRows(db string, rows []parser.Row, extraLabels []prompbmarshal.Label)
|
||||
if !*skipMeasurement {
|
||||
ctx.metricGroupBuf = append(ctx.metricGroupBuf, r.Measurement...)
|
||||
}
|
||||
skipFieldKey := len(r.Fields) == 1 && *skipSingleField
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1139
|
||||
skipFieldKey := len(r.Measurement) > 0 && len(r.Fields) == 1 && *skipSingleField
|
||||
if len(ctx.metricGroupBuf) > 0 && !skipFieldKey {
|
||||
ctx.metricGroupBuf = append(ctx.metricGroupBuf, *measurementFieldSeparator...)
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/influxutils"
|
||||
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
|
||||
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
|
||||
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
|
||||
@@ -40,7 +41,7 @@ var (
|
||||
"Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. "+
|
||||
"Note that /targets and /metrics pages aren't available if -httpListenAddr=''")
|
||||
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. "+
|
||||
"This flag isn't needed when ingesting data over HTTP - just send it to `http://<vmagent>:8429/write`")
|
||||
"This flag isn't needed when ingesting data over HTTP - just send it to http://<vmagent>:8429/write")
|
||||
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
|
||||
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpentTSDB metrics. "+
|
||||
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
|
||||
@@ -144,7 +145,18 @@ func main() {
|
||||
|
||||
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
if r.URL.Path == "/" {
|
||||
fmt.Fprintf(w, "vmagent - see docs at https://victoriametrics.github.io/vmagent.html")
|
||||
if r.Method != "GET" {
|
||||
return false
|
||||
}
|
||||
fmt.Fprintf(w, "<h2>vmagent</h2>")
|
||||
fmt.Fprintf(w, "See docs at <a href='https://docs.victoriametrics.com/vmagent.html'>https://docs.victoriametrics.com/vmagent.html</a></br>")
|
||||
fmt.Fprintf(w, "Useful endpoints:</br>")
|
||||
httpserver.WriteAPIHelp(w, [][2]string{
|
||||
{"/targets", "discovered targets list"},
|
||||
{"/api/v1/targets", "advanced information about discovered targets in JSON format"},
|
||||
{"/metrics", "available service metrics"},
|
||||
{"/-/reload", "reload configuration"},
|
||||
})
|
||||
return true
|
||||
}
|
||||
path := strings.Replace(r.URL.Path, "//", "/", -1)
|
||||
@@ -153,7 +165,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
prometheusWriteRequests.Inc()
|
||||
if err := promremotewrite.InsertHandler(r); err != nil {
|
||||
prometheusWriteErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -162,7 +174,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
vmimportRequests.Inc()
|
||||
if err := vmimport.InsertHandler(r); err != nil {
|
||||
vmimportErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -171,7 +183,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
csvimportRequests.Inc()
|
||||
if err := csvimport.InsertHandler(r); err != nil {
|
||||
csvimportErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -180,7 +192,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
prometheusimportRequests.Inc()
|
||||
if err := prometheusimport.InsertHandler(r); err != nil {
|
||||
prometheusimportErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -189,7 +201,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
nativeimportRequests.Inc()
|
||||
if err := native.InsertHandler(r); err != nil {
|
||||
nativeimportErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -198,16 +210,14 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
influxWriteRequests.Inc()
|
||||
if err := influx.InsertHandlerForHTTP(r); err != nil {
|
||||
influxWriteErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
return true
|
||||
case "/query":
|
||||
// Emulate fake response for influx query.
|
||||
// This is required for TSBS benchmark.
|
||||
influxQueryRequests.Inc()
|
||||
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
|
||||
influxutils.WriteDatabaseNames(w)
|
||||
return true
|
||||
case "/targets":
|
||||
promscrapeTargetsRequests.Inc()
|
||||
@@ -269,7 +279,7 @@ func usage() {
|
||||
const s = `
|
||||
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
|
||||
|
||||
See the docs at https://victoriametrics.github.io/vmagent.html .
|
||||
See the docs at https://docs.victoriametrics.com/vmagent.html .
|
||||
`
|
||||
flagutil.Usage(s)
|
||||
}
|
||||
|
||||
@@ -2,8 +2,6 @@ package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
@@ -42,17 +40,35 @@ var (
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
basicAuthPassword = flagutil.NewArray("remoteWrite.basicAuth.password", "Optional basic auth password to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
basicAuthPasswordFile = flagutil.NewArray("remoteWrite.basicAuth.passwordFile", "Optional path to basic auth password to use for -remoteWrite.url. "+
|
||||
"The file is re-read every second. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
bearerToken = flagutil.NewArray("remoteWrite.bearerToken", "Optional bearer auth token to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
bearerTokenFile = flagutil.NewArray("remoteWrite.bearerTokenFile", "Optional path to bearer token file to use for -remoteWrite.url. "+
|
||||
"The token is re-read from the file every second. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
|
||||
oauth2ClientID = flagutil.NewArray("remoteWrite.oauth2.clientID", "Optional OAuth2 clientID to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
oauth2ClientSecret = flagutil.NewArray("remoteWrite.oauth2.clientSecret", "Optional OAuth2 clientSecret to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
oauth2ClientSecretFile = flagutil.NewArray("remoteWrite.oauth2.clientSecretFile", "Optional OAuth2 clientSecretFile to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
oauth2TokenURL = flagutil.NewArray("remoteWrite.oauth2.tokenUrl", "Optional OAuth2 tokenURL to use for -remoteWrite.url. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
oauth2Scopes = flagutil.NewArray("remoteWrite.oauth2.scopes", "Optional OAuth2 scopes to use for -remoteWrite.url. Scopes must be delimited by ';'. "+
|
||||
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
|
||||
)
|
||||
|
||||
type client struct {
|
||||
sanitizedURL string
|
||||
remoteWriteURL string
|
||||
authHeader string
|
||||
fq *persistentqueue.FastQueue
|
||||
hc *http.Client
|
||||
|
||||
authCfg *promauth.Config
|
||||
|
||||
rl rateLimiter
|
||||
|
||||
bytesSent *metrics.Counter
|
||||
@@ -68,10 +84,11 @@ type client struct {
|
||||
}
|
||||
|
||||
func newClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqueue.FastQueue, concurrency int) *client {
|
||||
tlsCfg, err := getTLSConfig(argIdx)
|
||||
authCfg, err := getAuthConfig(argIdx)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
|
||||
logger.Panicf("FATAL: cannot initialize auth config: %s", err)
|
||||
}
|
||||
tlsCfg := authCfg.NewTLSConfig()
|
||||
tr := &http.Transport{
|
||||
Dial: statDial,
|
||||
TLSClientConfig: tlsCfg,
|
||||
@@ -92,26 +109,10 @@ func newClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqu
|
||||
}
|
||||
tr.Proxy = http.ProxyURL(urlProxy)
|
||||
}
|
||||
authHeader := ""
|
||||
username := basicAuthUsername.GetOptionalArg(argIdx)
|
||||
password := basicAuthPassword.GetOptionalArg(argIdx)
|
||||
if len(username) > 0 || len(password) > 0 {
|
||||
// See https://en.wikipedia.org/wiki/Basic_access_authentication
|
||||
token := username + ":" + password
|
||||
token64 := base64.StdEncoding.EncodeToString([]byte(token))
|
||||
authHeader = "Basic " + token64
|
||||
}
|
||||
token := bearerToken.GetOptionalArg(argIdx)
|
||||
if len(token) > 0 {
|
||||
if authHeader != "" {
|
||||
logger.Fatalf("`-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", token)
|
||||
}
|
||||
authHeader = "Bearer " + token
|
||||
}
|
||||
c := &client{
|
||||
sanitizedURL: sanitizedURL,
|
||||
remoteWriteURL: remoteWriteURL,
|
||||
authHeader: authHeader,
|
||||
authCfg: authCfg,
|
||||
fq: fq,
|
||||
hc: &http.Client{
|
||||
Transport: tr,
|
||||
@@ -149,23 +150,48 @@ func (c *client) MustStop() {
|
||||
logger.Infof("stopped client for -remoteWrite.url=%q", c.sanitizedURL)
|
||||
}
|
||||
|
||||
func getTLSConfig(argIdx int) (*tls.Config, error) {
|
||||
c := &promauth.TLSConfig{
|
||||
func getAuthConfig(argIdx int) (*promauth.Config, error) {
|
||||
username := basicAuthUsername.GetOptionalArg(argIdx)
|
||||
password := basicAuthPassword.GetOptionalArg(argIdx)
|
||||
passwordFile := basicAuthPasswordFile.GetOptionalArg(argIdx)
|
||||
var basicAuthCfg *promauth.BasicAuthConfig
|
||||
if username != "" || password != "" || passwordFile != "" {
|
||||
basicAuthCfg = &promauth.BasicAuthConfig{
|
||||
Username: username,
|
||||
Password: password,
|
||||
PasswordFile: passwordFile,
|
||||
}
|
||||
}
|
||||
|
||||
token := bearerToken.GetOptionalArg(argIdx)
|
||||
tokenFile := bearerTokenFile.GetOptionalArg(argIdx)
|
||||
|
||||
var oauth2Cfg *promauth.OAuth2Config
|
||||
clientSecret := oauth2ClientSecret.GetOptionalArg(argIdx)
|
||||
clientSecretFile := oauth2ClientSecretFile.GetOptionalArg(argIdx)
|
||||
if clientSecretFile != "" || clientSecret != "" {
|
||||
oauth2Cfg = &promauth.OAuth2Config{
|
||||
ClientID: oauth2ClientID.GetOptionalArg(argIdx),
|
||||
ClientSecret: clientSecret,
|
||||
ClientSecretFile: clientSecretFile,
|
||||
TokenURL: oauth2TokenURL.GetOptionalArg(argIdx),
|
||||
Scopes: strings.Split(oauth2Scopes.GetOptionalArg(argIdx), ";"),
|
||||
}
|
||||
}
|
||||
|
||||
tlsCfg := &promauth.TLSConfig{
|
||||
CAFile: tlsCAFile.GetOptionalArg(argIdx),
|
||||
CertFile: tlsCertFile.GetOptionalArg(argIdx),
|
||||
KeyFile: tlsKeyFile.GetOptionalArg(argIdx),
|
||||
ServerName: tlsServerName.GetOptionalArg(argIdx),
|
||||
InsecureSkipVerify: tlsInsecureSkipVerify.GetOptionalArg(argIdx),
|
||||
}
|
||||
if c.CAFile == "" && c.CertFile == "" && c.KeyFile == "" && c.ServerName == "" && !c.InsecureSkipVerify {
|
||||
return nil, nil
|
||||
}
|
||||
cfg, err := promauth.NewConfig(".", nil, "", "", c)
|
||||
|
||||
authCfg, err := promauth.NewConfig(".", nil, basicAuthCfg, token, tokenFile, oauth2Cfg, tlsCfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot populate TLS config: %w", err)
|
||||
return nil, fmt.Errorf("cannot populate OAuth2 config for remoteWrite idx: %d, err: %w", argIdx, err)
|
||||
}
|
||||
tlsCfg := cfg.NewTLSConfig()
|
||||
return tlsCfg, nil
|
||||
return authCfg, nil
|
||||
}
|
||||
|
||||
func (c *client) runWorker() {
|
||||
@@ -226,8 +252,8 @@ again:
|
||||
h.Set("Content-Type", "application/x-protobuf")
|
||||
h.Set("Content-Encoding", "snappy")
|
||||
h.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
if c.authHeader != "" {
|
||||
req.Header.Set("Authorization", c.authHeader)
|
||||
if ah := c.authCfg.GetAuthHeader(); ah != "" {
|
||||
req.Header.Set("Authorization", ah)
|
||||
}
|
||||
|
||||
startTime := time.Now()
|
||||
@@ -239,7 +265,7 @@ again:
|
||||
if retryDuration > time.Minute {
|
||||
retryDuration = time.Minute
|
||||
}
|
||||
logger.Errorf("couldn't send a block with size %d bytes to %q: %s; re-sending the block in %.3f seconds",
|
||||
logger.Warnf("couldn't send a block with size %d bytes to %q: %s; re-sending the block in %.3f seconds",
|
||||
len(block), c.sanitizedURL, err, retryDuration.Seconds())
|
||||
t := timerpool.Get(retryDuration)
|
||||
select {
|
||||
@@ -259,13 +285,11 @@ again:
|
||||
return true
|
||||
}
|
||||
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.sanitizedURL, statusCode)).Inc()
|
||||
if statusCode == 409 {
|
||||
// Just drop block on 409 status code like Prometheus does.
|
||||
if statusCode == 409 || statusCode == 400 {
|
||||
// Just drop block on 409 and 400 status codes like Prometheus does.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/873
|
||||
body, _ := ioutil.ReadAll(resp.Body)
|
||||
// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149
|
||||
_ = resp.Body.Close()
|
||||
logger.Errorf("unexpected status code received when sending a block with size %d bytes to %q: #%d; dropping the block like Prometheus does; "+
|
||||
"response body=%q", len(block), c.sanitizedURL, statusCode, body)
|
||||
c.packetsDropped.Inc()
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -27,6 +27,9 @@ var (
|
||||
// the maximum number of rows to send per each block.
|
||||
const maxRowsPerBlock = 10000
|
||||
|
||||
// the maximum number of labels to send per each block.
|
||||
const maxLabelsPerBlock = 10 * maxRowsPerBlock
|
||||
|
||||
type pendingSeries struct {
|
||||
mu sync.Mutex
|
||||
wr writeRequest
|
||||
@@ -153,7 +156,7 @@ func (wr *writeRequest) push(src []prompbmarshal.TimeSeries) {
|
||||
for i := range src {
|
||||
tssDst = append(tssDst, prompbmarshal.TimeSeries{})
|
||||
wr.copyTimeSeries(&tssDst[len(tssDst)-1], &src[i])
|
||||
if len(wr.samples) >= maxRowsPerBlock {
|
||||
if len(wr.samples) >= maxRowsPerBlock || len(wr.labels) >= maxLabelsPerBlock {
|
||||
wr.tss = tssDst
|
||||
wr.flush()
|
||||
tssDst = wr.tss
|
||||
|
||||
@@ -14,10 +14,15 @@ import (
|
||||
|
||||
var (
|
||||
unparsedLabelsGlobal = flagutil.NewArray("remoteWrite.label", "Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. "+
|
||||
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
|
||||
"Pass multiple -remoteWrite.label flags in order to add multiple labels to metrics before sending them to remote storage")
|
||||
relabelConfigPathGlobal = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
|
||||
"before sending them to -remoteWrite.url. See https://victoriametrics.github.io/vmagent.html#relabeling for details")
|
||||
"before sending them to -remoteWrite.url. See https://docs.victoriametrics.com/vmagent.html#relabeling for details")
|
||||
relabelDebugGlobal = flag.Bool("remoteWrite.relabelDebug", false, "Whether to log metrics before and after relabeling with -remoteWrite.relabelConfig. "+
|
||||
"If the -remoteWrite.relabelDebug is enabled, then the metrics aren't sent to remote storage. This is useful for debugging the relabeling configs")
|
||||
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
|
||||
relabelDebug = flagutil.NewArrayBool("remoteWrite.urlRelabelDebug", "Whether to log metrics before and after relabeling with -remoteWrite.urlRelabelConfig. "+
|
||||
"If the -remoteWrite.urlRelabelDebug is enabled, then the metrics aren't sent to the corresponding -remoteWrite.url. "+
|
||||
"This is useful for debugging the relabeling configs")
|
||||
)
|
||||
|
||||
var labelsGlobal []prompbmarshal.Label
|
||||
@@ -31,7 +36,7 @@ func CheckRelabelConfigs() error {
|
||||
func loadRelabelConfigs() (*relabelConfigs, error) {
|
||||
var rcs relabelConfigs
|
||||
if *relabelConfigPathGlobal != "" {
|
||||
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal, *relabelDebugGlobal)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
@@ -47,7 +52,7 @@ func loadRelabelConfigs() (*relabelConfigs, error) {
|
||||
// Skip empty relabel config.
|
||||
continue
|
||||
}
|
||||
prc, err := promrelabel.LoadRelabelConfigs(path)
|
||||
prc, err := promrelabel.LoadRelabelConfigs(path, relabelDebug.GetOptionalArg(i))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
|
||||
}
|
||||
|
||||
@@ -3,9 +3,13 @@ package remotewrite
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
@@ -13,6 +17,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
xxhash "github.com/cespare/xxhash/v2"
|
||||
)
|
||||
@@ -21,9 +26,10 @@ var (
|
||||
remoteWriteURLs = flagutil.NewArray("remoteWrite.url", "Remote storage URL to write data to. It must support Prometheus remote_write API. "+
|
||||
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
|
||||
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
|
||||
queues = flag.Int("remoteWrite.queues", 4, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+
|
||||
"isn't enough for sending high volume of collected data to remote storage")
|
||||
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored. "+
|
||||
"See also -remoteWrite.maxDiskUsagePerURL")
|
||||
queues = flag.Int("remoteWrite.queues", cgroup.AvailableCPUs()*2, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+
|
||||
"isn't enough for sending high volume of collected data to remote storage. Default value if 2 * numberOfAvailableCPUs")
|
||||
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
|
||||
"It is hidden by default, since it can contain sensitive info such as auth key")
|
||||
maxPendingBytesPerURL = flagutil.NewBytes("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
|
||||
@@ -37,6 +43,14 @@ var (
|
||||
"Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. "+
|
||||
"By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. "+
|
||||
"This option may be used for improving data compression for the stored metrics")
|
||||
sortLabels = flag.Bool("sortLabels", false, `Whether to sort labels for incoming samples before writing them to all the configured remote storage systems. `+
|
||||
`This may be needed for reducing memory usage at remote storage when the order of labels in incoming samples is random. `+
|
||||
`For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}`+
|
||||
`Enabled sorting for labels can slow down ingestion performance a bit`)
|
||||
maxHourlySeries = flag.Int("remoteWrite.maxHourlySeries", 0, "The maximum number of unique series vmagent can send to remote storage systems during the last hour. "+
|
||||
"Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -remoteWrite.maxDailySeries")
|
||||
maxDailySeries = flag.Int("remoteWrite.maxDailySeries", 0, "The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. "+
|
||||
"Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -remoteWrite.maxHourlySeries")
|
||||
)
|
||||
|
||||
var rwctxs []*remoteWriteCtx
|
||||
@@ -46,7 +60,7 @@ var allRelabelConfigs atomic.Value
|
||||
|
||||
// maxQueues limits the maximum value for `-remoteWrite.queues`. There is no sense in setting too high value,
|
||||
// since it may lead to high memory usage due to big number of buffers.
|
||||
var maxQueues = cgroup.AvailableCPUs() * 4
|
||||
var maxQueues = cgroup.AvailableCPUs() * 16
|
||||
|
||||
// InitSecretFlags must be called after flag.Parse and before any logging.
|
||||
func InitSecretFlags() {
|
||||
@@ -65,6 +79,24 @@ func Init() {
|
||||
if len(*remoteWriteURLs) == 0 {
|
||||
logger.Fatalf("at least one `-remoteWrite.url` command-line flag must be set")
|
||||
}
|
||||
if *maxHourlySeries > 0 {
|
||||
hourlySeriesLimiter = bloomfilter.NewLimiter(*maxHourlySeries, time.Hour)
|
||||
_ = metrics.NewGauge(`vmagent_hourly_series_limit_max_series`, func() float64 {
|
||||
return float64(hourlySeriesLimiter.MaxItems())
|
||||
})
|
||||
_ = metrics.NewGauge(`vmagent_hourly_series_limit_current_series`, func() float64 {
|
||||
return float64(hourlySeriesLimiter.CurrentItems())
|
||||
})
|
||||
}
|
||||
if *maxDailySeries > 0 {
|
||||
dailySeriesLimiter = bloomfilter.NewLimiter(*maxDailySeries, 24*time.Hour)
|
||||
_ = metrics.NewGauge(`vmagent_daily_series_limit_max_series`, func() float64 {
|
||||
return float64(dailySeriesLimiter.MaxItems())
|
||||
})
|
||||
_ = metrics.NewGauge(`vmagent_daily_series_limit_current_series`, func() float64 {
|
||||
return float64(dailySeriesLimiter.CurrentItems())
|
||||
})
|
||||
}
|
||||
if *queues > maxQueues {
|
||||
*queues = maxQueues
|
||||
}
|
||||
@@ -72,6 +104,12 @@ func Init() {
|
||||
*queues = 1
|
||||
}
|
||||
initLabelsGlobal()
|
||||
|
||||
// Register SIGHUP handler for config reload before loadRelabelConfigs.
|
||||
// This guarantees that the config will be re-read if the signal arrives just after loadRelabelConfig.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
|
||||
rcs, err := loadRelabelConfigs()
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot load relabel configs: %s", err)
|
||||
@@ -79,11 +117,11 @@ func Init() {
|
||||
allRelabelConfigs.Store(rcs)
|
||||
|
||||
maxInmemoryBlocks := memory.Allowed() / len(*remoteWriteURLs) / maxRowsPerBlock / 100
|
||||
if maxInmemoryBlocks > 200 {
|
||||
if maxInmemoryBlocks > 400 {
|
||||
// There is no much sense in keeping higher number of blocks in memory,
|
||||
// since this means that the producer outperforms consumer and the queue
|
||||
// will continue growing. It is better storing the queue to file.
|
||||
maxInmemoryBlocks = 200
|
||||
maxInmemoryBlocks = 400
|
||||
}
|
||||
if maxInmemoryBlocks < 2 {
|
||||
maxInmemoryBlocks = 2
|
||||
@@ -98,7 +136,6 @@ func Init() {
|
||||
}
|
||||
|
||||
// Start config reloader.
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
configReloaderWG.Add(1)
|
||||
go func() {
|
||||
defer configReloaderWG.Done()
|
||||
@@ -150,11 +187,13 @@ func Push(wr *prompbmarshal.WriteRequest) {
|
||||
for len(tss) > 0 {
|
||||
// Process big tss in smaller blocks in order to reduce the maximum memory usage
|
||||
samplesCount := 0
|
||||
labelsCount := 0
|
||||
i := 0
|
||||
for i < len(tss) {
|
||||
samplesCount += len(tss[i].Samples)
|
||||
labelsCount += len(tss[i].Labels)
|
||||
i++
|
||||
if samplesCount > maxRowsPerBlock {
|
||||
if samplesCount >= maxRowsPerBlock || labelsCount >= maxLabelsPerBlock {
|
||||
break
|
||||
}
|
||||
}
|
||||
@@ -170,8 +209,12 @@ func Push(wr *prompbmarshal.WriteRequest) {
|
||||
tssBlock = rctx.applyRelabeling(tssBlock, labelsGlobal, pcsGlobal)
|
||||
globalRelabelMetricsDropped.Add(tssBlockLen - len(tssBlock))
|
||||
}
|
||||
for _, rwctx := range rwctxs {
|
||||
rwctx.Push(tssBlock)
|
||||
sortLabelsIfNeeded(tssBlock)
|
||||
tssBlock = limitSeriesCardinality(tssBlock)
|
||||
if len(tssBlock) > 0 {
|
||||
for _, rwctx := range rwctxs {
|
||||
rwctx.Push(tssBlock)
|
||||
}
|
||||
}
|
||||
if rctx != nil {
|
||||
rctx.reset()
|
||||
@@ -182,6 +225,87 @@ func Push(wr *prompbmarshal.WriteRequest) {
|
||||
}
|
||||
}
|
||||
|
||||
// sortLabelsIfNeeded sorts labels if -sortLabels command-line flag is set.
|
||||
func sortLabelsIfNeeded(tss []prompbmarshal.TimeSeries) {
|
||||
if !*sortLabels {
|
||||
return
|
||||
}
|
||||
for i := range tss {
|
||||
promrelabel.SortLabels(tss[i].Labels)
|
||||
}
|
||||
}
|
||||
|
||||
func limitSeriesCardinality(tss []prompbmarshal.TimeSeries) []prompbmarshal.TimeSeries {
|
||||
if hourlySeriesLimiter == nil && dailySeriesLimiter == nil {
|
||||
return tss
|
||||
}
|
||||
dst := make([]prompbmarshal.TimeSeries, 0, len(tss))
|
||||
for i := range tss {
|
||||
labels := tss[i].Labels
|
||||
h := getLabelsHash(labels)
|
||||
if hourlySeriesLimiter != nil && !hourlySeriesLimiter.Add(h) {
|
||||
hourlySeriesLimitRowsDropped.Add(len(tss[i].Samples))
|
||||
logSkippedSeries(labels, "-remoteWrite.maxHourlySeries", hourlySeriesLimiter.MaxItems())
|
||||
continue
|
||||
}
|
||||
if dailySeriesLimiter != nil && !dailySeriesLimiter.Add(h) {
|
||||
dailySeriesLimitRowsDropped.Add(len(tss[i].Samples))
|
||||
logSkippedSeries(labels, "-remoteWrite.maxDailySeries", dailySeriesLimiter.MaxItems())
|
||||
continue
|
||||
}
|
||||
dst = append(dst, tss[i])
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
var (
|
||||
hourlySeriesLimiter *bloomfilter.Limiter
|
||||
dailySeriesLimiter *bloomfilter.Limiter
|
||||
|
||||
hourlySeriesLimitRowsDropped = metrics.NewCounter(`vmagent_hourly_series_limit_rows_dropped_total`)
|
||||
dailySeriesLimitRowsDropped = metrics.NewCounter(`vmagent_daily_series_limit_rows_dropped_total`)
|
||||
)
|
||||
|
||||
func getLabelsHash(labels []prompbmarshal.Label) uint64 {
|
||||
bb := labelsHashBufPool.Get()
|
||||
b := bb.B[:0]
|
||||
for _, label := range labels {
|
||||
b = append(b, label.Name...)
|
||||
b = append(b, label.Value...)
|
||||
}
|
||||
h := xxhash.Sum64(b)
|
||||
bb.B = b
|
||||
labelsHashBufPool.Put(bb)
|
||||
return h
|
||||
}
|
||||
|
||||
var labelsHashBufPool bytesutil.ByteBufferPool
|
||||
|
||||
func logSkippedSeries(labels []prompbmarshal.Label, flagName string, flagValue int) {
|
||||
select {
|
||||
case <-logSkippedSeriesTicker.C:
|
||||
logger.Warnf("skip series %s because %s=%d reached", labelsToString(labels), flagName, flagValue)
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
var logSkippedSeriesTicker = time.NewTicker(5 * time.Second)
|
||||
|
||||
func labelsToString(labels []prompbmarshal.Label) string {
|
||||
var b []byte
|
||||
b = append(b, '{')
|
||||
for i, label := range labels {
|
||||
b = append(b, label.Name...)
|
||||
b = append(b, '=')
|
||||
b = strconv.AppendQuote(b, label.Value)
|
||||
if i+1 < len(labels) {
|
||||
b = append(b, ',')
|
||||
}
|
||||
}
|
||||
b = append(b, '}')
|
||||
return string(b)
|
||||
}
|
||||
|
||||
var globalRelabelMetricsDropped = metrics.NewCounter("vmagent_remotewrite_global_relabel_metrics_dropped_total")
|
||||
|
||||
type remoteWriteCtx struct {
|
||||
@@ -207,7 +331,13 @@ func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int,
|
||||
c := newClient(argIdx, remoteWriteURL, sanitizedURL, fq, *queues)
|
||||
sf := significantFigures.GetOptionalArgOrDefault(argIdx, 0)
|
||||
rd := roundDigits.GetOptionalArgOrDefault(argIdx, 100)
|
||||
pss := make([]*pendingSeries, *queues)
|
||||
pssLen := *queues
|
||||
if n := cgroup.AvailableCPUs(); pssLen > n {
|
||||
// There is no sense in running more than availableCPUs concurrent pendingSeries,
|
||||
// since every pendingSeries can saturate up to a single CPU.
|
||||
pssLen = n
|
||||
}
|
||||
pss := make([]*pendingSeries, pssLen)
|
||||
for i := range pss {
|
||||
pss[i] = newPendingSeries(fq.MustWriteBlock, sf, rd)
|
||||
}
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
package remotewrite
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
@@ -11,13 +9,8 @@ import (
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
func statDial(network, addr string) (conn net.Conn, err error) {
|
||||
if !strings.HasPrefix(network, "tcp") {
|
||||
return nil, fmt.Errorf("unexpected network passed to statDial: %q; it must start from `tcp`", network)
|
||||
}
|
||||
if !netutil.TCP6Enabled() {
|
||||
network = "tcp4"
|
||||
}
|
||||
func statDial(networkUnused, addr string) (conn net.Conn, err error) {
|
||||
network := netutil.GetTCPNetwork()
|
||||
conn, err = net.DialTimeout(network, addr, 5*time.Second)
|
||||
dialsTotal.Inc()
|
||||
if err != nil {
|
||||
|
||||
@@ -66,7 +66,17 @@ run-vmalert: vmalert
|
||||
-remoteRead.url=http://localhost:8428 \
|
||||
-external.label=cluster=east-1 \
|
||||
-external.label=replica=a \
|
||||
-evaluationInterval=3s
|
||||
-evaluationInterval=3s \
|
||||
-rule.configCheckInterval=10s
|
||||
|
||||
replay-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules-replay-good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-remoteWrite.url=http://localhost:8428 \
|
||||
-external.label=cluster=east-1 \
|
||||
-external.label=replica=a \
|
||||
-replay.timeFrom=2021-05-11T07:21:43Z \
|
||||
-replay.timeTo=2021-05-29T18:40:43Z
|
||||
|
||||
vmalert-amd64:
|
||||
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmalert-local-with-goarch
|
||||
@@ -88,3 +98,9 @@ vmalert-local-with-goarch:
|
||||
|
||||
vmalert-pure:
|
||||
APP_NAME=vmalert $(MAKE) app-local-pure
|
||||
|
||||
vmalert-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmalert $(MAKE) app-local-windows-with-goarch
|
||||
|
||||
vmalert-windows-amd64-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
@@ -1,22 +1,24 @@
|
||||
## vmalert
|
||||
# vmalert
|
||||
|
||||
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
`vmalert` executes a list of the given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
|
||||
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
rules against configured address.
|
||||
rules against configured address. It is heavily inspired by [Prometheus](https://prometheus.io/docs/alerting/latest/overview/)
|
||||
implementation and aims to be compatible with its syntax.
|
||||
|
||||
### Features:
|
||||
## Features
|
||||
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
|
||||
* VictoriaMetrics [MetricsQL](https://victoriametrics.github.io/MetricsQL.html)
|
||||
* VictoriaMetrics [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html)
|
||||
support and expressions validation;
|
||||
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
|
||||
support;
|
||||
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
|
||||
* Keeps the alerts [state on restarts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/app/vmalert#alerts-state-on-restarts);
|
||||
* Graphite datasource can be used for alerting and recording rules. See [these docs](#graphite) for details.
|
||||
* Keeps the alerts [state on restarts](#alerts-state-on-restarts);
|
||||
* Graphite datasource can be used for alerting and recording rules. See [these docs](#graphite);
|
||||
* Recording and Alerting rules backfilling (aka `replay`). See [these docs](#rules-backfilling);
|
||||
* Lightweight without extra dependencies.
|
||||
|
||||
### Limitations:
|
||||
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
|
||||
## Limitations
|
||||
* `vmalert` executes queries against a remote datasource, which carries reliability risks because of the network.
|
||||
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
|
||||
may fail;
|
||||
* by default, rules execution is sequential within one group, but persisting of execution results to remote
|
||||
@@ -24,7 +26,7 @@ storage is asynchronous. Hence, user shouldn't rely on recording rules chaining
|
||||
recording rule is reused in next one;
|
||||
* `vmalert` has no UI, just an API for getting groups and rules statuses.
|
||||
|
||||
### QuickStart
|
||||
## QuickStart
|
||||
|
||||
To build `vmalert` from sources:
|
||||
```
|
||||
@@ -37,75 +39,87 @@ The build binary will be placed to `VictoriaMetrics/bin` folder.
|
||||
To start using `vmalert` you will need the following things:
|
||||
* list of rules - PromQL/MetricsQL expressions to execute;
|
||||
* datasource address - reachable VictoriaMetrics instance for rules execution;
|
||||
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
|
||||
aggregating alerts and sending notifications.
|
||||
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
|
||||
compatible storage address for storing recording rules results and alerts state in for of timeseries. This is optional.
|
||||
* remote write address [optional] - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
|
||||
compatible storage address for storing recording rules results and alerts state in the form of time series.
|
||||
|
||||
Then configure `vmalert` accordingly:
|
||||
```
|
||||
./bin/vmalert -rule=alert.rules \
|
||||
./bin/vmalert -rule=alert.rules \ # Path to the file with rules configuration. Supports wildcard
|
||||
-datasource.url=http://localhost:8428 \ # PromQL compatible datasource
|
||||
-notifier.url=http://localhost:9093 \ # AlertManager URL
|
||||
-notifier.url=http://127.0.0.1:9093 \ # AlertManager replica URL
|
||||
-remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist rules
|
||||
-remoteRead.url=http://localhost:8428 \ # PromQL compatible datasource to restore alerts state from
|
||||
-remoteWrite.url=http://localhost:8428 \ # Remote write compatible storage to persist rules
|
||||
-remoteRead.url=http://localhost:8428 \ # MetricsQL compatible datasource to restore alerts state from
|
||||
-external.label=cluster=east-1 \ # External label to be applied for each rule
|
||||
-external.label=replica=a \ # Multiple external labels may be set
|
||||
-evaluationInterval=3s # Default evaluation interval if not specified in rules group
|
||||
-external.label=replica=a # Multiple external labels may be set
|
||||
```
|
||||
|
||||
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
|
||||
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.
|
||||
See the full list of configuration flags in the [configuration](#configuration) section.
|
||||
|
||||
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
|
||||
similar to Prometheus rules and configured using YAML. Configuration examples may be found
|
||||
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
|
||||
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.
|
||||
|
||||
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
|
||||
similar to Prometheus rules and configured using YAML. Configuration examples may be found
|
||||
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
|
||||
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
|
||||
Every `rule` belongs to a `group` and every configuration file may contain arbitrary number of groups:
|
||||
```yaml
|
||||
groups:
|
||||
[ - <rule_group> ]
|
||||
```
|
||||
|
||||
#### Groups
|
||||
### Groups
|
||||
|
||||
Each group has following attributes:
|
||||
Each group has the following attributes:
|
||||
```yaml
|
||||
# The name of the group. Must be unique within a file.
|
||||
name: <string>
|
||||
|
||||
# How often rules in the group are evaluated.
|
||||
[ interval: <duration> | default = global.evaluation_interval ]
|
||||
[ interval: <duration> | default = -evaluationInterval flag ]
|
||||
|
||||
# How many rules execute at once. Increasing concurrency may speed
|
||||
# up round execution speed.
|
||||
# How many rules execute at once within a group. Increasing concurrency may speed
|
||||
# up round execution speed.
|
||||
[ concurrency: <integer> | default = 1 ]
|
||||
|
||||
# Optional type for expressions inside the rules. Supported values: "graphite" and "prometheus".
|
||||
# By default "prometheus" rule type is used.
|
||||
[ type: <string> ]
|
||||
|
||||
# Optional list of label filters applied to every rule's
|
||||
# request within a group. It is compatible only with VM datasource.
|
||||
# See more details at https://docs.victoriametrics.com#prometheus-querying-api-enhancements
|
||||
extra_filter_labels:
|
||||
[ <labelname>: <labelvalue> ... ]
|
||||
|
||||
rules:
|
||||
[ - <rule> ... ]
|
||||
```
|
||||
|
||||
#### Rules
|
||||
### Rules
|
||||
|
||||
Every rule contains `expr` field for [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/)
|
||||
or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expression. Vmalert will execute the configured
|
||||
expression and then act according to the Rule type.
|
||||
|
||||
There are two types of Rules:
|
||||
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
|
||||
Alerting rules allows to define alert conditions via [MetricsQL](https://victoriametrics.github.io/MetricsQL.html)
|
||||
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
|
||||
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
|
||||
Recording rules allow you to precompute frequently needed or computationally expensive expressions
|
||||
and save their result as a new set of time series.
|
||||
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
|
||||
Alerting rules allow defining alert conditions via the `expr` field and sending notifications to
|
||||
[Alertmanager](https://github.com/prometheus/alertmanager) if execution result is not empty.
|
||||
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
|
||||
Recording rules allow defining an `expr` whose result will then be backfilled to the configured
|
||||
`-remoteWrite.url`. Recording rules are used to precompute frequently needed or computationally
|
||||
expensive expressions and save their result as a new set of time series.
|
||||
|
||||
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
|
||||
within one group.
|
||||
within one group.
|
||||
|
||||
##### Alerting rules
|
||||
#### Alerting rules
|
||||
|
||||
The syntax for alerting rule is following:
|
||||
The syntax for alerting rule is the following:
|
||||
```yaml
|
||||
# The name of the alert. Must be a valid metric name.
|
||||
alert: <string>
|
||||
@@ -115,12 +129,14 @@ alert: <string>
|
||||
[ type: <string> ]
|
||||
|
||||
# The expression to evaluate. The expression language depends on the type value.
|
||||
# By default MetricsQL expression is used. If type="graphite", then the expression
|
||||
# By default PromQL/MetricsQL expression is used. If type="graphite", then the expression
|
||||
# must contain valid Graphite expression.
|
||||
expr: <string>
|
||||
|
||||
# Alerts are considered firing once they have been returned for this long.
|
||||
# Alerts which have not yet fired for long enough are considered pending.
|
||||
# If param is omitted or set to 0 then alerts will be immediately considered
|
||||
# as firing once they return.
|
||||
[ for: <duration> | default = 0s ]
|
||||
|
||||
# Labels to add or overwrite for each alert.
|
||||
@@ -130,9 +146,14 @@ labels:
|
||||
# Annotations to add to each alert.
|
||||
annotations:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
```
|
||||
```
|
||||
|
||||
##### Recording rules
|
||||
It is allowed to use [Go templating](https://golang.org/pkg/text/template/) in annotations
|
||||
to format data, iterate over it or execute expressions.
|
||||
Additionally, `vmalert` provides some extra templating functions
|
||||
listed [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/notifier/template_func.go).
|
||||
|
||||
#### Recording rules
|
||||
|
||||
The syntax for recording rules is the following:
|
||||
```yaml
|
||||
@@ -153,37 +174,77 @@ labels:
|
||||
[ <labelname>: <labelvalue> ]
|
||||
```
|
||||
|
||||
For recording rules to work `-remoteWrite.url` must specified.
|
||||
For recording rules to work `-remoteWrite.url` must be specified.
|
||||
|
||||
|
||||
#### Alerts state on restarts
|
||||
### Alerts state on restarts
|
||||
|
||||
`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after reloading of `vmalert`
|
||||
`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after restart of `vmalert`
|
||||
the process alerts state will be lost. To avoid this situation, `vmalert` should be configured via the following flags:
|
||||
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or VMInsert (Cluster). `vmalert` will persist alerts state
|
||||
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
|
||||
These are regular time series and may be queried from VM just as any other time series.
|
||||
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or vminsert (Cluster). `vmalert` will persist alerts state
|
||||
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
|
||||
These are regular time series and may be queried from VM just as any other time series.
|
||||
The state is stored to the configured address on every rule evaluation.
|
||||
* `-remoteRead.url` - URL to VictoriaMetrics (Single) or VMSelect (Cluster). `vmalert` will try to restore alerts state
|
||||
* `-remoteRead.url` - URL to VictoriaMetrics (Single) or vmselect (Cluster). `vmalert` will try to restore alerts state
|
||||
from configured address by querying time series with name `ALERTS_FOR_STATE`.
|
||||
|
||||
Both flags are required for proper state restoration. The restore process may fail if time series are missing
|
||||
in configured `-remoteRead.url`, weren't updated in the last `1h` or received state doesn't match current `vmalert`
|
||||
rules configuration.
|
||||
in configured `-remoteRead.url`, weren't updated in the last `1h` (controlled by `-remoteRead.lookback`)
|
||||
or received state doesn't match current `vmalert` rules configuration.
|
||||
|
||||
|
||||
#### WEB
|
||||
### Multitenancy
|
||||
|
||||
There are the following approaches for alerting and recording rules across
|
||||
[multiple tenants](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy):
|
||||
|
||||
* To run a separate `vmalert` instance per each tenant.
|
||||
The corresponding tenant must be specified in `-datasource.url` command-line flag
|
||||
according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format).
|
||||
For example, `/path/to/vmalert -datasource.url=http://vmselect:8481/select/123/prometheus`
|
||||
would run alerts against `AccountID=123`. For recording rules the `-remoteWrite.url` command-line
|
||||
flag must contain the url for the specific tenant as well.
|
||||
For example, `-remoteWrite.url=http://vminsert:8480/insert/123/prometheus` would write recording
|
||||
rules to `AccountID=123`.
|
||||
|
||||
* To specify `tenant` parameter per each alerting and recording group if
|
||||
[enterprise version of vmalert](https://victoriametrics.com/enterprise.html) is used
|
||||
with `-clusterMode` command-line flag. For example:
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: rules_for_tenant_123
|
||||
tenant: "123"
|
||||
rules:
|
||||
# Rules for accountID=123
|
||||
|
||||
- name: rules_for_tenant_456:789
|
||||
tenant: "456:789"
|
||||
rules:
|
||||
# Rules for accountID=456, projectID=789
|
||||
```
|
||||
|
||||
If `-clusterMode` is enabled, then `-datasource.url`, `-remoteRead.url` and `-remoteWrite.url` must
|
||||
contain only the hostname without tenant id. For example: `-datasource.url=http://vmselect:8481`.
|
||||
`vmalert` automatically adds the specified tenant to urls per each recording rule in this case.
|
||||
|
||||
The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files
|
||||
at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise`
|
||||
tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).
|
||||
|
||||
|
||||
### WEB
|
||||
|
||||
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
|
||||
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
|
||||
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
|
||||
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
|
||||
* `http://<vmalert-addr>/api/v1/<groupID>/<alertID>/status` - get alert status by ID.
|
||||
Used as alert source in AlertManager.
|
||||
* `http://<vmalert-addr>/metrics` - application metrics.
|
||||
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
|
||||
|
||||
|
||||
### Graphite
|
||||
## Graphite
|
||||
|
||||
vmalert sends requests to `<-datasource.url>/render?format=json` during evaluation of alerting and recording rules
|
||||
if the corresponding group or rule contains `type: "graphite"` config option. It is expected that the `<-datasource.url>/render`
|
||||
@@ -191,229 +252,335 @@ implements [Graphite Render API](https://graphite.readthedocs.io/en/stable/rende
|
||||
When using vmalert with both `graphite` and `prometheus` rules configured against cluster version of VM do not forget
|
||||
to set `-datasource.appendTypePrefix` flag to `true`, so vmalert can adjust URL prefix automatically based on query type.
|
||||
|
||||
## Rules backfilling
|
||||
|
||||
### Configuration
|
||||
vmalert supports alerting and recording rules backfilling (aka `replay`). In replay mode vmalert
|
||||
can read the same rules configuration as in normal mode, evaluate them on the given time range and backfill
|
||||
results via remote write to the configured storage. vmalert supports any PromQL/MetricsQL compatible
|
||||
data source for backfilling.
|
||||
|
||||
### How it works
|
||||
|
||||
In `replay` mode vmalert works as a cli-tool and exits immediately after work is done.
|
||||
To run vmalert in `replay` mode:
|
||||
```
|
||||
./bin/vmalert -rule=path/to/your.rules \ # path to files with rules you usually use with vmalert
|
||||
-datasource.url=http://localhost:8428 \ # PromQL/MetricsQL compatible datasource
|
||||
-remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist results
|
||||
-replay.timeFrom=2021-05-11T07:21:43Z \ # time from begin replay
|
||||
-replay.timeTo=2021-05-29T18:40:43Z # time to finish replay
|
||||
```
|
||||
|
||||
The output of the command will look like the following:
|
||||
```
|
||||
Replay mode:
|
||||
from: 2021-05-11 07:21:43 +0000 UTC # set by -replay.timeFrom
|
||||
to: 2021-05-29 18:40:43 +0000 UTC # set by -replay.timeTo
|
||||
max data points per request: 1000 # set by -replay.maxDatapointsPerQuery
|
||||
|
||||
Group "ReplayGroup"
|
||||
interval: 1m0s
|
||||
requests to make: 27
|
||||
max range per request: 16h40m0s
|
||||
> Rule "type:vm_cache_entries:rate5m" (ID: 1792509946081842725)
|
||||
27 / 27 [----------------------------------------------------------------------------------------------------] 100.00% 78 p/s
|
||||
> Rule "go_cgo_calls_count:rate5m" (ID: 17958425467471411582)
|
||||
27 / 27 [-----------------------------------------------------------------------------------------------------] 100.00% ? p/s
|
||||
|
||||
Group "vmsingleReplay"
|
||||
interval: 30s
|
||||
requests to make: 54
|
||||
max range per request: 8h20m0s
|
||||
> Rule "RequestErrorsToAPI" (ID: 17645863024999990222)
|
||||
54 / 54 [-----------------------------------------------------------------------------------------------------] 100.00% ? p/s
|
||||
> Rule "TooManyLogs" (ID: 9042195394653477652)
|
||||
54 / 54 [-----------------------------------------------------------------------------------------------------] 100.00% ? p/s
|
||||
2021-06-07T09:59:12.098Z info app/vmalert/replay.go:68 replay finished! Imported 511734 samples
|
||||
```
|
||||
|
||||
In `replay` mode all groups are executed sequentially one-by-one. Rules within the group are
|
||||
executed sequentially as well (`concurrency` setting is ignored). Vmalert sends rule's expression
|
||||
to [/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) endpoint
|
||||
of the configured `-datasource.url`. Returned data is then processed according to the rule type and
|
||||
backfilled to `-remoteWrite.url` via [Remote Write protocol](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations).
|
||||
Vmalert respects `evaluationInterval` value set by flag or per-group during the replay.
|
||||
|
||||
#### Recording rules
|
||||
|
||||
The result of recording rules `replay` should match the results of normal rules evaluation.
|
||||
|
||||
#### Alerting rules
|
||||
|
||||
The result of alerting rules `replay` is time series reflecting [alert's state](#alerts-state-on-restarts).
|
||||
To see if `replayed` alert has fired in the past use the following PromQL/MetricsQL expression:
|
||||
```
|
||||
ALERTS{alertname="your_alertname", alertstate="firing"}
|
||||
```
|
||||
Execute the query against storage which was used for `-remoteWrite.url` during the `replay`.
|
||||
|
||||
### Additional configuration
|
||||
|
||||
There are the following optional `replay` flags:
|
||||
|
||||
* `-replay.maxDatapointsPerQuery` - the max number of data points expected to receive in one request.
|
||||
In short, it affects the max time range for every `/query_range` request. The higher the value,
|
||||
the fewer requests will be issued during `replay`.
|
||||
* `-replay.ruleRetryAttempts` - when datasource fails to respond vmalert will make this number of retries
|
||||
per rule before giving up.
|
||||
* `-replay.rulesDelay` - delay between sequential rules execution. Important in cases where there are chained
|
||||
(rules which depend on each other) rules. It is expected that remote storage will be able to persist
|
||||
previously accepted data during the delay, so data will be available for the subsequent queries.
|
||||
Keep it equal to or bigger than `-remoteWrite.flushInterval`.
|
||||
|
||||
See full description for these flags in `./vmalert --help`.
|
||||
|
||||
### Limitations
|
||||
|
||||
* Graphite engine isn't supported yet;
|
||||
* `query` template function is disabled for performance reasons (might be changed in future);
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
command-line flags with their descriptions.
|
||||
|
||||
The shortlist of configuration flags is the following:
|
||||
```
|
||||
-datasource.appendTypePrefix
|
||||
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to VMSelect URL.
|
||||
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.
|
||||
-datasource.basicAuth.password string
|
||||
Optional basic auth password for -datasource.url
|
||||
Optional basic auth password for -datasource.url
|
||||
-datasource.basicAuth.username string
|
||||
Optional basic auth username for -datasource.url
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.lookback duration
|
||||
Lookback defines how far to look into past when evaluating queries. For example, if datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
|
||||
Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
|
||||
-datasource.maxIdleConnections int
|
||||
Defines the number of idle (keep-alive connections) to configured datasource.Consider to set this value equal to the value: groups_total * group.concurrency. Too low value may result into high number of sockets in TIME_WAIT state. (default 100)
|
||||
Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
|
||||
-datasource.queryStep duration
|
||||
queryStep defines how far a value can fallback to when evaluating queries. For example, if datasource.queryStep=15s then param "step" with value "15s" will be added to every query.
|
||||
queryStep defines how far a value can fallback to when evaluating queries. For example, if datasource.queryStep=15s then param "step" with value "15s" will be added to every query. If queryStep isn't specified, rule's evaluationInterval will be used instead.
|
||||
-datasource.roundDigits int
|
||||
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
|
||||
-datasource.tlsCAFile string
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
|
||||
-datasource.tlsCertFile string
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
|
||||
-datasource.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
-datasource.tlsKeyFile string
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
|
||||
-datasource.tlsServerName string
|
||||
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used
|
||||
Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used
|
||||
-datasource.url string
|
||||
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-dryRun -rule
|
||||
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
|
||||
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-evaluationInterval duration
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
How often to evaluate the rules (default 1m0s)
|
||||
-external.alert.source string
|
||||
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used
|
||||
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used
|
||||
-external.label array
|
||||
Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-external.url string
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
External URL is used as alert's source for sent alerts to the notifier
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this dealy the servier returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
Address to listen for http connections (default ":8880")
|
||||
Address to listen for http connections (default ":8880")
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
|
||||
-memory.allowedBytes value
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-notifier.basicAuth.password array
|
||||
Optional basic auth password for -notifier.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Optional basic auth password for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.basicAuth.username array
|
||||
Optional basic auth username for -notifier.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Optional basic auth username for -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsCAFile array
|
||||
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsCertFile array
|
||||
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsInsecureSkipVerify array
|
||||
Whether to skip tls verification when connecting to -notifier.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Whether to skip tls verification when connecting to -notifier.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsKeyFile array
|
||||
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.tlsServerName array
|
||||
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-notifier.url array
|
||||
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It overrides httpAuth settings
|
||||
Auth key for /debug/pprof. It overrides httpAuth settings
|
||||
-remoteRead.basicAuth.password string
|
||||
Optional basic auth password for -remoteRead.url
|
||||
Optional basic auth password for -remoteRead.url
|
||||
-remoteRead.basicAuth.username string
|
||||
Optional basic auth username for -remoteRead.url
|
||||
Optional basic auth username for -remoteRead.url
|
||||
-remoteRead.ignoreRestoreErrors
|
||||
Whether to ignore errors from remote storage when restoring alerts state on startup. (default true)
|
||||
-remoteRead.lookback duration
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
|
||||
-remoteRead.tlsCAFile string
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
|
||||
-remoteRead.tlsCertFile string
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
|
||||
-remoteRead.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteRead.url
|
||||
Whether to skip tls verification when connecting to -remoteRead.url
|
||||
-remoteRead.tlsKeyFile string
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
|
||||
-remoteRead.tlsServerName string
|
||||
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
|
||||
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
|
||||
-remoteRead.url vmalert
|
||||
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has successfully persisted its state. E.g. http://127.0.0.1:8428
|
||||
-remoteWrite.basicAuth.password string
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
Optional basic auth password for -remoteWrite.url
|
||||
-remoteWrite.basicAuth.username string
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
Optional basic auth username for -remoteWrite.url
|
||||
-remoteWrite.concurrency int
|
||||
Defines number of writers for concurrent writing into remote querier (default 1)
|
||||
Defines number of writers for concurrent writing into remote querier (default 1)
|
||||
-remoteWrite.flushInterval duration
|
||||
Defines interval of flushes to remote write endpoint (default 5s)
|
||||
Defines interval of flushes to remote write endpoint (default 5s)
|
||||
-remoteWrite.maxBatchSize int
|
||||
Defines defines max number of timeseries to be flushed at once (default 1000)
|
||||
Defines the max number of timeseries to be flushed at once (default 1000)
|
||||
-remoteWrite.maxQueueSize int
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
Defines the max number of pending datapoints to remote write endpoint (default 100000)
|
||||
-remoteWrite.tlsCAFile string
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
|
||||
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
|
||||
-remoteWrite.tlsCertFile string
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
|
||||
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
Whether to skip tls verification when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsKeyFile string
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
|
||||
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
|
||||
-remoteWrite.tlsServerName string
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
|
||||
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
|
||||
-remoteWrite.url string
|
||||
Optional URL to Victoria Metrics or VMInsert where to persist alerts state and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428
|
||||
Optional URL to VictoriaMetrics or vminsert where to persist alerts state and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428
|
||||
-replay.maxDatapointsPerQuery int
|
||||
Max number of data points expected in one request. The higher the value, the less requests will be made during replay. (default 1000)
|
||||
-replay.ruleRetryAttempts int
|
||||
Defines how many retries to make before giving up on rule if request for it returns an error. (default 5)
|
||||
-replay.rulesDelay duration
|
||||
Delay between rules evaluation within the group. Could be important if there are chained rules inside of the group and processing needs to wait for previous rule results to be persisted by remote storage before evaluating the next rule. Keep it equal or bigger than -remoteWrite.flushInterval. (default 1s)
|
||||
-replay.timeFrom string
|
||||
The time filter in RFC3339 format to select time series with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'
|
||||
-replay.timeTo string
|
||||
The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'
|
||||
-rule array
|
||||
Path to the file with alert rules.
|
||||
Supports patterns. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule="/path/to/file". Path to a single file with alerting rules
|
||||
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
|
||||
Supports array of values separated by comma or specified via multiple flags.
|
||||
Path to the file with alert rules.
|
||||
Supports patterns. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule="/path/to/file". Path to a single file with alerting rules
|
||||
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-rule.configCheckInterval duration
|
||||
Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
|
||||
-rule.validateExpressions
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
Whether to validate rules expressions via MetricsQL engine (default true)
|
||||
-rule.validateTemplates
|
||||
Whether to validate annotation and label templates (default true)
|
||||
Whether to validate annotation and label templates (default true)
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
Show VictoriaMetrics version
|
||||
```
|
||||
|
||||
Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
command-line flags with their descriptions.
|
||||
`vmalert` supports "hot" config reload via the following methods:
|
||||
* send SIGHUP signal to `vmalert` process;
|
||||
* send GET request to `/-/reload` endpoint;
|
||||
* configure `-rule.configCheckInterval` flag for periodic reload
|
||||
on config change.
|
||||
|
||||
To reload configuration without `vmalert` restart send SIGHUP signal
|
||||
or send GET request to `/-/reload` endpoint.
|
||||
|
||||
### Contributing
|
||||
## Contributing
|
||||
|
||||
`vmalert` is mostly designed and built by the VictoriaMetrics community.
|
||||
Feel free to share your experience and ideas for improving this
|
||||
Feel free to share your experience and ideas for improving this
|
||||
software. Please keep simplicity as the main priority.
|
||||
|
||||
### How to build from sources
|
||||
## How to build from sources
|
||||
|
||||
It is recommended using
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
It is recommended to use
|
||||
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
|
||||
- `vmalert` is located in `vmutils-*` archives there.
|
||||
|
||||
|
||||
#### Development build
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmalert` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make vmalert` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmalert` binary and puts it into the `bin` folder.
|
||||
|
||||
#### Production build
|
||||
### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmalert-prod` from the root folder of the repository.
|
||||
2. Run `make vmalert-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmalert-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
|
||||
#### ARM build
|
||||
### ARM build
|
||||
|
||||
ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://blog.cloudflare.com/arm-takes-wing/).
|
||||
|
||||
#### Development ARM build
|
||||
### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmalert-arm` or `make vmalert-arm64` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make vmalert-arm` or `make vmalert-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmalert-arm` or `vmalert-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
#### Production ARM build
|
||||
### Production ARM build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmalert-arm-prod` or `make vmalert-arm64-prod` from the root folder of the repository.
|
||||
2. Run `make vmalert-arm-prod` or `make vmalert-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmalert-arm-prod` or `vmalert-arm64-prod` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
@@ -19,15 +19,18 @@ import (
|
||||
|
||||
// AlertingRule is basic alert entity
|
||||
type AlertingRule struct {
|
||||
Type datasource.Type
|
||||
RuleID uint64
|
||||
Name string
|
||||
Expr string
|
||||
For time.Duration
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
GroupID uint64
|
||||
GroupName string
|
||||
Type datasource.Type
|
||||
RuleID uint64
|
||||
Name string
|
||||
Expr string
|
||||
For time.Duration
|
||||
Labels map[string]string
|
||||
Annotations map[string]string
|
||||
GroupID uint64
|
||||
GroupName string
|
||||
EvalInterval time.Duration
|
||||
|
||||
q datasource.Querier
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
@@ -49,19 +52,25 @@ type alertingRuleMetrics struct {
|
||||
active *gauge
|
||||
}
|
||||
|
||||
func newAlertingRule(group *Group, cfg config.Rule) *AlertingRule {
|
||||
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
||||
ar := &AlertingRule{
|
||||
Type: cfg.Type,
|
||||
RuleID: cfg.ID,
|
||||
Name: cfg.Alert,
|
||||
Expr: cfg.Expr,
|
||||
For: cfg.For.Duration(),
|
||||
Labels: cfg.Labels,
|
||||
Annotations: cfg.Annotations,
|
||||
GroupID: group.ID(),
|
||||
GroupName: group.Name,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
metrics: &alertingRuleMetrics{},
|
||||
Type: cfg.Type,
|
||||
RuleID: cfg.ID,
|
||||
Name: cfg.Alert,
|
||||
Expr: cfg.Expr,
|
||||
For: cfg.For.Duration(),
|
||||
Labels: cfg.Labels,
|
||||
Annotations: cfg.Annotations,
|
||||
GroupID: group.ID(),
|
||||
GroupName: group.Name,
|
||||
EvalInterval: group.Interval,
|
||||
q: qb.BuildWithParams(datasource.QuerierParams{
|
||||
DataSourceType: &cfg.Type,
|
||||
EvaluationInterval: group.Interval,
|
||||
ExtraLabels: group.ExtraFilterLabels,
|
||||
}),
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
metrics: &alertingRuleMetrics{},
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
||||
@@ -119,10 +128,67 @@ func (ar *AlertingRule) ID() uint64 {
|
||||
return ar.RuleID
|
||||
}
|
||||
|
||||
// ExecRange executes alerting rule on the given time range similarly to Exec.
|
||||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
// It returns ALERT and ALERT_FOR_STATE time series as result.
|
||||
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
series, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var result []prompbmarshal.TimeSeries
|
||||
qFn := func(query string) ([]datasource.Metric, error) {
|
||||
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
|
||||
}
|
||||
for _, s := range series {
|
||||
// extra labels could contain templates, so we expand them first
|
||||
labels, err := expandLabels(s, qFn, ar)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
}
|
||||
for k, v := range labels {
|
||||
// apply extra labels to datasource
|
||||
// so the hash key will be consistent on restore
|
||||
s.SetLabel(k, v)
|
||||
}
|
||||
|
||||
a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create alert: %s", err)
|
||||
}
|
||||
if ar.For == 0 { // if alert is instant
|
||||
a.State = notifier.StateFiring
|
||||
for i := range s.Values {
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// if alert with For > 0
|
||||
prevT := time.Time{}
|
||||
//activeAt := time.Time{}
|
||||
for i := range s.Values {
|
||||
at := time.Unix(s.Timestamps[i], 0)
|
||||
if at.Sub(prevT) > ar.EvalInterval {
|
||||
// reset to Pending if there are gaps > EvalInterval between DPs
|
||||
a.State = notifier.StatePending
|
||||
//activeAt = at
|
||||
a.Start = at
|
||||
} else if at.Sub(a.Start) >= ar.For {
|
||||
a.State = notifier.StateFiring
|
||||
}
|
||||
prevT = at
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// Exec executes AlertingRule expression via the given Querier.
|
||||
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
||||
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := q.Query(ctx, ar.Expr, ar.Type)
|
||||
func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := ar.q.Query(ctx, ar.Expr)
|
||||
ar.mu.Lock()
|
||||
defer ar.mu.Unlock()
|
||||
|
||||
@@ -139,7 +205,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
|
||||
}
|
||||
}
|
||||
|
||||
qFn := func(query string) ([]datasource.Metric, error) { return q.Query(ctx, query, ar.Type) }
|
||||
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query) }
|
||||
updated := make(map[uint64]struct{})
|
||||
// update list of active alerts
|
||||
for _, m := range qMetrics {
|
||||
@@ -161,9 +227,9 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
|
||||
}
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
if a.Value != m.Value {
|
||||
if a.Value != m.Values[0] {
|
||||
// update Value field with latest value
|
||||
a.Value = m.Value
|
||||
a.Value = m.Values[0]
|
||||
// and re-exec template since Value can be used
|
||||
// in annotations
|
||||
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
|
||||
@@ -201,10 +267,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
|
||||
alertsFired.Inc()
|
||||
}
|
||||
}
|
||||
if series {
|
||||
return ar.toTimeSeries(ar.lastExecTime), nil
|
||||
}
|
||||
return nil, nil
|
||||
return ar.toTimeSeries(ar.lastExecTime.Unix()), nil
|
||||
}
|
||||
|
||||
func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) {
|
||||
@@ -214,13 +277,13 @@ func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (ma
|
||||
}
|
||||
tpl := notifier.AlertTplData{
|
||||
Labels: metricLabels,
|
||||
Value: m.Value,
|
||||
Value: m.Values[0],
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
return notifier.ExecTemplate(q, ar.Labels, tpl)
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
@@ -244,6 +307,8 @@ func (ar *AlertingRule) UpdateWith(r Rule) error {
|
||||
ar.For = nr.For
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
ar.EvalInterval = nr.EvalInterval
|
||||
ar.q = nr.q
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -271,13 +336,15 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notif
|
||||
GroupID: ar.GroupID,
|
||||
Name: ar.Name,
|
||||
Labels: map[string]string{},
|
||||
Value: m.Value,
|
||||
Value: m.Values[0],
|
||||
Start: start,
|
||||
Expr: ar.Expr,
|
||||
}
|
||||
// label defined here to make override possible by
|
||||
// time series labels.
|
||||
a.Labels[alertGroupNameLabel] = ar.GroupName
|
||||
if ar.GroupName != "" {
|
||||
a.Labels[alertGroupNameLabel] = ar.GroupName
|
||||
}
|
||||
for _, l := range m.Labels {
|
||||
// drop __name__ to be consistent with Prometheus alerting
|
||||
if l.Name == "__name__" {
|
||||
@@ -366,7 +433,7 @@ const (
|
||||
)
|
||||
|
||||
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
|
||||
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
tss = append(tss, alertToTimeSeries(ar.Name, a, timestamp))
|
||||
if ar.For > 0 {
|
||||
@@ -375,7 +442,7 @@ func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time
|
||||
return tss
|
||||
}
|
||||
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
func alertToTimeSeries(name string, a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
@@ -383,19 +450,19 @@ func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prom
|
||||
labels["__name__"] = alertMetricName
|
||||
labels[alertNameLabel] = name
|
||||
labels[alertStateLabel] = a.State.String()
|
||||
return newTimeSeries(1, labels, timestamp)
|
||||
return newTimeSeries([]float64{1}, []int64{timestamp}, labels)
|
||||
}
|
||||
|
||||
// alertForToTimeSeries returns a timeseries that represents
|
||||
// state of active alerts, where value is time when alert become active
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for k, v := range a.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
labels["__name__"] = alertForStateMetricName
|
||||
labels[alertNameLabel] = name
|
||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
return newTimeSeries([]float64{float64(a.Start.Unix())}, []int64{timestamp}, labels)
|
||||
}
|
||||
|
||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||
@@ -407,7 +474,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
|
||||
return fmt.Errorf("querier is nil")
|
||||
}
|
||||
|
||||
qFn := func(query string) ([]datasource.Metric, error) { return q.Query(ctx, query, ar.Type) }
|
||||
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query) }
|
||||
|
||||
// account for external labels in filter
|
||||
var labelsFilter string
|
||||
@@ -420,7 +487,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
|
||||
// remote write protocol which is used for state persistence in vmalert.
|
||||
expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
|
||||
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr, ar.Type)
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -437,7 +504,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
|
||||
m.Labels = append(m.Labels, l)
|
||||
}
|
||||
|
||||
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0), qFn)
|
||||
a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create alert: %w", err)
|
||||
}
|
||||
|
||||
@@ -24,11 +24,11 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
newTestAlertingRule("instant", 0),
|
||||
&notifier.Alert{State: notifier.StateFiring},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StateFiring.String(),
|
||||
alertNameLabel: "instant",
|
||||
}, timestamp),
|
||||
}),
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -38,13 +38,13 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
"instance": "bar",
|
||||
}},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StateFiring.String(),
|
||||
alertNameLabel: "instant extra labels",
|
||||
"job": "foo",
|
||||
"instance": "bar",
|
||||
}, timestamp),
|
||||
}),
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -54,48 +54,52 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
||||
"__name__": "bar",
|
||||
}},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StateFiring.String(),
|
||||
alertNameLabel: "instant labels override",
|
||||
}, timestamp),
|
||||
}),
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for", time.Second),
|
||||
&notifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StateFiring.String(),
|
||||
alertNameLabel: "for",
|
||||
}, timestamp),
|
||||
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
|
||||
"__name__": alertForStateMetricName,
|
||||
alertNameLabel: "for",
|
||||
}, timestamp),
|
||||
}),
|
||||
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
|
||||
[]int64{timestamp.UnixNano()},
|
||||
map[string]string{
|
||||
"__name__": alertForStateMetricName,
|
||||
alertNameLabel: "for",
|
||||
}),
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for pending", 10*time.Second),
|
||||
&notifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": alertMetricName,
|
||||
alertStateLabel: notifier.StatePending.String(),
|
||||
alertNameLabel: "for pending",
|
||||
}, timestamp),
|
||||
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
|
||||
"__name__": alertForStateMetricName,
|
||||
alertNameLabel: "for pending",
|
||||
}, timestamp),
|
||||
}),
|
||||
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
|
||||
[]int64{timestamp.UnixNano()},
|
||||
map[string]string{
|
||||
"__name__": alertForStateMetricName,
|
||||
alertNameLabel: "for pending",
|
||||
}),
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
tc.rule.alerts[tc.alert.ID] = tc.alert
|
||||
tss := tc.rule.toTimeSeries(timestamp)
|
||||
tss := tc.rule.toTimeSeries(timestamp.Unix())
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
@@ -118,7 +122,7 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
{
|
||||
newTestAlertingRule("empty labels", 0),
|
||||
[][]datasource.Metric{
|
||||
{datasource.Metric{}},
|
||||
{datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}},
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(datasource.Metric{}): {State: notifier.StateFiring},
|
||||
@@ -294,11 +298,12 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.q = fq
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
for _, step := range tc.steps {
|
||||
fq.reset()
|
||||
fq.add(step...)
|
||||
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
|
||||
if _, err := tc.rule.Exec(context.TODO()); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
// artificial delay between applying steps
|
||||
@@ -320,6 +325,166 @@ func TestAlertingRule_Exec(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlertingRule_ExecRange(t *testing.T) {
|
||||
testCases := []struct {
|
||||
rule *AlertingRule
|
||||
data []datasource.Metric
|
||||
expAlerts []*notifier.Alert
|
||||
}{
|
||||
{
|
||||
newTestAlertingRule("empty", 0),
|
||||
[]datasource.Metric{},
|
||||
nil,
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("empty labels", 0),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1}, Timestamps: []int64{1}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing", 0),
|
||||
[]datasource.Metric{
|
||||
metricWithLabels(t, "name", "foo"),
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{
|
||||
Labels: map[string]string{"name": "foo"},
|
||||
State: notifier.StateFiring,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("single-firing-on-range", 0),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StateFiring},
|
||||
{State: notifier.StateFiring},
|
||||
{State: notifier.StateFiring},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-pending", time.Second),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(3, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(5, 0)},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for-firing", 3*time.Second),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1, 1, 1, 1}, Timestamps: []int64{1, 2, 5, 6, 20}},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(5, 0)},
|
||||
{State: notifier.StateFiring, Start: time.Unix(5, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(20, 0)},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestAlertingRule("multi-series-for=>pending=>pending=>firing", 3*time.Second),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
|
||||
{Values: []float64{1, 1}, Timestamps: []int64{1, 5},
|
||||
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
|
||||
},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0)},
|
||||
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
|
||||
//
|
||||
{State: notifier.StatePending, Start: time.Unix(1, 0),
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
}},
|
||||
{State: notifier.StatePending, Start: time.Unix(5, 0),
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
newTestRuleWithLabels("multi-series-firing", "source", "vm"),
|
||||
[]datasource.Metric{
|
||||
{Values: []float64{1, 1}, Timestamps: []int64{1, 100}},
|
||||
{Values: []float64{1, 1}, Timestamps: []int64{1, 5},
|
||||
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
|
||||
},
|
||||
},
|
||||
[]*notifier.Alert{
|
||||
{State: notifier.StateFiring, Labels: map[string]string{
|
||||
"source": "vm",
|
||||
}},
|
||||
{State: notifier.StateFiring, Labels: map[string]string{
|
||||
"source": "vm",
|
||||
}},
|
||||
//
|
||||
{State: notifier.StateFiring, Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"source": "vm",
|
||||
}},
|
||||
{State: notifier.StateFiring, Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"source": "vm",
|
||||
}},
|
||||
},
|
||||
},
|
||||
}
|
||||
fakeGroup := Group{Name: "TestRule_ExecRange"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.q = fq
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
fq.add(tc.data...)
|
||||
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
var expTS []prompbmarshal.TimeSeries
|
||||
var j int
|
||||
for _, series := range tc.data {
|
||||
for _, timestamp := range series.Timestamps {
|
||||
expTS = append(expTS, tc.rule.alertToTimeSeries(tc.expAlerts[j], timestamp)...)
|
||||
j++
|
||||
}
|
||||
}
|
||||
if len(gotTS) != len(expTS) {
|
||||
t.Fatalf("expected %d time series; got %d", len(expTS), len(gotTS))
|
||||
}
|
||||
for i := range expTS {
|
||||
got, exp := gotTS[i], expTS[i]
|
||||
if !reflect.DeepEqual(got, exp) {
|
||||
t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlertingRule_Restore(t *testing.T) {
|
||||
testCases := []struct {
|
||||
rule *AlertingRule
|
||||
@@ -410,6 +575,7 @@ func TestAlertingRule_Restore(t *testing.T) {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
tc.rule.q = fq
|
||||
fq.add(tc.metrics...)
|
||||
if err := tc.rule.Restore(context.TODO(), fq, time.Hour, nil); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
@@ -437,17 +603,18 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
ar := newTestAlertingRule("test", 0)
|
||||
ar.Labels = map[string]string{"job": "test"}
|
||||
ar.q = fq
|
||||
|
||||
// successful attempt
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
_, err := ar.Exec(context.TODO(), fq, false)
|
||||
_, err := ar.Exec(context.TODO())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// label `job` will collide with rule extra label and will make both time series equal
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
|
||||
_, err = ar.Exec(context.TODO(), fq, false)
|
||||
_, err = ar.Exec(context.TODO())
|
||||
if !errors.Is(err, errDuplicate) {
|
||||
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
|
||||
}
|
||||
@@ -456,7 +623,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
|
||||
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
_, err = ar.Exec(context.TODO(), fq, false)
|
||||
_, err = ar.Exec(context.TODO())
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
@@ -481,17 +648,15 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
hash(metricWithLabels(t, "region", "east", "instance", "foo")): {
|
||||
Annotations: map[string]string{},
|
||||
Labels: map[string]string{
|
||||
alertGroupNameLabel: "",
|
||||
"region": "east",
|
||||
"instance": "foo",
|
||||
"region": "east",
|
||||
"instance": "foo",
|
||||
},
|
||||
},
|
||||
hash(metricWithLabels(t, "region", "east", "instance", "bar")): {
|
||||
Annotations: map[string]string{},
|
||||
Labels: map[string]string{
|
||||
alertGroupNameLabel: "",
|
||||
"region": "east",
|
||||
"instance": "bar",
|
||||
"region": "east",
|
||||
"instance": "bar",
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -516,9 +681,8 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(metricWithLabels(t, "region", "east", "instance", "foo")): {
|
||||
Labels: map[string]string{
|
||||
alertGroupNameLabel: "",
|
||||
"instance": "foo",
|
||||
"region": "east",
|
||||
"instance": "foo",
|
||||
"region": "east",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `Too high connection number for "foo" for region east`,
|
||||
@@ -527,9 +691,8 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
},
|
||||
hash(metricWithLabels(t, "region", "east", "instance", "bar")): {
|
||||
Labels: map[string]string{
|
||||
alertGroupNameLabel: "",
|
||||
"instance": "bar",
|
||||
"region": "east",
|
||||
"instance": "bar",
|
||||
"region": "east",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `Too high connection number for "bar" for region east`,
|
||||
@@ -544,8 +707,9 @@ func TestAlertingRule_Template(t *testing.T) {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
tc.rule.q = fq
|
||||
fq.add(tc.metrics...)
|
||||
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
|
||||
if _, err := tc.rule.Exec(context.TODO()); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
for hash, expAlert := range tc.expAlerts {
|
||||
@@ -575,5 +739,5 @@ func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
|
||||
}
|
||||
|
||||
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
||||
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
|
||||
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor, EvalInterval: waitFor}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ import (
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
|
||||
@@ -16,7 +15,6 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
@@ -25,10 +23,14 @@ import (
|
||||
type Group struct {
|
||||
Type datasource.Type `yaml:"type,omitempty"`
|
||||
File string
|
||||
Name string `yaml:"name"`
|
||||
Interval time.Duration `yaml:"interval,omitempty"`
|
||||
Rules []Rule `yaml:"rules"`
|
||||
Concurrency int `yaml:"concurrency"`
|
||||
Name string `yaml:"name"`
|
||||
Interval utils.PromDuration `yaml:"interval,omitempty"`
|
||||
Rules []Rule `yaml:"rules"`
|
||||
Concurrency int `yaml:"concurrency"`
|
||||
// ExtraFilterLabels is a list label filters applied to every rule
|
||||
// request withing a group. Is compatible only with VM datasources.
|
||||
// See https://docs.victoriametrics.com#prometheus-querying-api-enhancements
|
||||
ExtraFilterLabels map[string]string `yaml:"extra_filter_labels"`
|
||||
// Checksum stores the hash of yaml definition for this group.
|
||||
// May be used to detect any changes like rules re-ordering etc.
|
||||
Checksum string
|
||||
@@ -115,54 +117,18 @@ func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
|
||||
// recording rule or alerting rule.
|
||||
type Rule struct {
|
||||
ID uint64
|
||||
Type datasource.Type `yaml:"type,omitempty"`
|
||||
Record string `yaml:"record,omitempty"`
|
||||
Alert string `yaml:"alert,omitempty"`
|
||||
Expr string `yaml:"expr"`
|
||||
For PromDuration `yaml:"for"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
Type datasource.Type `yaml:"type,omitempty"`
|
||||
Record string `yaml:"record,omitempty"`
|
||||
Alert string `yaml:"alert,omitempty"`
|
||||
Expr string `yaml:"expr"`
|
||||
For utils.PromDuration `yaml:"for"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// PromDuration is Prometheus duration.
|
||||
type PromDuration struct {
|
||||
milliseconds int64
|
||||
}
|
||||
|
||||
// NewPromDuration returns PromDuration for given d.
|
||||
func NewPromDuration(d time.Duration) PromDuration {
|
||||
return PromDuration{
|
||||
milliseconds: d.Milliseconds(),
|
||||
}
|
||||
}
|
||||
|
||||
// MarshalYAML implements yaml.Marshaler interface.
|
||||
func (pd PromDuration) MarshalYAML() (interface{}, error) {
|
||||
return pd.Duration().String(), nil
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements yaml.Unmarshaler interface.
|
||||
func (pd *PromDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
var s string
|
||||
if err := unmarshal(&s); err != nil {
|
||||
return err
|
||||
}
|
||||
ms, err := metricsql.DurationValue(s, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
pd.milliseconds = ms
|
||||
return nil
|
||||
}
|
||||
|
||||
// Duration returns duration for pd.
|
||||
func (pd *PromDuration) Duration() time.Duration {
|
||||
return time.Duration(pd.milliseconds) * time.Millisecond
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
type rule Rule
|
||||
|
||||
@@ -8,10 +8,9 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
@@ -264,7 +263,7 @@ func TestGroup_Validate(t *testing.T) {
|
||||
Rules: []Rule{
|
||||
{
|
||||
Expr: "sumSeries(time('foo.bar',10))",
|
||||
For: PromDuration{milliseconds: 10},
|
||||
For: utils.NewPromDuration(10 * time.Millisecond),
|
||||
},
|
||||
{
|
||||
Expr: "sum(up == 0 ) by (host)",
|
||||
@@ -280,7 +279,7 @@ func TestGroup_Validate(t *testing.T) {
|
||||
Rules: []Rule{
|
||||
{
|
||||
Expr: "sum(up == 0 ) by (host)",
|
||||
For: PromDuration{milliseconds: 10},
|
||||
For: utils.NewPromDuration(10 * time.Millisecond),
|
||||
},
|
||||
{
|
||||
Expr: "sumSeries(time('foo.bar',10))",
|
||||
@@ -348,7 +347,7 @@ func TestHashRule(t *testing.T) {
|
||||
true,
|
||||
},
|
||||
{
|
||||
Rule{Alert: "alert", Expr: "up == 1", For: NewPromDuration(time.Minute)},
|
||||
Rule{Alert: "alert", Expr: "up == 1", For: utils.NewPromDuration(time.Minute)},
|
||||
Rule{Alert: "alert", Expr: "up == 1"},
|
||||
true,
|
||||
},
|
||||
|
||||
15
app/vmalert/config/testdata/rules-query-good.rules
vendored
Normal file
15
app/vmalert/config/testdata/rules-query-good.rules
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
groups:
|
||||
- name: alertmanager.rules
|
||||
rules:
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
annotations:
|
||||
message: |
|
||||
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
|
||||
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
|
||||
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
|
||||
{{ end }}
|
||||
expr: |
|
||||
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="openshift-monitoring"})) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
39
app/vmalert/config/testdata/rules-replay-good.rules
vendored
Normal file
39
app/vmalert/config/testdata/rules-replay-good.rules
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
groups:
|
||||
- name: ReplayGroup
|
||||
interval: 1m
|
||||
concurrency: 1
|
||||
rules:
|
||||
- record: type:vm_cache_entries:rate5m
|
||||
expr: sum(rate(vm_cache_entries[5m])) by (type)
|
||||
labels:
|
||||
recording: true
|
||||
- record: go_cgo_calls_count:rate5m
|
||||
expr: rate(go_cgo_calls_count{job="vmdb"}[5m])
|
||||
labels:
|
||||
recording: true
|
||||
|
||||
- name: vmsingleReplay
|
||||
interval: 30s
|
||||
concurrency: 2
|
||||
rules:
|
||||
- alert: RequestErrorsToAPI
|
||||
expr: increase(vm_http_request_errors_total[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }}"
|
||||
summary: "Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})"
|
||||
description: "Requests to path {{ $labels.path }} are receiving errors.
|
||||
Please verify if clients are sending correct requests."
|
||||
|
||||
- alert: TooManyLogs
|
||||
expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) by (job, instance) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=67&var-instance={{ $labels.instance }}"
|
||||
summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
|
||||
description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n
|
||||
Worth to check logs for specific error messages."
|
||||
@@ -2,6 +2,8 @@ groups:
|
||||
- name: TestGroup
|
||||
interval: 2s
|
||||
concurrency: 2
|
||||
extra_filter_labels:
|
||||
job: victoriametrics
|
||||
rules:
|
||||
- alert: Conns
|
||||
expr: sum(vm_tcplistener_conns) by(instance) > 1
|
||||
|
||||
@@ -2,21 +2,33 @@ package datasource
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Querier interface wraps Query method which
|
||||
// executes given query and returns list of Metrics
|
||||
// as result
|
||||
// Querier interface wraps Query and QueryRange methods
|
||||
type Querier interface {
|
||||
Query(ctx context.Context, query string, engine Type) ([]Metric, error)
|
||||
Query(ctx context.Context, query string) ([]Metric, error)
|
||||
QueryRange(ctx context.Context, query string, from, to time.Time) ([]Metric, error)
|
||||
}
|
||||
|
||||
// QuerierBuilder builds Querier with given params.
|
||||
type QuerierBuilder interface {
|
||||
BuildWithParams(params QuerierParams) Querier
|
||||
}
|
||||
|
||||
// QuerierParams params for Querier.
|
||||
type QuerierParams struct {
|
||||
DataSourceType *Type
|
||||
EvaluationInterval time.Duration
|
||||
// see https://docs.victoriametrics.com/#prometheus-querying-api-enhancements
|
||||
ExtraLabels map[string]string
|
||||
}
|
||||
|
||||
// Metric is the basic entity which should be return by datasource
|
||||
// It represents single data point with full list of labels
|
||||
type Metric struct {
|
||||
Labels []Label
|
||||
Timestamp int64
|
||||
Value float64
|
||||
Labels []Label
|
||||
Timestamps []int64
|
||||
Values []float64
|
||||
}
|
||||
|
||||
// SetLabel adds or updates existing one label
|
||||
|
||||
18
app/vmalert/datasource/datasource_test.go
Normal file
18
app/vmalert/datasource/datasource_test.go
Normal file
@@ -0,0 +1,18 @@
|
||||
package datasource
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestMetric_Label(t *testing.T) {
|
||||
m := &Metric{}
|
||||
|
||||
m.AddLabel("foo", "bar")
|
||||
checkEqualString(t, "bar", m.Label("foo"))
|
||||
|
||||
m.SetLabel("foo", "baz")
|
||||
checkEqualString(t, "baz", m.Label("foo"))
|
||||
|
||||
m.SetLabel("qux", "quux")
|
||||
checkEqualString(t, "quux", m.Label("qux"))
|
||||
|
||||
checkEqualString(t, "", m.Label("non-existing"))
|
||||
}
|
||||
@@ -4,43 +4,59 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
appendTypePrefix = flag.Bool("datasource.appendTypePrefix", false, "Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to VMSelect URL.")
|
||||
addr = flag.String("datasource.url", "", "VictoriaMetrics or vmselect url. Required parameter. "+
|
||||
"E.g. http://127.0.0.1:8428")
|
||||
appendTypePrefix = flag.Bool("datasource.appendTypePrefix", false, "Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.")
|
||||
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
|
||||
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
|
||||
|
||||
tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
|
||||
tlsCertFile = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
|
||||
tlsKeyFile = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
|
||||
tlsCAFile = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
|
||||
"By default system CA is used")
|
||||
tlsServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
|
||||
"By default the server name from -datasource.url is used")
|
||||
tlsCAFile = flag.String("datasource.tlsCAFile", "", `Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used`)
|
||||
tlsServerName = flag.String("datasource.tlsServerName", "", `Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used`)
|
||||
|
||||
lookBack = flag.Duration("datasource.lookback", 0, "Lookback defines how far to look into past when evaluating queries. "+
|
||||
"For example, if datasource.lookback=5m then param \"time\" with value now()-5m will be added to every query.")
|
||||
lookBack = flag.Duration("datasource.lookback", 0, `Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.`)
|
||||
queryStep = flag.Duration("datasource.queryStep", 0, "queryStep defines how far a value can fallback to when evaluating queries. "+
|
||||
"For example, if datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query.")
|
||||
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, "Defines the number of idle (keep-alive connections) to configured datasource."+
|
||||
"Consider to set this value equal to the value: groups_total * group.concurrency. Too low value may result into high number of sockets in TIME_WAIT state.")
|
||||
"For example, if datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query."+
|
||||
"If queryStep isn't specified, rule's evaluationInterval will be used instead.")
|
||||
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
|
||||
roundDigits = flag.Int("datasource.roundDigits", 0, `Adds "round_digits" GET param to datasource requests. `+
|
||||
`In VM "round_digits" limits the number of digits after the decimal point in response values.`)
|
||||
)
|
||||
|
||||
// Init creates a Querier from provided flag values.
|
||||
func Init() (Querier, error) {
|
||||
func Init() (QuerierBuilder, error) {
|
||||
if *addr == "" {
|
||||
return nil, fmt.Errorf("datasource.url is empty")
|
||||
}
|
||||
|
||||
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
tr.MaxIdleConns = *maxIdleConnections
|
||||
c := &http.Client{Transport: tr}
|
||||
return NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, *lookBack, *queryStep, *appendTypePrefix, c), nil
|
||||
tr.MaxIdleConnsPerHost = *maxIdleConnections
|
||||
|
||||
var rd string
|
||||
if *roundDigits > 0 {
|
||||
rd = fmt.Sprintf("%d", *roundDigits)
|
||||
}
|
||||
|
||||
return &VMStorage{
|
||||
c: &http.Client{Transport: tr},
|
||||
basicAuthUser: *basicAuthUsername,
|
||||
basicAuthPass: *basicAuthPassword,
|
||||
datasourceURL: strings.TrimSuffix(*addr, "/"),
|
||||
appendTypePrefix: *appendTypePrefix,
|
||||
lookBack: *lookBack,
|
||||
queryStep: *queryStep,
|
||||
roundDigits: rd,
|
||||
dataSourceType: NewPrometheusType(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -2,76 +2,13 @@ package datasource
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type response struct {
|
||||
Status string `json:"status"`
|
||||
Data struct {
|
||||
ResultType string `json:"resultType"`
|
||||
Result []struct {
|
||||
Labels map[string]string `json:"metric"`
|
||||
TV [2]interface{} `json:"value"`
|
||||
} `json:"result"`
|
||||
} `json:"data"`
|
||||
ErrorType string `json:"errorType"`
|
||||
Error string `json:"error"`
|
||||
}
|
||||
|
||||
func (r response) metrics() ([]Metric, error) {
|
||||
var ms []Metric
|
||||
var m Metric
|
||||
var f float64
|
||||
var err error
|
||||
for i, res := range r.Data.Result {
|
||||
f, err = strconv.ParseFloat(res.TV[1].(string), 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
|
||||
}
|
||||
m.Labels = nil
|
||||
for k, v := range r.Data.Result[i].Labels {
|
||||
m.AddLabel(k, v)
|
||||
}
|
||||
m.Timestamp = int64(res.TV[0].(float64))
|
||||
m.Value = f
|
||||
ms = append(ms, m)
|
||||
}
|
||||
return ms, nil
|
||||
}
|
||||
|
||||
type graphiteResponse []graphiteResponseTarget
|
||||
|
||||
type graphiteResponseTarget struct {
|
||||
Target string `json:"target"`
|
||||
Tags map[string]string `json:"tags"`
|
||||
DataPoints [][2]float64 `json:"datapoints"`
|
||||
}
|
||||
|
||||
func (r graphiteResponse) metrics() []Metric {
|
||||
var ms []Metric
|
||||
for _, res := range r {
|
||||
if len(res.DataPoints) < 1 {
|
||||
continue
|
||||
}
|
||||
var m Metric
|
||||
// add only last value to the result.
|
||||
last := res.DataPoints[len(res.DataPoints)-1]
|
||||
m.Value = last[0]
|
||||
m.Timestamp = int64(last[1])
|
||||
for k, v := range res.Tags {
|
||||
m.AddLabel(k, v)
|
||||
}
|
||||
ms = append(ms, m)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
|
||||
// VMStorage represents vmstorage entity with ability to read and write metrics
|
||||
type VMStorage struct {
|
||||
c *http.Client
|
||||
@@ -81,13 +18,43 @@ type VMStorage struct {
|
||||
appendTypePrefix bool
|
||||
lookBack time.Duration
|
||||
queryStep time.Duration
|
||||
roundDigits string
|
||||
|
||||
dataSourceType Type
|
||||
evaluationInterval time.Duration
|
||||
extraLabels []string
|
||||
}
|
||||
|
||||
const queryPath = "/api/v1/query"
|
||||
const graphitePath = "/render"
|
||||
// Clone makes clone of VMStorage, shares http client.
|
||||
func (s *VMStorage) Clone() *VMStorage {
|
||||
return &VMStorage{
|
||||
c: s.c,
|
||||
datasourceURL: s.datasourceURL,
|
||||
basicAuthUser: s.basicAuthUser,
|
||||
basicAuthPass: s.basicAuthPass,
|
||||
lookBack: s.lookBack,
|
||||
queryStep: s.queryStep,
|
||||
appendTypePrefix: s.appendTypePrefix,
|
||||
dataSourceType: s.dataSourceType,
|
||||
}
|
||||
}
|
||||
|
||||
const prometheusPrefix = "/prometheus"
|
||||
const graphitePrefix = "/graphite"
|
||||
// ApplyParams - changes given querier params.
|
||||
func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage {
|
||||
if params.DataSourceType != nil {
|
||||
s.dataSourceType = *params.DataSourceType
|
||||
}
|
||||
s.evaluationInterval = params.EvaluationInterval
|
||||
for k, v := range params.ExtraLabels {
|
||||
s.extraLabels = append(s.extraLabels, fmt.Sprintf("%s=%s", k, v))
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// BuildWithParams - implements interface.
|
||||
func (s *VMStorage) BuildWithParams(params QuerierParams) Querier {
|
||||
return s.Clone().ApplyParams(params)
|
||||
}
|
||||
|
||||
// NewVMStorage is a constructor for VMStorage
|
||||
func NewVMStorage(baseURL, basicAuthUser, basicAuthPass string, lookBack time.Duration, queryStep time.Duration, appendTypePrefix bool, c *http.Client) *VMStorage {
|
||||
@@ -99,27 +66,84 @@ func NewVMStorage(baseURL, basicAuthUser, basicAuthPass string, lookBack time.Du
|
||||
appendTypePrefix: appendTypePrefix,
|
||||
lookBack: lookBack,
|
||||
queryStep: queryStep,
|
||||
dataSourceType: NewPrometheusType(),
|
||||
}
|
||||
}
|
||||
|
||||
// Query reads metrics from datasource by given query and type
|
||||
func (s *VMStorage) Query(ctx context.Context, query string, dataSourceType Type) ([]Metric, error) {
|
||||
switch dataSourceType.name {
|
||||
// Query executes the given query and returns parsed response
|
||||
func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
|
||||
req, err := s.newRequestPOST()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ts := time.Now()
|
||||
switch s.dataSourceType.name {
|
||||
case "", prometheusType:
|
||||
return s.queryDataSource(ctx, query, s.setPrometheusReqParams, parsePrometheusResponse)
|
||||
s.setPrometheusInstantReqParams(req, query, ts)
|
||||
case graphiteType:
|
||||
return s.queryDataSource(ctx, query, s.setGraphiteReqParams, parseGraphiteResponse)
|
||||
s.setGraphiteReqParams(req, query, ts)
|
||||
default:
|
||||
return nil, fmt.Errorf("engine not found: %q", dataSourceType)
|
||||
return nil, fmt.Errorf("engine not found: %q", s.dataSourceType.name)
|
||||
}
|
||||
|
||||
resp, err := s.do(ctx, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
_ = resp.Body.Close()
|
||||
}()
|
||||
|
||||
parseFn := parsePrometheusResponse
|
||||
if s.dataSourceType.name != prometheusType {
|
||||
parseFn = parseGraphiteResponse
|
||||
}
|
||||
return parseFn(req, resp)
|
||||
}
|
||||
|
||||
func (s *VMStorage) queryDataSource(
|
||||
ctx context.Context,
|
||||
query string,
|
||||
setReqParams func(r *http.Request, query string),
|
||||
processResponse func(r *http.Request, resp *http.Response,
|
||||
) ([]Metric, error)) ([]Metric, error) {
|
||||
// QueryRange executes the given query on the given time range.
|
||||
// For Prometheus type see https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries
|
||||
// Graphite type isn't supported.
|
||||
func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end time.Time) ([]Metric, error) {
|
||||
if s.dataSourceType.name != prometheusType {
|
||||
return nil, fmt.Errorf("%q is not supported for QueryRange", s.dataSourceType.name)
|
||||
}
|
||||
req, err := s.newRequestPOST()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if start.IsZero() {
|
||||
return nil, fmt.Errorf("start param is missing")
|
||||
}
|
||||
if end.IsZero() {
|
||||
return nil, fmt.Errorf("end param is missing")
|
||||
}
|
||||
s.setPrometheusRangeReqParams(req, query, start, end)
|
||||
resp, err := s.do(ctx, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
_ = resp.Body.Close()
|
||||
}()
|
||||
return parsePrometheusResponse(req, resp)
|
||||
}
|
||||
|
||||
func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) {
|
||||
resp, err := s.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
|
||||
}
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := ioutil.ReadAll(resp.Body)
|
||||
_ = resp.Body.Close()
|
||||
return nil, fmt.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func (s *VMStorage) newRequestPOST() (*http.Request, error) {
|
||||
req, err := http.NewRequest("POST", s.datasourceURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -128,79 +152,5 @@ func (s *VMStorage) queryDataSource(
|
||||
if s.basicAuthPass != "" {
|
||||
req.SetBasicAuth(s.basicAuthUser, s.basicAuthPass)
|
||||
}
|
||||
setReqParams(req, query)
|
||||
resp, err := s.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := ioutil.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("datasource returns unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
|
||||
}
|
||||
return processResponse(req, resp)
|
||||
}
|
||||
|
||||
func (s *VMStorage) setPrometheusReqParams(r *http.Request, query string) {
|
||||
if s.appendTypePrefix {
|
||||
r.URL.Path += prometheusPrefix
|
||||
}
|
||||
r.URL.Path += queryPath
|
||||
q := r.URL.Query()
|
||||
q.Set("query", query)
|
||||
if s.lookBack > 0 {
|
||||
lookBack := time.Now().Add(-s.lookBack)
|
||||
q.Set("time", fmt.Sprintf("%d", lookBack.Unix()))
|
||||
}
|
||||
if s.queryStep > 0 {
|
||||
q.Set("step", s.queryStep.String())
|
||||
}
|
||||
r.URL.RawQuery = q.Encode()
|
||||
}
|
||||
|
||||
func (s *VMStorage) setGraphiteReqParams(r *http.Request, query string) {
|
||||
if s.appendTypePrefix {
|
||||
r.URL.Path += graphitePrefix
|
||||
}
|
||||
r.URL.Path += graphitePath
|
||||
q := r.URL.Query()
|
||||
q.Set("format", "json")
|
||||
q.Set("target", query)
|
||||
from := "-5min"
|
||||
if s.lookBack > 0 {
|
||||
lookBack := time.Now().Add(-s.lookBack)
|
||||
from = strconv.FormatInt(lookBack.Unix(), 10)
|
||||
}
|
||||
q.Set("from", from)
|
||||
q.Set("until", "now")
|
||||
r.URL.RawQuery = q.Encode()
|
||||
}
|
||||
|
||||
const (
|
||||
statusSuccess, statusError, rtVector = "success", "error", "vector"
|
||||
)
|
||||
|
||||
func parsePrometheusResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
|
||||
r := &response{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return nil, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL, err)
|
||||
}
|
||||
if r.Status == statusError {
|
||||
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
|
||||
}
|
||||
if r.Status != statusSuccess {
|
||||
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
|
||||
}
|
||||
if r.Data.ResultType != rtVector {
|
||||
return nil, fmt.Errorf("unknown result type:%s. Expected vector", r.Data.ResultType)
|
||||
}
|
||||
return r.metrics()
|
||||
}
|
||||
|
||||
func parseGraphiteResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
|
||||
r := &graphiteResponse{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return nil, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL, err)
|
||||
}
|
||||
return r.metrics(), nil
|
||||
return req, nil
|
||||
}
|
||||
|
||||
67
app/vmalert/datasource/vm_graphite_api.go
Normal file
67
app/vmalert/datasource/vm_graphite_api.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package datasource
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
type graphiteResponse []graphiteResponseTarget
|
||||
|
||||
type graphiteResponseTarget struct {
|
||||
Target string `json:"target"`
|
||||
Tags map[string]string `json:"tags"`
|
||||
DataPoints [][2]float64 `json:"datapoints"`
|
||||
}
|
||||
|
||||
func (r graphiteResponse) metrics() []Metric {
|
||||
var ms []Metric
|
||||
for _, res := range r {
|
||||
if len(res.DataPoints) < 1 {
|
||||
continue
|
||||
}
|
||||
var m Metric
|
||||
// add only last value to the result.
|
||||
last := res.DataPoints[len(res.DataPoints)-1]
|
||||
m.Values = append(m.Values, last[0])
|
||||
m.Timestamps = append(m.Timestamps, int64(last[1]))
|
||||
for k, v := range res.Tags {
|
||||
m.AddLabel(k, v)
|
||||
}
|
||||
ms = append(ms, m)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
|
||||
func parseGraphiteResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
|
||||
r := &graphiteResponse{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return nil, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL, err)
|
||||
}
|
||||
return r.metrics(), nil
|
||||
}
|
||||
|
||||
const (
|
||||
graphitePath = "/render"
|
||||
graphitePrefix = "/graphite"
|
||||
)
|
||||
|
||||
func (s *VMStorage) setGraphiteReqParams(r *http.Request, query string, timestamp time.Time) {
|
||||
if s.appendTypePrefix {
|
||||
r.URL.Path += graphitePrefix
|
||||
}
|
||||
r.URL.Path += graphitePath
|
||||
q := r.URL.Query()
|
||||
q.Set("format", "json")
|
||||
q.Set("target", query)
|
||||
from := "-5min"
|
||||
if s.lookBack > 0 {
|
||||
lookBack := timestamp.Add(-s.lookBack)
|
||||
from = strconv.FormatInt(lookBack.Unix(), 10)
|
||||
}
|
||||
q.Set("from", from)
|
||||
q.Set("until", "now")
|
||||
r.URL.RawQuery = q.Encode()
|
||||
}
|
||||
165
app/vmalert/datasource/vm_prom_api.go
Normal file
165
app/vmalert/datasource/vm_prom_api.go
Normal file
@@ -0,0 +1,165 @@
|
||||
package datasource
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
// promResponse is the top-level envelope of a Prometheus query API
// response as decoded by parsePrometheusResponse.
type promResponse struct {
	// Status is compared against statusSuccess and statusError.
	Status string `json:"status"`

	// ErrorType and Error are reported to the caller when Status is
	// statusError.
	ErrorType string `json:"errorType"`
	Error     string `json:"error"`

	Data struct {
		// ResultType selects how Result is decoded (rtVector or rtMatrix).
		ResultType string `json:"resultType"`
		// Result is kept raw so that it can be decoded once ResultType
		// is known.
		Result json.RawMessage `json:"result"`
	} `json:"data"`
}
|
||||
|
||||
// promInstant holds the decoded "result" of a vector (instant query)
// response.
type promInstant struct {
	Result []struct {
		// Labels is decoded from the "metric" JSON field.
		Labels map[string]string `json:"metric"`
		// TV is a [timestamp, value] pair. metrics() expects the
		// timestamp to be decoded as float64 and the value as string.
		TV [2]interface{} `json:"value"`
	} `json:"result"`
}
|
||||
|
||||
// promRange holds the decoded "result" of a matrix (range query) response.
type promRange struct {
	Result []struct {
		// Labels is decoded from the "metric" JSON field.
		Labels map[string]string `json:"metric"`
		// TVs is a list of [timestamp, value] pairs. metrics() expects
		// each timestamp to be decoded as float64 and each value as
		// string.
		TVs [][2]interface{} `json:"values"`
	} `json:"result"`
}
|
||||
|
||||
func (r promInstant) metrics() ([]Metric, error) {
|
||||
var result []Metric
|
||||
for i, res := range r.Result {
|
||||
f, err := strconv.ParseFloat(res.TV[1].(string), 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
|
||||
}
|
||||
var m Metric
|
||||
for k, v := range r.Result[i].Labels {
|
||||
m.AddLabel(k, v)
|
||||
}
|
||||
m.Timestamps = append(m.Timestamps, int64(res.TV[0].(float64)))
|
||||
m.Values = append(m.Values, f)
|
||||
result = append(result, m)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (r promRange) metrics() ([]Metric, error) {
|
||||
var result []Metric
|
||||
for i, res := range r.Result {
|
||||
var m Metric
|
||||
for _, tv := range res.TVs {
|
||||
f, err := strconv.ParseFloat(tv[1].(string), 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, tv[1], err)
|
||||
}
|
||||
m.Values = append(m.Values, f)
|
||||
m.Timestamps = append(m.Timestamps, int64(tv[0].(float64)))
|
||||
}
|
||||
if len(m.Values) < 1 || len(m.Timestamps) < 1 {
|
||||
return nil, fmt.Errorf("metric %v contains no values", res)
|
||||
}
|
||||
m.Labels = nil
|
||||
for k, v := range r.Result[i].Labels {
|
||||
m.AddLabel(k, v)
|
||||
}
|
||||
result = append(result, m)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
const (
	// Values of the "status" field checked by parsePrometheusResponse.
	statusSuccess = "success"
	statusError   = "error"

	// Values of the "resultType" field handled by parsePrometheusResponse.
	rtVector = "vector"
	rtMatrix = "matrix"
)
|
||||
|
||||
func parsePrometheusResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
|
||||
r := &promResponse{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return nil, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL, err)
|
||||
}
|
||||
if r.Status == statusError {
|
||||
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
|
||||
}
|
||||
if r.Status != statusSuccess {
|
||||
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
|
||||
}
|
||||
switch r.Data.ResultType {
|
||||
case rtVector:
|
||||
var pi promInstant
|
||||
if err := json.Unmarshal(r.Data.Result, &pi.Result); err != nil {
|
||||
return nil, fmt.Errorf("umarshal err %s; \n %#v", err, string(r.Data.Result))
|
||||
}
|
||||
return pi.metrics()
|
||||
case rtMatrix:
|
||||
var pr promRange
|
||||
if err := json.Unmarshal(r.Data.Result, &pr.Result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return pr.metrics()
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown result type %q", r.Data.ResultType)
|
||||
}
|
||||
}
|
||||
|
||||
// URL path components used when building Prometheus requests.
const (
	// prometheusPrefix is inserted before the query path when
	// VMStorage.appendTypePrefix is set.
	prometheusPrefix = "/prometheus"
	// prometheusInstantPath is the path used for instant queries.
	prometheusInstantPath = "/api/v1/query"
	// prometheusRangePath is the path used for range queries.
	prometheusRangePath = "/api/v1/query_range"
)
|
||||
|
||||
func (s *VMStorage) setPrometheusInstantReqParams(r *http.Request, query string, timestamp time.Time) {
|
||||
if s.appendTypePrefix {
|
||||
r.URL.Path += prometheusPrefix
|
||||
}
|
||||
r.URL.Path += prometheusInstantPath
|
||||
q := r.URL.Query()
|
||||
if s.lookBack > 0 {
|
||||
timestamp = timestamp.Add(-s.lookBack)
|
||||
}
|
||||
if s.evaluationInterval > 0 {
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1232
|
||||
timestamp = timestamp.Truncate(s.evaluationInterval)
|
||||
}
|
||||
q.Set("time", fmt.Sprintf("%d", timestamp.Unix()))
|
||||
r.URL.RawQuery = q.Encode()
|
||||
s.setPrometheusReqParams(r, query)
|
||||
}
|
||||
|
||||
func (s *VMStorage) setPrometheusRangeReqParams(r *http.Request, query string, start, end time.Time) {
|
||||
if s.appendTypePrefix {
|
||||
r.URL.Path += prometheusPrefix
|
||||
}
|
||||
r.URL.Path += prometheusRangePath
|
||||
q := r.URL.Query()
|
||||
q.Add("start", fmt.Sprintf("%d", start.Unix()))
|
||||
q.Add("end", fmt.Sprintf("%d", end.Unix()))
|
||||
r.URL.RawQuery = q.Encode()
|
||||
s.setPrometheusReqParams(r, query)
|
||||
}
|
||||
|
||||
func (s *VMStorage) setPrometheusReqParams(r *http.Request, query string) {
|
||||
q := r.URL.Query()
|
||||
q.Set("query", query)
|
||||
if s.evaluationInterval > 0 {
|
||||
// set step as evaluationInterval by default
|
||||
q.Set("step", s.evaluationInterval.String())
|
||||
}
|
||||
if s.queryStep > 0 {
|
||||
// override step with user-specified value
|
||||
q.Set("step", s.queryStep.String())
|
||||
}
|
||||
if s.roundDigits != "" {
|
||||
q.Set("round_digits", s.roundDigits)
|
||||
}
|
||||
for _, l := range s.extraLabels {
|
||||
q.Add("extra_label", l)
|
||||
}
|
||||
r.URL.RawQuery = q.Encode()
|
||||
}
|
||||
@@ -2,9 +2,12 @@ package datasource
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -17,7 +20,7 @@ var (
|
||||
queryRender = "constantLine(10)"
|
||||
)
|
||||
|
||||
func TestVMSelectQuery(t *testing.T) {
|
||||
func TestVMInstantQuery(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
|
||||
t.Errorf("should not be called")
|
||||
@@ -63,32 +66,133 @@ func TestVMSelectQuery(t *testing.T) {
|
||||
case 5:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix"}}`))
|
||||
case 6:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]}]}}`))
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]},{"metric":{"__name__":"vm_requests"},"value":[1583786140,"2000"]}]}}`))
|
||||
}
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
am := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, time.Minute, 0, false, srv.Client())
|
||||
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
|
||||
|
||||
s := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, time.Minute, 0, false, srv.Client())
|
||||
|
||||
p := NewPrometheusType()
|
||||
pq := s.BuildWithParams(QuerierParams{DataSourceType: &p, EvaluationInterval: 15 * time.Second})
|
||||
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
t.Fatalf("expected connection error got nil")
|
||||
}
|
||||
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
t.Fatalf("expected invalid response status error got nil")
|
||||
}
|
||||
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
t.Fatalf("expected response body error got nil")
|
||||
}
|
||||
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
t.Fatalf("expected error status got nil")
|
||||
}
|
||||
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
t.Fatalf("expected unknown status got nil")
|
||||
}
|
||||
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
|
||||
if _, err := pq.Query(ctx, query); err == nil {
|
||||
t.Fatalf("expected non-vector resultType error got nil")
|
||||
}
|
||||
m, err := am.Query(ctx, query, NewPrometheusType())
|
||||
m, err := pq.Query(ctx, query)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
if len(m) != 2 {
|
||||
t.Fatalf("expected 2 metrics got %d in %+v", len(m), m)
|
||||
}
|
||||
expected := []Metric{
|
||||
{
|
||||
Labels: []Label{{Value: "vm_rows", Name: "__name__"}},
|
||||
Timestamps: []int64{1583786142},
|
||||
Values: []float64{13763},
|
||||
},
|
||||
{
|
||||
Labels: []Label{{Value: "vm_requests", Name: "__name__"}},
|
||||
Timestamps: []int64{1583786140},
|
||||
Values: []float64{2000},
|
||||
},
|
||||
}
|
||||
if !reflect.DeepEqual(m, expected) {
|
||||
t.Fatalf("unexpected metric %+v want %+v", m, expected)
|
||||
}
|
||||
|
||||
g := NewGraphiteType()
|
||||
gq := s.BuildWithParams(QuerierParams{DataSourceType: &g})
|
||||
|
||||
m, err = gq.Query(ctx, queryRender)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
if len(m) != 1 {
|
||||
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
|
||||
}
|
||||
exp := Metric{
|
||||
Labels: []Label{{Value: "constantLine(10)", Name: "name"}},
|
||||
Timestamps: []int64{1611758403},
|
||||
Values: []float64{10},
|
||||
}
|
||||
if !reflect.DeepEqual(m[0], exp) {
|
||||
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVMRangeQuery(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
|
||||
t.Errorf("should not be called")
|
||||
})
|
||||
c := -1
|
||||
mux.HandleFunc("/api/v1/query_range", func(w http.ResponseWriter, r *http.Request) {
|
||||
c++
|
||||
if r.Method != http.MethodPost {
|
||||
t.Errorf("expected POST method got %s", r.Method)
|
||||
}
|
||||
if name, pass, _ := r.BasicAuth(); name != basicAuthName || pass != basicAuthPass {
|
||||
t.Errorf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass)
|
||||
}
|
||||
if r.URL.Query().Get("query") != query {
|
||||
t.Errorf("expected %s in query param, got %s", query, r.URL.Query().Get("query"))
|
||||
}
|
||||
startTS := r.URL.Query().Get("start")
|
||||
if startTS == "" {
|
||||
t.Errorf("expected 'start' in query param, got nil instead")
|
||||
}
|
||||
if _, err := strconv.ParseInt(startTS, 10, 64); err != nil {
|
||||
t.Errorf("failed to parse 'start' query param: %s", err)
|
||||
}
|
||||
endTS := r.URL.Query().Get("end")
|
||||
if endTS == "" {
|
||||
t.Errorf("expected 'end' in query param, got nil instead")
|
||||
}
|
||||
if _, err := strconv.ParseInt(endTS, 10, 64); err != nil {
|
||||
t.Errorf("failed to parse 'end' query param: %s", err)
|
||||
}
|
||||
switch c {
|
||||
case 0:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"__name__":"vm_rows"},"values":[[1583786142,"13763"]]}]}}`))
|
||||
}
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
|
||||
s := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, time.Minute, 0, false, srv.Client())
|
||||
|
||||
p := NewPrometheusType()
|
||||
pq := s.BuildWithParams(QuerierParams{DataSourceType: &p, EvaluationInterval: 15 * time.Second})
|
||||
|
||||
_, err := pq.QueryRange(ctx, query, time.Now(), time.Time{})
|
||||
expectError(t, err, "is missing")
|
||||
|
||||
_, err = pq.QueryRange(ctx, query, time.Time{}, time.Now())
|
||||
expectError(t, err, "is missing")
|
||||
|
||||
start, end := time.Now().Add(-time.Minute), time.Now()
|
||||
|
||||
m, err := pq.QueryRange(ctx, query, start, end)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -96,32 +200,263 @@ func TestVMSelectQuery(t *testing.T) {
|
||||
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
|
||||
}
|
||||
expected := Metric{
|
||||
Labels: []Label{{Value: "vm_rows", Name: "__name__"}},
|
||||
Timestamp: 1583786142,
|
||||
Value: 13763,
|
||||
Labels: []Label{{Value: "vm_rows", Name: "__name__"}},
|
||||
Timestamps: []int64{1583786142},
|
||||
Values: []float64{13763},
|
||||
}
|
||||
if m[0].Timestamp != expected.Timestamp &&
|
||||
m[0].Value != expected.Value &&
|
||||
m[0].Labels[0].Value != expected.Labels[0].Value &&
|
||||
m[0].Labels[0].Name != expected.Labels[0].Name {
|
||||
if !reflect.DeepEqual(m[0], expected) {
|
||||
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
|
||||
}
|
||||
m, err = am.Query(ctx, queryRender, NewGraphiteType())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
|
||||
g := NewGraphiteType()
|
||||
gq := s.BuildWithParams(QuerierParams{DataSourceType: &g})
|
||||
|
||||
_, err = gq.QueryRange(ctx, queryRender, start, end)
|
||||
expectError(t, err, "is not supported")
|
||||
}
|
||||
|
||||
func TestRequestParams(t *testing.T) {
|
||||
query := "up"
|
||||
timestamp := time.Date(2001, 2, 3, 4, 5, 6, 0, time.UTC)
|
||||
testCases := []struct {
|
||||
name string
|
||||
queryRange bool
|
||||
vm *VMStorage
|
||||
checkFn func(t *testing.T, r *http.Request)
|
||||
}{
|
||||
{
|
||||
"prometheus path",
|
||||
false,
|
||||
&VMStorage{
|
||||
dataSourceType: NewPrometheusType(),
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
checkEqualString(t, prometheusInstantPath, r.URL.Path)
|
||||
},
|
||||
},
|
||||
{
|
||||
"prometheus prefix",
|
||||
false,
|
||||
&VMStorage{
|
||||
dataSourceType: NewPrometheusType(),
|
||||
appendTypePrefix: true,
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
checkEqualString(t, prometheusPrefix+prometheusInstantPath, r.URL.Path)
|
||||
},
|
||||
},
|
||||
{
|
||||
"prometheus range path",
|
||||
true,
|
||||
&VMStorage{
|
||||
dataSourceType: NewPrometheusType(),
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
checkEqualString(t, prometheusRangePath, r.URL.Path)
|
||||
},
|
||||
},
|
||||
{
|
||||
"prometheus range prefix",
|
||||
true,
|
||||
&VMStorage{
|
||||
dataSourceType: NewPrometheusType(),
|
||||
appendTypePrefix: true,
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
checkEqualString(t, prometheusPrefix+prometheusRangePath, r.URL.Path)
|
||||
},
|
||||
},
|
||||
{
|
||||
"graphite path",
|
||||
false,
|
||||
&VMStorage{
|
||||
dataSourceType: NewGraphiteType(),
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
checkEqualString(t, graphitePath, r.URL.Path)
|
||||
},
|
||||
},
|
||||
{
|
||||
"graphite prefix",
|
||||
false,
|
||||
&VMStorage{
|
||||
dataSourceType: NewGraphiteType(),
|
||||
appendTypePrefix: true,
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
checkEqualString(t, graphitePrefix+graphitePath, r.URL.Path)
|
||||
},
|
||||
},
|
||||
{
|
||||
"default params",
|
||||
false,
|
||||
&VMStorage{},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
exp := fmt.Sprintf("query=%s&time=%d", query, timestamp.Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"default range params",
|
||||
true,
|
||||
&VMStorage{},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
exp := fmt.Sprintf("end=%d&query=%s&start=%d", timestamp.Unix(), query, timestamp.Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"basic auth",
|
||||
false,
|
||||
&VMStorage{
|
||||
basicAuthUser: "foo",
|
||||
basicAuthPass: "bar",
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
u, p, _ := r.BasicAuth()
|
||||
checkEqualString(t, "foo", u)
|
||||
checkEqualString(t, "bar", p)
|
||||
},
|
||||
},
|
||||
{
|
||||
"basic auth range",
|
||||
true,
|
||||
&VMStorage{
|
||||
basicAuthUser: "foo",
|
||||
basicAuthPass: "bar",
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
u, p, _ := r.BasicAuth()
|
||||
checkEqualString(t, "foo", u)
|
||||
checkEqualString(t, "bar", p)
|
||||
},
|
||||
},
|
||||
{
|
||||
"lookback",
|
||||
false,
|
||||
&VMStorage{
|
||||
lookBack: time.Minute,
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
exp := fmt.Sprintf("query=%s&time=%d", query, timestamp.Add(-time.Minute).Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"evaluation interval",
|
||||
false,
|
||||
&VMStorage{
|
||||
evaluationInterval: 15 * time.Second,
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
evalInterval := 15 * time.Second
|
||||
tt := timestamp.Truncate(evalInterval)
|
||||
exp := fmt.Sprintf("query=%s&step=%v&time=%d", query, evalInterval, tt.Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"lookback + evaluation interval",
|
||||
false,
|
||||
&VMStorage{
|
||||
lookBack: time.Minute,
|
||||
evaluationInterval: 15 * time.Second,
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
evalInterval := 15 * time.Second
|
||||
tt := timestamp.Add(-time.Minute)
|
||||
tt = tt.Truncate(evalInterval)
|
||||
exp := fmt.Sprintf("query=%s&step=%v&time=%d", query, evalInterval, tt.Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"step override",
|
||||
false,
|
||||
&VMStorage{
|
||||
queryStep: time.Minute,
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
exp := fmt.Sprintf("query=%s&step=%v&time=%d", query, time.Minute, timestamp.Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"round digits",
|
||||
false,
|
||||
&VMStorage{
|
||||
roundDigits: "10",
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
exp := fmt.Sprintf("query=%s&round_digits=10&time=%d", query, timestamp.Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"extra labels",
|
||||
false,
|
||||
&VMStorage{
|
||||
extraLabels: []string{
|
||||
"env=prod",
|
||||
"query=es=cape",
|
||||
},
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
exp := fmt.Sprintf("extra_label=env%%3Dprod&extra_label=query%%3Des%%3Dcape&query=%s&time=%d", query, timestamp.Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
{
|
||||
"extra labels range",
|
||||
true,
|
||||
&VMStorage{
|
||||
extraLabels: []string{
|
||||
"env=prod",
|
||||
"query=es=cape",
|
||||
},
|
||||
},
|
||||
func(t *testing.T, r *http.Request) {
|
||||
exp := fmt.Sprintf("end=%d&extra_label=env%%3Dprod&extra_label=query%%3Des%%3Dcape&query=%s&start=%d",
|
||||
timestamp.Unix(), query, timestamp.Unix())
|
||||
checkEqualString(t, exp, r.URL.RawQuery)
|
||||
},
|
||||
},
|
||||
}
|
||||
if len(m) != 1 {
|
||||
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
|
||||
}
|
||||
expected = Metric{
|
||||
Labels: []Label{{Value: "constantLine(10)", Name: "name"}},
|
||||
Timestamp: 1611758403,
|
||||
Value: 10,
|
||||
}
|
||||
if m[0].Timestamp != expected.Timestamp &&
|
||||
m[0].Value != expected.Value &&
|
||||
m[0].Labels[0].Value != expected.Labels[0].Value &&
|
||||
m[0].Labels[0].Name != expected.Labels[0].Name {
|
||||
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
req, err := tc.vm.newRequestPOST()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
switch tc.vm.dataSourceType.name {
|
||||
case "", prometheusType:
|
||||
if tc.queryRange {
|
||||
tc.vm.setPrometheusRangeReqParams(req, query, timestamp, timestamp)
|
||||
} else {
|
||||
tc.vm.setPrometheusInstantReqParams(req, query, timestamp)
|
||||
}
|
||||
case graphiteType:
|
||||
tc.vm.setGraphiteReqParams(req, query, timestamp)
|
||||
}
|
||||
tc.checkFn(t, req)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// checkEqualString reports a non-fatal test error when got differs from
// exp.
func checkEqualString(t *testing.T, exp, got string) {
	t.Helper()
	if exp == got {
		return
	}
	t.Errorf("expected to get %q; got %q", exp, got)
}
|
||||
|
||||
// expectError reports a non-fatal test error when err is nil or when its
// message does not contain exp.
//
// It returns right after reporting a nil err: calling err.Error() on a nil
// error would panic and abort the whole test binary instead of recording a
// regular test failure.
func expectError(t *testing.T, err error, exp string) {
	t.Helper()
	if err == nil {
		t.Errorf("expected non-nil error")
		return
	}
	if !strings.Contains(err.Error(), exp) {
		t.Errorf("expected error %q to contain %q", err, exp)
	}
}
|
||||
|
||||
@@ -18,14 +18,15 @@ import (
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
type Group struct {
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Type datasource.Type
|
||||
Interval time.Duration
|
||||
Concurrency int
|
||||
Checksum string
|
||||
mu sync.RWMutex
|
||||
Name string
|
||||
File string
|
||||
Rules []Rule
|
||||
Type datasource.Type
|
||||
Interval time.Duration
|
||||
Concurrency int
|
||||
Checksum string
|
||||
ExtraFilterLabels map[string]string
|
||||
|
||||
doneCh chan struct{}
|
||||
finishedCh chan struct{}
|
||||
@@ -49,17 +50,19 @@ func newGroupMetrics(name, file string) *groupMetrics {
|
||||
return m
|
||||
}
|
||||
|
||||
func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string]string) *Group {
|
||||
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
|
||||
g := &Group{
|
||||
Type: cfg.Type,
|
||||
Name: cfg.Name,
|
||||
File: cfg.File,
|
||||
Interval: cfg.Interval,
|
||||
Concurrency: cfg.Concurrency,
|
||||
Checksum: cfg.Checksum,
|
||||
doneCh: make(chan struct{}),
|
||||
finishedCh: make(chan struct{}),
|
||||
updateCh: make(chan *Group),
|
||||
Type: cfg.Type,
|
||||
Name: cfg.Name,
|
||||
File: cfg.File,
|
||||
Interval: cfg.Interval.Duration(),
|
||||
Concurrency: cfg.Concurrency,
|
||||
Checksum: cfg.Checksum,
|
||||
ExtraFilterLabels: cfg.ExtraFilterLabels,
|
||||
|
||||
doneCh: make(chan struct{}),
|
||||
finishedCh: make(chan struct{}),
|
||||
updateCh: make(chan *Group),
|
||||
}
|
||||
g.metrics = newGroupMetrics(g.Name, g.File)
|
||||
if g.Interval == 0 {
|
||||
@@ -81,17 +84,17 @@ func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string
|
||||
}
|
||||
r.Labels[k] = v
|
||||
}
|
||||
rules[i] = g.newRule(r)
|
||||
rules[i] = g.newRule(qb, r)
|
||||
}
|
||||
g.Rules = rules
|
||||
return g
|
||||
}
|
||||
|
||||
func (g *Group) newRule(rule config.Rule) Rule {
|
||||
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
|
||||
if rule.Alert != "" {
|
||||
return newAlertingRule(g, rule)
|
||||
return newAlertingRule(qb, g, rule)
|
||||
}
|
||||
return newRecordingRule(g, rule)
|
||||
return newRecordingRule(qb, g, rule)
|
||||
}
|
||||
|
||||
// ID return unique group ID that consists of
|
||||
@@ -106,7 +109,7 @@ func (g *Group) ID() uint64 {
|
||||
}
|
||||
|
||||
// Restore restores alerts state for group rules
|
||||
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
|
||||
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, lookback time.Duration, labels map[string]string) error {
|
||||
for _, rule := range g.Rules {
|
||||
rr, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
@@ -115,6 +118,9 @@ func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time
|
||||
if rr.For < 1 {
|
||||
continue
|
||||
}
|
||||
// ignore g.ExtraFilterLabels on purpose, so it
|
||||
// won't affect the restore procedure.
|
||||
q := qb.BuildWithParams(datasource.QuerierParams{})
|
||||
if err := rr.Restore(ctx, q, lookback, labels); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
|
||||
}
|
||||
@@ -162,6 +168,7 @@ func (g *Group) updateWith(newGroup *Group) error {
|
||||
}
|
||||
g.Type = newGroup.Type
|
||||
g.Concurrency = newGroup.Concurrency
|
||||
g.ExtraFilterLabels = newGroup.ExtraFilterLabels
|
||||
g.Checksum = newGroup.Checksum
|
||||
g.Rules = newRules
|
||||
return nil
|
||||
@@ -189,7 +196,7 @@ func (g *Group) close() {
|
||||
|
||||
var skipRandSleepOnGroupStart bool
|
||||
|
||||
func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []notifier.Notifier, rw *remotewrite.Client) {
|
||||
func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewrite.Client) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
|
||||
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
||||
@@ -213,7 +220,7 @@ func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []not
|
||||
}
|
||||
|
||||
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
e := &executor{querier, nts, rw}
|
||||
e := &executor{nts, rw}
|
||||
t := time.NewTicker(g.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
@@ -256,22 +263,16 @@ func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []not
|
||||
}
|
||||
|
||||
type executor struct {
|
||||
querier datasource.Querier
|
||||
notifiers []notifier.Notifier
|
||||
rw *remotewrite.Client
|
||||
}
|
||||
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
|
||||
res := make(chan error, len(rules))
|
||||
var returnSeries bool
|
||||
if e.rw != nil {
|
||||
returnSeries = true
|
||||
}
|
||||
|
||||
if concurrency == 1 {
|
||||
// fast path
|
||||
for _, rule := range rules {
|
||||
res <- e.exec(ctx, rule, returnSeries, interval)
|
||||
res <- e.exec(ctx, rule, interval)
|
||||
}
|
||||
close(res)
|
||||
return res
|
||||
@@ -284,7 +285,7 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
|
||||
sem <- struct{}{}
|
||||
wg.Add(1)
|
||||
go func(r Rule) {
|
||||
res <- e.exec(ctx, r, returnSeries, interval)
|
||||
res <- e.exec(ctx, r, interval)
|
||||
<-sem
|
||||
wg.Done()
|
||||
}(rule)
|
||||
@@ -303,14 +304,14 @@ var (
|
||||
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||
)
|
||||
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, interval time.Duration) error {
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error {
|
||||
execTotal.Inc()
|
||||
execStart := time.Now()
|
||||
defer func() {
|
||||
execDuration.UpdateDuration(execStart)
|
||||
}()
|
||||
|
||||
tss, err := rule.Exec(ctx, e.querier, returnSeries)
|
||||
tss, err := rule.Exec(ctx)
|
||||
if err != nil {
|
||||
execErrors.Inc()
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
|
||||
|
||||
@@ -7,7 +7,9 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
func init() {
|
||||
@@ -32,7 +34,7 @@ func TestUpdateWith(t *testing.T) {
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 0",
|
||||
For: config.NewPromDuration(time.Second),
|
||||
For: utils.NewPromDuration(time.Second),
|
||||
Labels: map[string]string{
|
||||
"bar": "baz",
|
||||
},
|
||||
@@ -44,7 +46,7 @@ func TestUpdateWith(t *testing.T) {
|
||||
[]config.Rule{{
|
||||
Alert: "foo",
|
||||
Expr: "up > 10",
|
||||
For: config.NewPromDuration(time.Second),
|
||||
For: utils.NewPromDuration(time.Second),
|
||||
Labels: map[string]string{
|
||||
"baz": "bar",
|
||||
},
|
||||
@@ -105,20 +107,32 @@ func TestUpdateWith(t *testing.T) {
|
||||
{Record: "foo5"},
|
||||
},
|
||||
},
|
||||
{
|
||||
"update datasource type",
|
||||
[]config.Rule{
|
||||
{Alert: "foo1", Type: datasource.NewPrometheusType()},
|
||||
{Alert: "foo3", Type: datasource.NewGraphiteType()},
|
||||
},
|
||||
[]config.Rule{
|
||||
{Alert: "foo1", Type: datasource.NewGraphiteType()},
|
||||
{Alert: "foo10", Type: datasource.NewPrometheusType()},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
g := &Group{Name: "test"}
|
||||
qb := &fakeQuerier{}
|
||||
for _, r := range tc.currentRules {
|
||||
r.ID = config.HashRule(r)
|
||||
g.Rules = append(g.Rules, g.newRule(r))
|
||||
g.Rules = append(g.Rules, g.newRule(qb, r))
|
||||
}
|
||||
|
||||
ng := &Group{Name: "test"}
|
||||
for _, r := range tc.newRules {
|
||||
r.ID = config.HashRule(r)
|
||||
ng.Rules = append(ng.Rules, ng.newRule(r))
|
||||
ng.Rules = append(ng.Rules, ng.newRule(qb, r))
|
||||
}
|
||||
|
||||
err := g.updateWith(ng)
|
||||
@@ -156,11 +170,11 @@ func TestGroupStart(t *testing.T) {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], evalInterval, map[string]string{"cluster": "east-1"})
|
||||
g.Concurrency = 2
|
||||
|
||||
fn := &fakeNotifier{}
|
||||
fs := &fakeQuerier{}
|
||||
fn := &fakeNotifier{}
|
||||
|
||||
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
|
||||
g.Concurrency = 2
|
||||
|
||||
const inst1, inst2, job = "foo", "bar", "baz"
|
||||
m1 := metricWithLabels(t, "instance", inst1, "job", job)
|
||||
@@ -195,7 +209,7 @@ func TestGroupStart(t *testing.T) {
|
||||
fs.add(m1)
|
||||
fs.add(m2)
|
||||
go func() {
|
||||
g.start(context.Background(), fs, []notifier.Notifier{fn}, nil)
|
||||
g.start(context.Background(), []notifier.Notifier{fn}, nil)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
@@ -38,7 +39,15 @@ func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ datasource.Type) ([]datasource.Metric, error) {
|
||||
func (fq *fakeQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
|
||||
return fq
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) ([]datasource.Metric, error) {
|
||||
return fq.Query(ctx, q)
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
@@ -68,9 +77,16 @@ func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
return metricWithValuesAndLabels(t, []float64{value}, labels...)
|
||||
}
|
||||
|
||||
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Value = value
|
||||
m.Values = values
|
||||
for i := range values {
|
||||
m.Timestamps = append(m.Timestamps, int64(i))
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
@@ -79,7 +95,7 @@ func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{}
|
||||
m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
@@ -160,6 +176,9 @@ func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
|
||||
if !reflect.DeepEqual(a.Labels, b.Labels) {
|
||||
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
|
||||
}
|
||||
if a.Type.String() != b.Type.String() {
|
||||
return fmt.Errorf("expected to have Type %#v; got %#v", a.Type.String(), b.Type.String())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -26,14 +26,17 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
|
||||
Supports patterns. Flag can be specified multiple times.
|
||||
rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
|
||||
Supports patterns. Flag can be specified multiple times.
|
||||
Examples:
|
||||
-rule="/path/to/file". Path to a single file with alerting rules
|
||||
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
|
||||
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
|
||||
absolute path to all .yaml files in root.
|
||||
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.`)
|
||||
|
||||
rulesCheckInterval = flag.Duration("rule.configCheckInterval", 0, "Interval for checking for changes in '-rule' files. "+
|
||||
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes")
|
||||
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
||||
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
|
||||
|
||||
@@ -41,12 +44,13 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor
|
||||
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
|
||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
|
||||
externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'name=value' to add to all generated recording rules and alerts. "+
|
||||
"Pass multiple -label flags in order to add multiple label sets.")
|
||||
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup.")
|
||||
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The `-rule` flag must be specified.")
|
||||
)
|
||||
@@ -64,42 +68,54 @@ func main() {
|
||||
notifier.InitTemplateFunc(u)
|
||||
groups, err := config.Parse(*rulePath, true, true)
|
||||
if err != nil {
|
||||
logger.Fatalf(err.Error())
|
||||
logger.Fatalf("failed to parse %q: %s", *rulePath, err)
|
||||
}
|
||||
if len(groups) == 0 {
|
||||
logger.Fatalf("No rules for validation. Please specify path to file(s) with alerting and/or recording rules using `-rule` flag")
|
||||
}
|
||||
return
|
||||
}
|
||||
if *replayFrom != "" || *replayTo != "" {
|
||||
rw, err := remotewrite.Init(context.Background())
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to init remoteWrite: %s", err)
|
||||
}
|
||||
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to init `external.url`: %s", err)
|
||||
}
|
||||
notifier.InitTemplateFunc(eu)
|
||||
groupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions)
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot parse configuration file: %s", err)
|
||||
}
|
||||
q, err := datasource.Init()
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to init datasource: %s", err)
|
||||
}
|
||||
if err := replay(groupsCfg, q, rw); err != nil {
|
||||
logger.Fatalf("replay failed: %s", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
manager, err := newManager(ctx)
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to init: %s", err)
|
||||
}
|
||||
if err := manager.start(ctx, *rulePath, *validateTemplates, *validateExpressions); err != nil {
|
||||
|
||||
logger.Infof("reading rules configuration file from %q", strings.Join(*rulePath, ";"))
|
||||
groupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions)
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot parse configuration file: %s", err)
|
||||
}
|
||||
|
||||
if err := manager.start(ctx, groupsCfg); err != nil {
|
||||
logger.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
go func() {
|
||||
// init reload metrics with positive values to improve alerting conditions
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
sigHup := procutil.NewSighupChan()
|
||||
for {
|
||||
<-sigHup
|
||||
configReloads.Inc()
|
||||
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
|
||||
if err := manager.update(ctx, *rulePath, *validateTemplates, *validateExpressions, false); err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("error while reloading rules: %s", err)
|
||||
continue
|
||||
}
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
||||
}
|
||||
}()
|
||||
go configReload(ctx, manager, groupsCfg)
|
||||
|
||||
rh := &requestHandler{m: manager}
|
||||
go httpserver.Serve(*httpListenAddr, rh.handler)
|
||||
@@ -140,10 +156,10 @@ func newManager(ctx context.Context) (*manager, error) {
|
||||
}
|
||||
|
||||
manager := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querier: q,
|
||||
notifiers: nts,
|
||||
labels: map[string]string{},
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: q,
|
||||
notifiers: nts,
|
||||
labels: map[string]string{},
|
||||
}
|
||||
rw, err := remotewrite.Init(ctx)
|
||||
if err != nil {
|
||||
@@ -218,7 +234,66 @@ func usage() {
|
||||
const s = `
|
||||
vmalert processes alerts and recording rules.
|
||||
|
||||
See the docs at https://victoriametrics.github.io/vmalert.html .
|
||||
See the docs at https://docs.victoriametrics.com/vmalert.html .
|
||||
`
|
||||
flagutil.Usage(s)
|
||||
}
|
||||
|
||||
func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) {
|
||||
// Register SIGHUP handler for config re-read just before manager.start call.
|
||||
// This guarantees that the config will be re-read if the signal arrives during manager.start call.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
|
||||
var configCheckCh <-chan time.Time
|
||||
if *rulesCheckInterval > 0 {
|
||||
ticker := time.NewTicker(*rulesCheckInterval)
|
||||
configCheckCh = ticker.C
|
||||
defer ticker.Stop()
|
||||
}
|
||||
|
||||
// init reload metrics with positive values to improve alerting conditions
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-sighupCh:
|
||||
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
|
||||
configReloads.Inc()
|
||||
case <-configCheckCh:
|
||||
}
|
||||
newGroupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions)
|
||||
if err != nil {
|
||||
logger.Errorf("cannot parse configuration file: %s", err)
|
||||
continue
|
||||
}
|
||||
if configsEqual(newGroupsCfg, groupsCfg) {
|
||||
// config didn't change - skip it
|
||||
continue
|
||||
}
|
||||
groupsCfg = newGroupsCfg
|
||||
if err := m.update(ctx, groupsCfg, false); err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("error while reloading rules: %s", err)
|
||||
continue
|
||||
}
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
||||
}
|
||||
}
|
||||
|
||||
func configsEqual(a, b []config.Group) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i := range a {
|
||||
if a[i].Checksum != b[i].Checksum {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
)
|
||||
|
||||
func TestGetExternalURL(t *testing.T) {
|
||||
@@ -51,3 +55,95 @@ func TestGetAlertURLGenerator(t *testing.T) {
|
||||
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfigReload(t *testing.T) {
|
||||
originalRulePath := *rulePath
|
||||
defer func() {
|
||||
*rulePath = originalRulePath
|
||||
}()
|
||||
|
||||
const (
|
||||
rules1 = `
|
||||
groups:
|
||||
- name: group-1
|
||||
rules:
|
||||
- alert: ExampleAlertAlwaysFiring
|
||||
expr: sum by(job) (up == 1)
|
||||
- record: handler:requests:rate5m
|
||||
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
|
||||
`
|
||||
rules2 = `
|
||||
groups:
|
||||
- name: group-1
|
||||
rules:
|
||||
- alert: ExampleAlertAlwaysFiring
|
||||
expr: sum by(job) (up == 1)
|
||||
- name: group-2
|
||||
rules:
|
||||
- record: handler:requests:rate5m
|
||||
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
|
||||
`
|
||||
)
|
||||
|
||||
f, err := ioutil.TempFile("", "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
writeToFile(t, f.Name(), rules1)
|
||||
|
||||
*rulesCheckInterval = 200 * time.Millisecond
|
||||
*rulePath = []string{f.Name()}
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
m := &manager{
|
||||
querierBuilder: &fakeQuerier{},
|
||||
groups: make(map[uint64]*Group),
|
||||
labels: map[string]string{},
|
||||
}
|
||||
go configReload(ctx, m, nil)
|
||||
|
||||
lenLocked := func(m *manager) int {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
return len(m.groups)
|
||||
}
|
||||
|
||||
time.Sleep(*rulesCheckInterval * 2)
|
||||
groupsLen := lenLocked(m)
|
||||
if groupsLen != 1 {
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
}
|
||||
|
||||
writeToFile(t, f.Name(), rules2)
|
||||
time.Sleep(*rulesCheckInterval * 2)
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 2 {
|
||||
fmt.Println(m.groups)
|
||||
t.Fatalf("expected to have exactly 2 groups loaded; got %d", groupsLen)
|
||||
}
|
||||
|
||||
writeToFile(t, f.Name(), rules1)
|
||||
procutil.SelfSIGHUP()
|
||||
time.Sleep(*rulesCheckInterval / 2)
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 1 {
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
}
|
||||
|
||||
writeToFile(t, f.Name(), `corrupted`)
|
||||
procutil.SelfSIGHUP()
|
||||
time.Sleep(*rulesCheckInterval / 2)
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 1 { // should remain unchanged
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
}
|
||||
}
|
||||
|
||||
// writeToFile stores b as the whole content of file via ioutil.WriteFile
// with 0644 permissions. The test is aborted with t.Fatal on a write error.
func writeToFile(t *testing.T, file, b string) {
	t.Helper()
	if writeErr := ioutil.WriteFile(file, []byte(b), 0644); writeErr != nil {
		t.Fatal(writeErr)
	}
}
|
||||
|
||||
@@ -3,7 +3,6 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
@@ -15,11 +14,12 @@ import (
|
||||
|
||||
// manager controls group states
|
||||
type manager struct {
|
||||
querier datasource.Querier
|
||||
notifiers []notifier.Notifier
|
||||
querierBuilder datasource.QuerierBuilder
|
||||
notifiers []notifier.Notifier
|
||||
|
||||
rw *remotewrite.Client
|
||||
rr datasource.Querier
|
||||
// remote read builder.
|
||||
rr datasource.QuerierBuilder
|
||||
|
||||
wg sync.WaitGroup
|
||||
labels map[string]string
|
||||
@@ -49,8 +49,8 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
|
||||
}
|
||||
|
||||
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
|
||||
return m.update(ctx, path, validateTpl, validateExpr, true)
|
||||
func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {
|
||||
return m.update(ctx, groupsCfg, true)
|
||||
}
|
||||
|
||||
func (m *manager) close() {
|
||||
@@ -63,10 +63,13 @@ func (m *manager) close() {
|
||||
m.wg.Wait()
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
|
||||
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) error {
|
||||
if restore && m.rr != nil {
|
||||
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
|
||||
if err != nil {
|
||||
if !*remoteReadIgnoreRestoreErrors {
|
||||
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
|
||||
}
|
||||
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
||||
}
|
||||
}
|
||||
@@ -74,22 +77,17 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
|
||||
m.wg.Add(1)
|
||||
id := group.ID()
|
||||
go func() {
|
||||
group.start(ctx, m.querier, m.notifiers, m.rw)
|
||||
group.start(ctx, m.notifiers, m.rw)
|
||||
m.wg.Done()
|
||||
}()
|
||||
m.groups[id] = group
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
|
||||
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
|
||||
groupsCfg, err := config.Parse(path, validateTpl, validateExpr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse configuration file: %w", err)
|
||||
}
|
||||
|
||||
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
|
||||
groupsRegistry := make(map[uint64]*Group)
|
||||
for _, cfg := range groupsCfg {
|
||||
ng := newGroup(cfg, *evaluationInterval, m.labels)
|
||||
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
|
||||
groupsRegistry[ng.ID()] = ng
|
||||
}
|
||||
|
||||
@@ -116,7 +114,9 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida
|
||||
}
|
||||
}
|
||||
for _, ng := range groupsRegistry {
|
||||
m.startGroup(ctx, ng, restore)
|
||||
if err := m.startGroup(ctx, ng, restore); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
m.groupsMu.Unlock()
|
||||
|
||||
@@ -140,12 +140,14 @@ func (g *Group) toAPI() APIGroup {
|
||||
|
||||
ag := APIGroup{
|
||||
// encode as string to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
Name: g.Name,
|
||||
Type: g.Type.String(),
|
||||
File: g.File,
|
||||
Interval: g.Interval.String(),
|
||||
Concurrency: g.Concurrency,
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
|
||||
Name: g.Name,
|
||||
Type: g.Type.String(),
|
||||
File: g.File,
|
||||
Interval: g.Interval.String(),
|
||||
Concurrency: g.Concurrency,
|
||||
ExtraFilterLabels: g.ExtraFilterLabels,
|
||||
}
|
||||
for _, r := range g.Rules {
|
||||
switch v := r.(type) {
|
||||
|
||||
@@ -9,8 +9,8 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
)
|
||||
|
||||
@@ -25,9 +25,8 @@ func TestMain(m *testing.M) {
|
||||
// starting with empty rules folder
|
||||
func TestManagerEmptyRulesDir(t *testing.T) {
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
path := []string{"foo/bar"}
|
||||
err := m.update(context.Background(), path, true, true, false)
|
||||
if err != nil {
|
||||
cfg := loadCfg(t, []string{"foo/bar"}, true, true)
|
||||
if err := m.update(context.Background(), cfg, false); err != nil {
|
||||
t.Fatalf("expected to load succesfully with empty rules dir; got err instead: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -37,9 +36,9 @@ func TestManagerEmptyRulesDir(t *testing.T) {
|
||||
// Should be executed with -race flag
|
||||
func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querier: &fakeQuerier{},
|
||||
notifiers: []notifier.Notifier{&fakeNotifier{}},
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
notifiers: []notifier.Notifier{&fakeNotifier{}},
|
||||
}
|
||||
paths := []string{
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
@@ -50,8 +49,11 @@ func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
"config/testdata/rules1-good.rules",
|
||||
"config/testdata/rules2-good.rules",
|
||||
}
|
||||
evalInterval := *evaluationInterval
|
||||
defer func() { *evaluationInterval = evalInterval }()
|
||||
*evaluationInterval = time.Millisecond
|
||||
if err := m.start(context.Background(), []string{paths[0]}, true, true); err != nil {
|
||||
cfg := loadCfg(t, []string{paths[0]}, true, true)
|
||||
if err := m.start(context.Background(), cfg); err != nil {
|
||||
t.Fatalf("failed to start: %s", err)
|
||||
}
|
||||
|
||||
@@ -64,8 +66,11 @@ func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
defer wg.Done()
|
||||
for i := 0; i < iterations; i++ {
|
||||
rnd := rand.Intn(len(paths))
|
||||
path := []string{paths[rnd]}
|
||||
_ = m.update(context.Background(), path, true, true, false)
|
||||
cfg, err := config.Parse([]string{paths[rnd]}, true, true)
|
||||
if err != nil { // update can fail and this is expected
|
||||
continue
|
||||
}
|
||||
_ = m.update(context.Background(), cfg, false)
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -242,14 +247,17 @@ func TestManagerUpdate(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.TODO())
|
||||
m := &manager{groups: make(map[uint64]*Group), querier: &fakeQuerier{}}
|
||||
path := []string{tc.initPath}
|
||||
if err := m.update(ctx, path, true, true, false); err != nil {
|
||||
m := &manager{groups: make(map[uint64]*Group), querierBuilder: &fakeQuerier{}}
|
||||
|
||||
cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
|
||||
if err := m.update(ctx, cfgInit, false); err != nil {
|
||||
t.Fatalf("failed to complete initial rules update: %s", err)
|
||||
}
|
||||
|
||||
path = []string{tc.updatePath}
|
||||
_ = m.update(ctx, path, true, true, false)
|
||||
cfgUpdate, err := config.Parse([]string{tc.updatePath}, true, true)
|
||||
if err == nil { // update can fail and that's expected
|
||||
_ = m.update(ctx, cfgUpdate, false)
|
||||
}
|
||||
if len(tc.want) != len(m.groups) {
|
||||
t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups))
|
||||
}
|
||||
@@ -267,3 +275,12 @@ func TestManagerUpdate(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressions bool) []config.Group {
|
||||
t.Helper()
|
||||
cfg, err := config.Parse(path, validateAnnotations, validateExpressions)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
@@ -83,14 +83,16 @@ func TestAlert_ExecTemplate(t *testing.T) {
|
||||
{Name: "foo", Value: "bar"},
|
||||
{Name: "baz", Value: "qux"},
|
||||
},
|
||||
Value: 1,
|
||||
Values: []float64{1},
|
||||
Timestamps: []int64{1},
|
||||
},
|
||||
{
|
||||
Labels: []datasource.Label{
|
||||
{Name: "foo", Value: "garply"},
|
||||
{Name: "baz", Value: "fred"},
|
||||
},
|
||||
Value: 2,
|
||||
Values: []float64{2},
|
||||
Timestamps: []int64{1},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -28,44 +28,73 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
)
|
||||
|
||||
// metric is a package-private counterpart of datasource.Metric
// which is used for templating annotations.
// Labels are stored as a map in order to simplify templates evaluation.
type metric struct {
	// Labels maps a label name to its value.
	Labels map[string]string
	// Timestamp is the timestamp of the sample held by the metric.
	Timestamp int64
	// Value is the value of the sample held by the metric.
	Value float64
}
|
||||
|
||||
// datasourceMetricsToTemplateMetrics converts Metrics from datasource package to private copy for templating.
|
||||
func datasourceMetricsToTemplateMetrics(ms []datasource.Metric) []metric {
|
||||
mss := make([]metric, 0, len(ms))
|
||||
for _, m := range ms {
|
||||
labelsMap := make(map[string]string, len(m.Labels))
|
||||
for _, labelValue := range m.Labels {
|
||||
labelsMap[labelValue.Name] = labelValue.Value
|
||||
}
|
||||
mss = append(mss, metric{
|
||||
Labels: labelsMap,
|
||||
Timestamp: m.Timestamps[0],
|
||||
Value: m.Values[0]})
|
||||
}
|
||||
return mss
|
||||
}
|
||||
|
||||
// QueryFn wraps a call to datasource into a simple-to-use function
|
||||
// which template functions can call with just the query text.
|
||||
type QueryFn func(query string) ([]datasource.Metric, error)
|
||||
|
||||
func funcsWithQuery(query QueryFn) textTpl.FuncMap {
|
||||
fm := make(textTpl.FuncMap)
|
||||
for k, fn := range tmplFunc {
|
||||
fm[k] = fn
|
||||
}
|
||||
fm["query"] = func(q string) ([]datasource.Metric, error) {
|
||||
return query(q)
|
||||
}
|
||||
return fm
|
||||
}
|
||||
|
||||
var tmplFunc textTpl.FuncMap
|
||||
|
||||
// InitTemplateFunc initiates template helper functions
|
||||
func InitTemplateFunc(externalURL *url.URL) {
|
||||
tmplFunc = textTpl.FuncMap{
|
||||
"args": func(args ...interface{}) map[string]interface{} {
|
||||
result := make(map[string]interface{})
|
||||
for i, a := range args {
|
||||
result[fmt.Sprintf("arg%d", i)] = a
|
||||
}
|
||||
return result
|
||||
},
|
||||
/* Strings */
|
||||
|
||||
// reReplaceAll ReplaceAllString returns a copy of src, replacing matches of the Regexp with
|
||||
// the replacement string repl. Inside repl, $ signs are interpreted as in Expand,
|
||||
// so for instance $1 represents the text of the first submatch.
|
||||
// alias for https://golang.org/pkg/regexp/#Regexp.ReplaceAllString
|
||||
"reReplaceAll": func(pattern, repl, text string) string {
|
||||
re := regexp.MustCompile(pattern)
|
||||
return re.ReplaceAllString(text, repl)
|
||||
},
|
||||
"safeHtml": func(text string) htmlTpl.HTML {
|
||||
return htmlTpl.HTML(text)
|
||||
},
|
||||
"match": regexp.MatchString,
|
||||
"title": strings.Title,
|
||||
|
||||
// match reports whether the string s
|
||||
// contains any match of the regular expression pattern.
|
||||
// alias for https://golang.org/pkg/regexp/#MatchString
|
||||
"match": regexp.MatchString,
|
||||
|
||||
// title returns a copy of the string s with all Unicode letters
|
||||
// that begin words mapped to their Unicode title case.
|
||||
// alias for https://golang.org/pkg/strings/#Title
|
||||
"title": strings.Title,
|
||||
|
||||
// toUpper returns s with all Unicode letters mapped to their upper case.
|
||||
// alias for https://golang.org/pkg/strings/#ToUpper
|
||||
"toUpper": strings.ToUpper,
|
||||
|
||||
// toLower returns s with all Unicode letters mapped to their lower case.
|
||||
// alias for https://golang.org/pkg/strings/#ToLower
|
||||
"toLower": strings.ToLower,
|
||||
|
||||
/* Numbers */
|
||||
|
||||
// humanize converts given number to a human readable format
|
||||
// by adding metric prefixes https://en.wikipedia.org/wiki/Metric_prefix
|
||||
"humanize": func(v float64) string {
|
||||
if v == 0 || math.IsNaN(v) || math.IsInf(v, 0) {
|
||||
return fmt.Sprintf("%.4g", v)
|
||||
@@ -91,6 +120,8 @@ func InitTemplateFunc(externalURL *url.URL) {
|
||||
}
|
||||
return fmt.Sprintf("%.4g%s", v, prefix)
|
||||
},
|
||||
|
||||
// humanize1024 converts given number to a human readable format with 1024 as base
|
||||
"humanize1024": func(v float64) string {
|
||||
if math.Abs(v) <= 1 || math.IsNaN(v) || math.IsInf(v, 0) {
|
||||
return fmt.Sprintf("%.4g", v)
|
||||
@@ -105,6 +136,8 @@ func InitTemplateFunc(externalURL *url.URL) {
|
||||
}
|
||||
return fmt.Sprintf("%.4g%s", v, prefix)
|
||||
},
|
||||
|
||||
// humanizeDuration converts given seconds to a human readable duration
|
||||
"humanizeDuration": func(v float64) string {
|
||||
if math.IsNaN(v) || math.IsInf(v, 0) {
|
||||
return fmt.Sprintf("%.4g", v)
|
||||
@@ -145,9 +178,13 @@ func InitTemplateFunc(externalURL *url.URL) {
|
||||
}
|
||||
return fmt.Sprintf("%.4g%ss", v, prefix)
|
||||
},
|
||||
|
||||
// humanizePercentage converts given ratio value to a fraction of 100
|
||||
"humanizePercentage": func(v float64) string {
|
||||
return fmt.Sprintf("%.4g%%", v*100)
|
||||
},
|
||||
|
||||
// humanizeTimestamp converts given timestamp to a human readable time equivalent
|
||||
"humanizeTimestamp": func(v float64) string {
|
||||
if math.IsNaN(v) || math.IsInf(v, 0) {
|
||||
return fmt.Sprintf("%.4g", v)
|
||||
@@ -155,48 +192,115 @@ func InitTemplateFunc(externalURL *url.URL) {
|
||||
t := TimeFromUnixNano(int64(v * 1e9)).Time().UTC()
|
||||
return fmt.Sprint(t)
|
||||
},
|
||||
"pathPrefix": func() string {
|
||||
return externalURL.Path
|
||||
},
|
||||
|
||||
/* URLs */
|
||||
|
||||
// externalURL returns value of `external.url` flag
|
||||
"externalURL": func() string {
|
||||
return externalURL.String()
|
||||
},
|
||||
|
||||
// pathPrefix returns a Path segment from the URL value in `external.url` flag
|
||||
"pathPrefix": func() string {
|
||||
return externalURL.Path
|
||||
},
|
||||
|
||||
// pathEscape escapes the string so it can be safely placed inside a URL path segment,
|
||||
// replacing special characters (including /) with %XX sequences as needed.
|
||||
// alias for https://golang.org/pkg/net/url/#PathEscape
|
||||
"pathEscape": func(u string) string {
|
||||
return url.PathEscape(u)
|
||||
},
|
||||
|
||||
// queryEscape escapes the string so it can be safely placed
|
||||
// inside a URL query.
|
||||
// alias for https://golang.org/pkg/net/url/#QueryEscape
|
||||
"queryEscape": func(q string) string {
|
||||
return url.QueryEscape(q)
|
||||
},
|
||||
|
||||
// crlfEscape replaces new line chars to skip URL encoding.
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/890
|
||||
"crlfEscape": func(q string) string {
|
||||
q = strings.Replace(q, "\n", `\n`, -1)
|
||||
return strings.Replace(q, "\r", `\r`, -1)
|
||||
},
|
||||
|
||||
// quotesEscape escapes quote char
|
||||
"quotesEscape": func(q string) string {
|
||||
return strings.Replace(q, `"`, `\"`, -1)
|
||||
},
|
||||
// query function supposed to be substituted at funcsWithQuery().
|
||||
// it is present here only for validation purposes, when there is no
|
||||
// provided datasource.
|
||||
"query": func(q string) ([]datasource.Metric, error) {
|
||||
|
||||
// query executes the MetricsQL/PromQL query against
|
||||
// configured `datasource.url` address.
|
||||
// For example, {{ query "foo" | first | value }} will
|
||||
// execute "/api/v1/query?query=foo" request and will return
|
||||
// the first value in response.
|
||||
"query": func(q string) ([]metric, error) {
|
||||
// query function supposed to be substituted at funcsWithQuery().
|
||||
// it is present here only for validation purposes, when there is no
|
||||
// provided datasource.
|
||||
//
|
||||
// return non-empty slice to pass validation with chained functions in template
|
||||
// see issue #989 for details
|
||||
return []datasource.Metric{{}}, nil
|
||||
return []metric{{}}, nil
|
||||
},
|
||||
"first": func(metrics []datasource.Metric) (datasource.Metric, error) {
|
||||
|
||||
// first returns the first by order element from the given metrics list.
|
||||
// usually used alongside with `query` template function.
|
||||
"first": func(metrics []metric) (metric, error) {
|
||||
if len(metrics) > 0 {
|
||||
return metrics[0], nil
|
||||
}
|
||||
return datasource.Metric{}, errors.New("first() called on vector with no elements")
|
||||
return metric{}, errors.New("first() called on vector with no elements")
|
||||
},
|
||||
"label": func(label string, m datasource.Metric) string {
|
||||
return m.Label(label)
|
||||
|
||||
// label returns the value of the given label name for the given metric.
|
||||
// usually used alongside with `query` template function.
|
||||
"label": func(label string, m metric) string {
|
||||
return m.Labels[label]
|
||||
},
|
||||
"value": func(m datasource.Metric) float64 {
|
||||
|
||||
// value returns the value of the given metric.
|
||||
// usually used alongside with `query` template function.
|
||||
"value": func(m metric) float64 {
|
||||
return m.Value
|
||||
},
|
||||
|
||||
/* Helpers */
|
||||
|
||||
// Converts a list of objects to a map with keys arg0, arg1 etc.
|
||||
// This is intended to allow multiple arguments to be passed to templates.
|
||||
"args": func(args ...interface{}) map[string]interface{} {
|
||||
result := make(map[string]interface{})
|
||||
for i, a := range args {
|
||||
result[fmt.Sprintf("arg%d", i)] = a
|
||||
}
|
||||
return result
|
||||
},
|
||||
|
||||
// safeHtml marks string as HTML not requiring auto-escaping.
|
||||
"safeHtml": func(text string) htmlTpl.HTML {
|
||||
return htmlTpl.HTML(text)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func funcsWithQuery(query QueryFn) textTpl.FuncMap {
|
||||
fm := make(textTpl.FuncMap)
|
||||
for k, fn := range tmplFunc {
|
||||
fm[k] = fn
|
||||
}
|
||||
fm["query"] = func(q string) ([]metric, error) {
|
||||
result, err := query(q)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return datasourceMetricsToTemplateMetrics(result), nil
|
||||
}
|
||||
return fm
|
||||
}
|
||||
|
||||
// Time represents an instant as the number of milliseconds elapsed
|
||||
// since the epoch (1970-01-01 00:00 UTC), excluding leap seconds.
|
||||
type Time int64
|
||||
|
||||
@@ -3,8 +3,8 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -25,6 +25,8 @@ type RecordingRule struct {
|
||||
Labels map[string]string
|
||||
GroupID uint64
|
||||
|
||||
q datasource.Querier
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores last moment of time Exec was called
|
||||
@@ -52,7 +54,7 @@ func (rr *RecordingRule) ID() uint64 {
|
||||
return rr.RuleID
|
||||
}
|
||||
|
||||
func newRecordingRule(group *Group, cfg config.Rule) *RecordingRule {
|
||||
func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
|
||||
rr := &RecordingRule{
|
||||
Type: cfg.Type,
|
||||
RuleID: cfg.ID,
|
||||
@@ -61,6 +63,11 @@ func newRecordingRule(group *Group, cfg config.Rule) *RecordingRule {
|
||||
Labels: cfg.Labels,
|
||||
GroupID: group.ID(),
|
||||
metrics: &recordingRuleMetrics{},
|
||||
q: qb.BuildWithParams(datasource.QuerierParams{
|
||||
DataSourceType: &cfg.Type,
|
||||
EvaluationInterval: group.Interval,
|
||||
ExtraLabels: group.ExtraFilterLabels,
|
||||
}),
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
|
||||
@@ -81,13 +88,31 @@ func (rr *RecordingRule) Close() {
|
||||
metrics.UnregisterMetric(rr.metrics.errors.name)
|
||||
}
|
||||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
|
||||
if !series {
|
||||
return nil, nil
|
||||
// ExecRange executes recording rule on the given time range similarly to Exec.
|
||||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
series, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
duplicates := make(map[string]struct{}, len(series))
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, s := range series {
|
||||
ts := rr.toTimeSeries(s)
|
||||
key := stringifyLabels(ts)
|
||||
if _, ok := duplicates[key]; ok {
|
||||
return nil, fmt.Errorf("original metric %v; resulting labels %q: %w", s.Labels, key, errDuplicate)
|
||||
}
|
||||
duplicates[key] = struct{}{}
|
||||
tss = append(tss, ts)
|
||||
}
|
||||
return tss, nil
|
||||
}
|
||||
|
||||
qMetrics, err := q.Query(ctx, rr.Expr, rr.Type)
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error) {
|
||||
qMetrics, err := rr.q.Query(ctx, rr.Expr)
|
||||
rr.mu.Lock()
|
||||
defer rr.mu.Unlock()
|
||||
|
||||
@@ -97,36 +122,41 @@ func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
}
|
||||
|
||||
duplicates := make(map[uint64]prompbmarshal.TimeSeries, len(qMetrics))
|
||||
duplicates := make(map[string]struct{}, len(qMetrics))
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, r := range qMetrics {
|
||||
ts := rr.toTimeSeries(r, rr.lastExecTime)
|
||||
h := hashTimeSeries(ts)
|
||||
if _, ok := duplicates[h]; ok {
|
||||
ts := rr.toTimeSeries(r)
|
||||
key := stringifyLabels(ts)
|
||||
if _, ok := duplicates[key]; ok {
|
||||
rr.lastExecError = errDuplicate
|
||||
return nil, errDuplicate
|
||||
return nil, fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
}
|
||||
duplicates[h] = ts
|
||||
duplicates[key] = struct{}{}
|
||||
tss = append(tss, ts)
|
||||
}
|
||||
return tss, nil
|
||||
}
|
||||
|
||||
func hashTimeSeries(ts prompbmarshal.TimeSeries) uint64 {
|
||||
hash := fnv.New64a()
|
||||
func stringifyLabels(ts prompbmarshal.TimeSeries) string {
|
||||
labels := ts.Labels
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
for _, l := range labels {
|
||||
hash.Write([]byte(l.Name))
|
||||
hash.Write([]byte(l.Value))
|
||||
hash.Write([]byte("\xff"))
|
||||
if len(labels) > 1 {
|
||||
sort.Slice(labels, func(i, j int) bool {
|
||||
return labels[i].Name < labels[j].Name
|
||||
})
|
||||
}
|
||||
return hash.Sum64()
|
||||
b := strings.Builder{}
|
||||
for i, l := range labels {
|
||||
b.WriteString(l.Name)
|
||||
b.WriteString("=")
|
||||
b.WriteString(l.Value)
|
||||
if i != len(labels)-1 {
|
||||
b.WriteString(",")
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSeries {
|
||||
labels := make(map[string]string)
|
||||
for _, l := range m.Labels {
|
||||
labels[l.Name] = l.Value
|
||||
@@ -136,12 +166,10 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time)
|
||||
for k, v := range rr.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
return newTimeSeries(m.Value, labels, timestamp)
|
||||
return newTimeSeries(m.Values, m.Timestamps, labels)
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*RecordingRule)
|
||||
if !ok {
|
||||
@@ -149,6 +177,7 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
}
|
||||
rr.Expr = nr.Expr
|
||||
rr.Labels = nr.Labels
|
||||
rr.q = nr.q
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
func TestRecoridngRule_Exec(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *RecordingRule
|
||||
@@ -24,9 +24,9 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
"__name__", "bar",
|
||||
)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(10, map[string]string{
|
||||
newTimeSeries([]float64{10}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "foo",
|
||||
}, timestamp),
|
||||
}),
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -37,18 +37,18 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(1, map[string]string{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "foo",
|
||||
}, timestamp),
|
||||
newTimeSeries(2, map[string]string{
|
||||
}),
|
||||
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "bar",
|
||||
}, timestamp),
|
||||
newTimeSeries(3, map[string]string{
|
||||
}),
|
||||
newTimeSeries([]float64{3}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "baz",
|
||||
}, timestamp),
|
||||
}),
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -59,16 +59,16 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries(2, map[string]string{
|
||||
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "foo",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
newTimeSeries(1, map[string]string{
|
||||
}),
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "bar",
|
||||
"source": "test",
|
||||
}, timestamp),
|
||||
}),
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -76,7 +76,8 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
tss, err := tc.rule.Exec(context.TODO(), fq, true)
|
||||
tc.rule.q = fq
|
||||
tss, err := tc.rule.Exec(context.TODO())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
@@ -87,7 +88,88 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
|
||||
func TestRecoridngRule_ExecRange(t *testing.T) {
|
||||
timestamp := time.Now()
|
||||
testCases := []struct {
|
||||
rule *RecordingRule
|
||||
metrics []datasource.Metric
|
||||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
&RecordingRule{Name: "foo"},
|
||||
[]datasource.Metric{metricWithValuesAndLabels(t, []float64{10, 20, 30},
|
||||
"__name__", "bar",
|
||||
)},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{10, 20, 30},
|
||||
[]int64{timestamp.UnixNano(), timestamp.UnixNano(), timestamp.UnixNano()},
|
||||
map[string]string{
|
||||
"__name__": "foo",
|
||||
}),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "foobarbaz"},
|
||||
[]datasource.Metric{
|
||||
metricWithValuesAndLabels(t, []float64{1}, "__name__", "foo", "job", "foo"),
|
||||
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
|
||||
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "foo",
|
||||
}),
|
||||
newTimeSeries([]float64{2, 3}, []int64{timestamp.UnixNano(), timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "bar",
|
||||
}),
|
||||
newTimeSeries([]float64{4, 5, 6},
|
||||
[]int64{timestamp.UnixNano(), timestamp.UnixNano(), timestamp.UnixNano()},
|
||||
map[string]string{
|
||||
"__name__": "foobarbaz",
|
||||
"job": "baz",
|
||||
}),
|
||||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "foo",
|
||||
"source": "test",
|
||||
}),
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "bar",
|
||||
"source": "test",
|
||||
}),
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
tc.rule.q = fq
|
||||
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
|
||||
t.Fatalf("timeseries missmatch: %s", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecoridngRule_ExecNegative(t *testing.T) {
|
||||
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"job": "test",
|
||||
}}
|
||||
@@ -95,8 +177,8 @@ func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
|
||||
_, err := rr.Exec(context.TODO(), fq, true)
|
||||
rr.q = fq
|
||||
_, err := rr.Exec(context.TODO())
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
@@ -111,7 +193,7 @@ func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.Exec(context.TODO(), fq, true)
|
||||
_, err = rr.Exec(context.TODO())
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
|
||||
@@ -10,9 +10,9 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
|
||||
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
|
||||
" E.g. http://127.0.0.1:8428")
|
||||
addr = flag.String("remoteRead.url", "", "Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts "+
|
||||
"state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state. "+
|
||||
"E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
|
||||
basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
|
||||
tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
|
||||
@@ -26,7 +26,7 @@ var (
|
||||
|
||||
// Init creates a Querier from provided flag values.
|
||||
// Returns nil if addr flag wasn't set.
|
||||
func Init() (datasource.Querier, error) {
|
||||
func Init() (datasource.QuerierBuilder, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@@ -10,8 +10,8 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
|
||||
" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
|
||||
addr = flag.String("remoteWrite.url", "", "Optional URL to VictoriaMetrics or vminsert where to persist alerts state "+
|
||||
"and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
|
||||
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
|
||||
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
|
||||
|
||||
|
||||
160
app/vmalert/replay.go
Normal file
160
app/vmalert/replay.go
Normal file
@@ -0,0 +1,160 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// Command-line flags controlling replay mode.
var (
	// replayFrom is the lower bound of the replayed time range (RFC3339).
	replayFrom = flag.String("replay.timeFrom", "",
		"The time filter in RFC3339 format to select time series with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'")
	// replayTo is the upper bound of the replayed time range (RFC3339).
	replayTo = flag.String("replay.timeTo", "",
		"The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'")
	// replayRulesDelay is the pause made after each rule has been replayed.
	// The help text fragments end with a space so the concatenated message
	// reads as separate words/sentences.
	replayRulesDelay = flag.Duration("replay.rulesDelay", time.Second,
		"Delay between rules evaluation within the group. Could be important if there are chained rules inside of the group "+
			"and processing need to wait for previous rule results to be persisted by remote storage before evaluating the next rule. "+
			"Keep it equal or bigger than -remoteWrite.flushInterval.")
	// replayMaxDatapoints multiplied by the group interval gives the time
	// range covered by a single range query.
	replayMaxDatapoints = flag.Int("replay.maxDatapointsPerQuery", 1e3,
		"Max number of data points expected in one request. The higher the value, the less requests will be made during replay.")
	// replayRuleRetryAttempts limits attempts made to execute a rule on a range.
	replayRuleRetryAttempts = flag.Int("replay.ruleRetryAttempts", 5,
		"Defines how many retries to make before giving up on rule if request for it returns an error.")
)
|
||||
|
||||
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error {
|
||||
if *replayMaxDatapoints < 1 {
|
||||
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
|
||||
}
|
||||
tFrom, err := time.Parse(time.RFC3339, *replayFrom)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse %q: %s", *replayFrom, err)
|
||||
}
|
||||
tTo, err := time.Parse(time.RFC3339, *replayTo)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse %q: %s", *replayTo, err)
|
||||
}
|
||||
if !tTo.After(tFrom) {
|
||||
return fmt.Errorf("replay.timeTo must be bigger than replay.timeFrom")
|
||||
}
|
||||
labels := make(map[string]string)
|
||||
for _, s := range *externalLabels {
|
||||
if len(s) == 0 {
|
||||
continue
|
||||
}
|
||||
n := strings.IndexByte(s, '=')
|
||||
if n < 0 {
|
||||
return fmt.Errorf("missing '=' in `-label`. It must contain label in the form `name=value`; got %q", s)
|
||||
}
|
||||
labels[s[:n]] = s[n+1:]
|
||||
}
|
||||
|
||||
fmt.Printf("Replay mode:"+
|
||||
"\nfrom: \t%v "+
|
||||
"\nto: \t%v "+
|
||||
"\nmax data points per request: %d\n",
|
||||
tFrom, tTo, *replayMaxDatapoints)
|
||||
|
||||
var total int
|
||||
for _, cfg := range groupsCfg {
|
||||
ng := newGroup(cfg, qb, *evaluationInterval, labels)
|
||||
total += ng.replay(tFrom, tTo, rw)
|
||||
}
|
||||
logger.Infof("replay finished! Imported %d samples", total)
|
||||
if rw != nil {
|
||||
return rw.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
|
||||
var total int
|
||||
step := g.Interval * time.Duration(*replayMaxDatapoints)
|
||||
ri := rangeIterator{start: start, end: end, step: step}
|
||||
iterations := int(end.Sub(start)/step) + 1
|
||||
fmt.Printf("\nGroup %q"+
|
||||
"\ninterval: \t%v"+
|
||||
"\nrequests to make: \t%d"+
|
||||
"\nmax range per request: \t%v\n",
|
||||
g.Name, g.Interval, iterations, step)
|
||||
for _, rule := range g.Rules {
|
||||
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
|
||||
bar := pb.StartNew(iterations)
|
||||
ri.reset()
|
||||
for ri.next() {
|
||||
n, err := replayRule(rule, ri.s, ri.e, rw)
|
||||
if err != nil {
|
||||
logger.Fatalf("rule %q: %s", rule, err)
|
||||
}
|
||||
total += n
|
||||
bar.Increment()
|
||||
}
|
||||
bar.Finish()
|
||||
// sleep to let remote storage to flush data on-disk
|
||||
// so chained rules could be calculated correctly
|
||||
time.Sleep(*replayRulesDelay)
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
|
||||
var err error
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for i := 0; i < *replayRuleRetryAttempts; i++ {
|
||||
tss, err = rule.ExecRange(context.Background(), start, end)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, rule, err)
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
if err != nil { // means all attempts failed
|
||||
return 0, err
|
||||
}
|
||||
if len(tss) < 1 {
|
||||
return 0, nil
|
||||
}
|
||||
var n int
|
||||
for _, ts := range tss {
|
||||
if err := rw.Push(ts); err != nil {
|
||||
return n, fmt.Errorf("remote write failure: %s", err)
|
||||
}
|
||||
n += len(ts.Samples)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// rangeIterator splits the time range between start and end into consecutive
// sub-ranges of at most step length. After a successful next() call the
// current sub-range is available in the s and e fields.
type rangeIterator struct {
	step       time.Duration
	start, end time.Time

	// iter is the number of sub-ranges produced so far.
	iter int
	// s and e are the bounds of the current sub-range.
	s, e time.Time
}

// reset rewinds the iterator so the next call to next() starts from start again.
func (ri *rangeIterator) reset() {
	ri.iter = 0
	ri.s, ri.e = time.Time{}, time.Time{}
}

// next advances to the following sub-range and reports whether one exists.
// The last sub-range is clamped so that e never exceeds end.
// It returns false right away for a non-positive step, since such a step
// would never move s towards end and the iteration would not terminate.
func (ri *rangeIterator) next() bool {
	if ri.step <= 0 {
		return false
	}
	ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
	if !ri.end.After(ri.s) {
		return false
	}
	ri.e = ri.s.Add(ri.step)
	if ri.e.After(ri.end) {
		ri.e = ri.end
	}
	ri.iter++
	return true
}
|
||||
250
app/vmalert/replay_test.go
Normal file
250
app/vmalert/replay_test.go
Normal file
@@ -0,0 +1,250 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
)
|
||||
|
||||
type fakeReplayQuerier struct {
|
||||
fakeQuerier
|
||||
registry map[string]map[string]struct{}
|
||||
}
|
||||
|
||||
func (fr *fakeReplayQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
|
||||
return fr
|
||||
}
|
||||
|
||||
func (fr *fakeReplayQuerier) QueryRange(_ context.Context, q string, from, to time.Time) ([]datasource.Metric, error) {
|
||||
key := fmt.Sprintf("%s+%s", from.Format("15:04:05"), to.Format("15:04:05"))
|
||||
dps, ok := fr.registry[q]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("unexpected query received: %q", q)
|
||||
}
|
||||
_, ok = dps[key]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("unexpected time range received: %q", key)
|
||||
}
|
||||
delete(dps, key)
|
||||
if len(fr.registry[q]) < 1 {
|
||||
delete(fr.registry, q)
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func TestReplay(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
from, to string
|
||||
maxDP int
|
||||
cfg []config.Group
|
||||
qb *fakeReplayQuerier
|
||||
}{
|
||||
{
|
||||
name: "one rule + one response",
|
||||
from: "2021-01-01T12:00:00.000Z",
|
||||
to: "2021-01-01T12:02:00.000Z",
|
||||
maxDP: 10,
|
||||
cfg: []config.Group{
|
||||
{Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
|
||||
},
|
||||
qb: &fakeReplayQuerier{
|
||||
registry: map[string]map[string]struct{}{
|
||||
"sum(up)": {"12:00:00+12:02:00": {}},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "one rule + multiple responses",
|
||||
from: "2021-01-01T12:00:00.000Z",
|
||||
to: "2021-01-01T12:02:30.000Z",
|
||||
maxDP: 1,
|
||||
cfg: []config.Group{
|
||||
{Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
|
||||
},
|
||||
qb: &fakeReplayQuerier{
|
||||
registry: map[string]map[string]struct{}{
|
||||
"sum(up)": {
|
||||
"12:00:00+12:01:00": {},
|
||||
"12:01:00+12:02:00": {},
|
||||
"12:02:00+12:02:30": {},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "datapoints per step",
|
||||
from: "2021-01-01T12:00:00.000Z",
|
||||
to: "2021-01-01T15:02:30.000Z",
|
||||
maxDP: 60,
|
||||
cfg: []config.Group{
|
||||
{Interval: utils.NewPromDuration(time.Minute), Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
|
||||
},
|
||||
qb: &fakeReplayQuerier{
|
||||
registry: map[string]map[string]struct{}{
|
||||
"sum(up)": {
|
||||
"12:00:00+13:00:00": {},
|
||||
"13:00:00+14:00:00": {},
|
||||
"14:00:00+15:00:00": {},
|
||||
"15:00:00+15:02:30": {},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "multiple recording rules + multiple responses",
|
||||
from: "2021-01-01T12:00:00.000Z",
|
||||
to: "2021-01-01T12:02:30.000Z",
|
||||
maxDP: 1,
|
||||
cfg: []config.Group{
|
||||
{Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
|
||||
{Rules: []config.Rule{{Record: "bar", Expr: "max(up)"}}},
|
||||
},
|
||||
qb: &fakeReplayQuerier{
|
||||
registry: map[string]map[string]struct{}{
|
||||
"sum(up)": {
|
||||
"12:00:00+12:01:00": {},
|
||||
"12:01:00+12:02:00": {},
|
||||
"12:02:00+12:02:30": {},
|
||||
},
|
||||
"max(up)": {
|
||||
"12:00:00+12:01:00": {},
|
||||
"12:01:00+12:02:00": {},
|
||||
"12:02:00+12:02:30": {},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "multiple alerting rules + multiple responses",
|
||||
from: "2021-01-01T12:00:00.000Z",
|
||||
to: "2021-01-01T12:02:30.000Z",
|
||||
maxDP: 1,
|
||||
cfg: []config.Group{
|
||||
{Rules: []config.Rule{{Alert: "foo", Expr: "sum(up) > 1"}}},
|
||||
{Rules: []config.Rule{{Alert: "bar", Expr: "max(up) < 1"}}},
|
||||
},
|
||||
qb: &fakeReplayQuerier{
|
||||
registry: map[string]map[string]struct{}{
|
||||
"sum(up) > 1": {
|
||||
"12:00:00+12:01:00": {},
|
||||
"12:01:00+12:02:00": {},
|
||||
"12:02:00+12:02:30": {},
|
||||
},
|
||||
"max(up) < 1": {
|
||||
"12:00:00+12:01:00": {},
|
||||
"12:01:00+12:02:00": {},
|
||||
"12:02:00+12:02:30": {},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
from, to, maxDP := *replayFrom, *replayTo, *replayMaxDatapoints
|
||||
retries, delay := *replayRuleRetryAttempts, *replayRulesDelay
|
||||
defer func() {
|
||||
*replayFrom, *replayTo = from, to
|
||||
*replayMaxDatapoints, *replayRuleRetryAttempts = maxDP, retries
|
||||
*replayRulesDelay = delay
|
||||
}()
|
||||
|
||||
*replayRuleRetryAttempts = 1
|
||||
*replayRulesDelay = time.Millisecond
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
*replayFrom = tc.from
|
||||
*replayTo = tc.to
|
||||
*replayMaxDatapoints = tc.maxDP
|
||||
if err := replay(tc.cfg, tc.qb, nil); err != nil {
|
||||
t.Fatalf("replay failed: %s", err)
|
||||
}
|
||||
if len(tc.qb.registry) > 0 {
|
||||
t.Fatalf("not all requests were sent: %#v", tc.qb.registry)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRangeIterator(t *testing.T) {
|
||||
testCases := []struct {
|
||||
ri rangeIterator
|
||||
result [][2]time.Time
|
||||
}{
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 5 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 45 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
|
||||
step: time.Second,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range testCases {
|
||||
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
|
||||
var j int
|
||||
for tc.ri.next() {
|
||||
if len(tc.result) < j+1 {
|
||||
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
|
||||
j, tc.ri.s, tc.ri.e)
|
||||
}
|
||||
s, e := tc.ri.s, tc.ri.e
|
||||
expS, expE := tc.result[j][0], tc.result[j][1]
|
||||
if s != expS {
|
||||
t.Fatalf("expected to get start=%v; got %v", expS, s)
|
||||
}
|
||||
if e != expE {
|
||||
t.Fatalf("expected to get end=%v; got %v", expE, e)
|
||||
}
|
||||
j++
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// parseTime converts s in the "2006-01-02T15:04:05.000Z" layout into
// time.Time and fails the test immediately if s cannot be parsed.
func parseTime(t *testing.T, s string) time.Time {
	t.Helper()
	const layout = "2006-01-02T15:04:05.000Z"
	parsed, parseErr := time.Parse(layout, s)
	if parseErr != nil {
		t.Fatal(parseErr)
	}
	return parsed
}
|
||||
@@ -3,22 +3,21 @@ package main
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// Returns unique ID that may be used for
|
||||
// ID returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// Exec executes the rule with given context
|
||||
// and Querier. If returnSeries is true, Exec
|
||||
// may return TimeSeries as result of execution
|
||||
Exec(ctx context.Context, q datasource.Querier, returnSeries bool) ([]prompbmarshal.TimeSeries, error)
|
||||
Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error)
|
||||
// ExecRange executes the rule on the given time range
|
||||
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
|
||||
@@ -7,17 +7,21 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{}
|
||||
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
||||
Value: value,
|
||||
Timestamp: timestamp.UnixNano() / 1e6,
|
||||
})
|
||||
func newTimeSeries(values []float64, timestamps []int64, labels map[string]string) prompbmarshal.TimeSeries {
|
||||
ts := prompbmarshal.TimeSeries{
|
||||
Samples: make([]prompbmarshal.Sample, len(values)),
|
||||
}
|
||||
for i := range values {
|
||||
ts.Samples[i] = prompbmarshal.Sample{
|
||||
Value: values[i],
|
||||
Timestamp: time.Unix(timestamps[i], 0).UnixNano() / 1e6,
|
||||
}
|
||||
}
|
||||
keys := make([]string, 0, len(labels))
|
||||
for k := range labels {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
sort.Strings(keys) // make order deterministic
|
||||
for _, key := range keys {
|
||||
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
||||
Name: key,
|
||||
|
||||
43
app/vmalert/utils/prom_duration.go
Normal file
43
app/vmalert/utils/prom_duration.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// PromDuration is Prometheus duration.
// The value is kept with millisecond precision.
type PromDuration struct {
	milliseconds int64
}

// NewPromDuration returns PromDuration for given d.
// Any sub-millisecond part of d is dropped.
func NewPromDuration(d time.Duration) PromDuration {
	ms := d.Milliseconds()
	return PromDuration{milliseconds: ms}
}
|
||||
|
||||
// MarshalYAML implements yaml.Marshaler interface.
|
||||
func (pd PromDuration) MarshalYAML() (interface{}, error) {
|
||||
return pd.Duration().String(), nil
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements yaml.Unmarshaler interface.
|
||||
func (pd *PromDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||
var s string
|
||||
if err := unmarshal(&s); err != nil {
|
||||
return err
|
||||
}
|
||||
ms, err := metricsql.DurationValue(s, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
pd.milliseconds = ms
|
||||
return nil
|
||||
}
|
||||
|
||||
// Duration returns duration for pd.
|
||||
func (pd *PromDuration) Duration() time.Duration {
|
||||
return time.Duration(pd.milliseconds) * time.Millisecond
|
||||
}
|
||||
@@ -13,6 +13,7 @@ func TestTLSConfig(t *testing.T) {
|
||||
}
|
||||
if tlsCfg == nil {
|
||||
t.Errorf("expected tlsConfig to be set, got nil")
|
||||
return
|
||||
}
|
||||
if tlsCfg.ServerName != serverName {
|
||||
t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)
|
||||
|
||||
@@ -17,27 +17,24 @@ type requestHandler struct {
|
||||
m *manager
|
||||
}
|
||||
|
||||
var pathList = [][]string{
|
||||
{"/api/v1/groups", "list all loaded groups and rules"},
|
||||
{"/api/v1/alerts", "list all active alerts"},
|
||||
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
|
||||
// /metrics is served by httpserver by default
|
||||
{"/metrics", "list of application metrics"},
|
||||
{"/-/reload", "reload configuration"},
|
||||
}
|
||||
|
||||
func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
switch r.URL.Path {
|
||||
case "/":
|
||||
for _, path := range pathList {
|
||||
p, doc := path[0], path[1]
|
||||
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
|
||||
if r.Method != "GET" {
|
||||
return false
|
||||
}
|
||||
httpserver.WriteAPIHelp(w, [][2]string{
|
||||
{"/api/v1/groups", "list all loaded groups and rules"},
|
||||
{"/api/v1/alerts", "list all active alerts"},
|
||||
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
|
||||
{"/metrics", "list of application metrics"},
|
||||
{"/-/reload", "reload configuration"},
|
||||
})
|
||||
return true
|
||||
case "/api/v1/groups":
|
||||
data, err := rh.listGroups()
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
@@ -46,7 +43,7 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
case "/api/v1/alerts":
|
||||
data, err := rh.listAlerts()
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
@@ -64,7 +61,7 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
||||
// /api/v1/<groupName>/<alertID>/status
|
||||
data, err := rh.alert(r.URL.Path)
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
|
||||
@@ -20,14 +20,15 @@ type APIAlert struct {
|
||||
|
||||
// APIGroup represents Group for WEB view
|
||||
type APIGroup struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
ID string `json:"id"`
|
||||
File string `json:"file"`
|
||||
Interval string `json:"interval"`
|
||||
Concurrency int `json:"concurrency"`
|
||||
AlertingRules []APIAlertingRule `json:"alerting_rules"`
|
||||
RecordingRules []APIRecordingRule `json:"recording_rules"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
ID string `json:"id"`
|
||||
File string `json:"file"`
|
||||
Interval string `json:"interval"`
|
||||
Concurrency int `json:"concurrency"`
|
||||
ExtraFilterLabels map[string]string `json:"extra_filter_labels"`
|
||||
AlertingRules []APIAlertingRule `json:"alerting_rules"`
|
||||
RecordingRules []APIRecordingRule `json:"recording_rules"`
|
||||
}
|
||||
|
||||
// APIAlertingRule represents AlertingRule for WEB view
|
||||
|
||||
@@ -77,3 +77,9 @@ vmauth-local-with-goarch:
|
||||
|
||||
vmauth-pure:
|
||||
APP_NAME=vmauth $(MAKE) app-local-pure
|
||||
|
||||
vmauth-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmauth $(MAKE) app-local-windows-with-goarch
|
||||
|
||||
vmauth-windows-amd64-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
## vmauth
|
||||
# vmauth
|
||||
|
||||
`vmauth` is a simple auth proxy and router for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It reads username and password from [Basic Auth headers](https://en.wikipedia.org/wiki/Basic_access_authentication),
|
||||
matches them against configs pointed by `-auth.config` command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
|
||||
`vmauth` is a simple auth proxy, router and [load balancer](#load-balancing) for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It reads auth credentials from `Authorization` http header ([Basic Auth](https://en.wikipedia.org/wiki/Basic_access_authentication) and `Bearer token` are supported),
|
||||
matches them against configs pointed by [-auth.config](#auth-config) command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
|
||||
|
||||
|
||||
## Quick start
|
||||
@@ -17,18 +17,24 @@ and pass the following flag to `vmauth` binary in order to start authorizing and
|
||||
After that `vmauth` starts accepting HTTP requests on port `8427` and routing them according to the provided [-auth.config](#auth-config).
|
||||
The port can be modified via `-httpListenAddr` command-line flag.
|
||||
|
||||
The auth config can be reloaded by passing `SIGHUP` signal to `vmauth`.
|
||||
The auth config can be reloaded either by passing `SIGHUP` signal to `vmauth` or by querying `/-/reload` http endpoint.
|
||||
|
||||
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
|
||||
|
||||
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
|
||||
|
||||
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML, accounting, limits, etc.
|
||||
Feel free to [contact us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML,
|
||||
accounting and rate limiting such as [vmgateway](https://docs.victoriametrics.com/vmgateway.html).
|
||||
|
||||
|
||||
## Load balancing
|
||||
|
||||
Each `url_prefix` in the [-auth.config](#auth-config) may contain either a single url or a list of urls. In the latter case `vmauth` balances load among the configured urls in a round-robin manner. This feature is useful for balancing the load among multiple `vmselect` and/or `vminsert` nodes in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html).
|
||||
|
||||
|
||||
## Auth config
|
||||
|
||||
Auth config is represented in the following simple `yml` format:
|
||||
`-auth.config` is represented in the following simple `yml` format:
|
||||
|
||||
```yml
|
||||
|
||||
@@ -36,43 +42,71 @@ Auth config is represented in the following simple `yml` format:
|
||||
# Usernames must be unique.
|
||||
|
||||
users:
|
||||
# Requests with the 'Authorization: Bearer XXXX' header are proxied to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
|
||||
- bearer_token: "XXXX"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying local single-node VictoriaMetrics.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
|
||||
# will be proxied to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
|
||||
# The user for querying local single-node VictoriaMetrics with extra_label team=dev.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vmselect:8481/select/123/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
|
||||
# will be routed to http://localhost:8428 with extra_label=team=dev query arg.
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query?extra_label=team=dev
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428?extra_label=team=dev"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be load-balanced among http://vmselect1:8481/select/123/prometheus and http://vmselect2:8481/select/123/prometheus
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to the following urls in a round-robin manner:
|
||||
# - http://vmselect1:8481/select/123/prometheus/api/v1/query
|
||||
# - http://vmselect2:8481/select/123/prometheus/api/v1/query
|
||||
- username: "cluster-select-account-123"
|
||||
password: "***"
|
||||
url_prefix: "http://vmselect:8481/select/123/prometheus"
|
||||
url_prefix:
|
||||
- "http://vmselect1:8481/select/123/prometheus"
|
||||
- "http://vmselect2:8481/select/123/prometheus"
|
||||
|
||||
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
|
||||
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
|
||||
# See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vminsert:8480/insert/42/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
# will be load-balanced between http://vminsert1:8480/insert/42/prometheus and http://vminsert2:8480/insert/42/prometheus
|
||||
# For example, http://vmauth:8427/api/v1/write is proxied to the following urls in a round-robin manner:
|
||||
# - http://vminsert1:8480/insert/42/prometheus/api/v1/write
|
||||
# - http://vminsert2:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "cluster-insert-account-42"
|
||||
password: "***"
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
url_prefix:
|
||||
- "http://vminsert1:8480/insert/42/prometheus"
|
||||
- "http://vminsert2:8480/insert/42/prometheus"
|
||||
|
||||
|
||||
# A single user for querying and inserting data:
|
||||
# - Requests to http://vmauth:8427/api/v1/query or http://vmauth:8427/api/v1/query_range
|
||||
# are routed to http://vmselect:8481/select/42/prometheus.
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8480/select/42/prometheus/api/v1/query
|
||||
# - Requests to http://vmauth:8427/api/v1/write are routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
# - Requests to http://vmauth:8427/api/v1/query, http://vmauth:8427/api/v1/query_range
|
||||
# and http://vmauth:8427/api/v1/label/<label_name>/values are proxied to the following urls in a round-robin manner:
|
||||
# - http://vmselect1:8481/select/42/prometheus
|
||||
# - http://vmselect2:8481/select/42/prometheus
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to http://vmselect1:8481/select/42/prometheus/api/v1/query
|
||||
# or to http://vmselect2:8481/select/42/prometheus/api/v1/query .
|
||||
# - Requests to http://vmauth:8427/api/v1/write are proxied to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "foobar"
|
||||
url_map:
|
||||
- src_paths: ["/api/v1/query", "/api/v1/query_range"]
|
||||
url_prefix: "http://vmselect:8481/select/42/prometheus"
|
||||
- src_paths:
|
||||
- "/api/v1/query"
|
||||
- "/api/v1/query_range"
|
||||
- "/api/v1/label/[^/]+/values"
|
||||
url_prefix:
|
||||
- "http://vmselect1:8481/select/42/prometheus"
|
||||
- "http://vmselect2:8481/select/42/prometheus"
|
||||
- src_paths: ["/api/v1/write"]
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
```
|
||||
@@ -96,11 +130,13 @@ Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable
|
||||
|
||||
Alternatively, [https termination proxy](https://en.wikipedia.org/wiki/TLS_termination_proxy) may be put in front of `vmauth`.
|
||||
|
||||
It is recommended to protect the `/-/reload` endpoint with the `-reloadAuthKey` command-line flag, so external users cannot trigger a config reload.
|
||||
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmauth` exports various metrics in Prometheus exposition format at `http://vmauth-host:8427/metrics` page. It is recommended to set up regular scraping of this page
|
||||
either via [vmagent](https://victoriametrics.github.io/vmagent.html) or via Prometheus, so the exported metrics could be analyzed later.
|
||||
either via [vmagent](https://docs.victoriametrics.com/vmagent.html) or via Prometheus, so the exported metrics could be analyzed later.
|
||||
|
||||
|
||||
## How to build from sources
|
||||
@@ -110,14 +146,14 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmauth` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make vmauth` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmauth` binary and puts it into the `bin` folder.
|
||||
|
||||
### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmauth-prod` from the root folder of the repository.
|
||||
2. Run `make vmauth-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmauth-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
### Building docker images
|
||||
@@ -164,12 +200,12 @@ Pass `-help` command-line arg to `vmauth` in order to see all the configuration
|
||||
|
||||
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
|
||||
|
||||
See the docs at https://victoriametrics.github.io/vmauth.html .
|
||||
See the docs at https://docs.victoriametrics.com/vmauth.html .
|
||||
|
||||
-auth.config string
|
||||
Path to auth config. See https://victoriametrics.github.io/vmauth.html for details on the format of this auth config
|
||||
Path to auth config. See https://docs.victoriametrics.com/vmauth.html for details on the format of this auth config
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
|
||||
-envflag.prefix string
|
||||
@@ -177,17 +213,17 @@ See the docs at https://victoriametrics.github.io/vmauth.html .
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this dealy the servier returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
@@ -197,7 +233,7 @@ See the docs at https://victoriametrics.github.io/vmauth.html .
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
@@ -207,20 +243,24 @@ See the docs at https://victoriametrics.github.io/vmauth.html .
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
|
||||
-memory.allowedBytes value
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxIdleConnsPerBackend int
|
||||
The maximum number of idle connections vmauth can open per each backend host (default 100)
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It overrides httpAuth settings
|
||||
-reloadAuthKey string
|
||||
Auth key for /-/reload http endpoint. It must be passed as authKey=...
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
-version
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/url"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
@@ -17,7 +21,7 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://victoriametrics.github.io/vmauth.html "+
|
||||
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://docs.victoriametrics.com/vmauth.html "+
|
||||
"for details on the format of this auth config")
|
||||
)
|
||||
|
||||
@@ -28,24 +32,140 @@ type AuthConfig struct {
|
||||
|
||||
// UserInfo is user information read from authConfigPath
|
||||
type UserInfo struct {
|
||||
Username string `yaml:"username"`
|
||||
Password string `yaml:"password"`
|
||||
URLPrefix string `yaml:"url_prefix"`
|
||||
URLMap []URLMap `yaml:"url_map"`
|
||||
BearerToken string `yaml:"bearer_token"`
|
||||
Username string `yaml:"username"`
|
||||
Password string `yaml:"password"`
|
||||
URLPrefix *URLPrefix `yaml:"url_prefix"`
|
||||
URLMap []URLMap `yaml:"url_map"`
|
||||
|
||||
requests *metrics.Counter
|
||||
}
|
||||
|
||||
// URLMap is a mapping from source paths to target urls.
|
||||
type URLMap struct {
|
||||
SrcPaths []string `yaml:"src_paths"`
|
||||
URLPrefix string `yaml:"url_prefix"`
|
||||
SrcPaths []*SrcPath `yaml:"src_paths"`
|
||||
URLPrefix *URLPrefix `yaml:"url_prefix"`
|
||||
}
|
||||
|
||||
// SrcPath represents a single `src_paths` entry together with its compiled regexp.
|
||||
type SrcPath struct {
|
||||
sOriginal string
|
||||
re *regexp.Regexp
|
||||
}
|
||||
|
||||
// URLPrefix represents parsed `url_prefix`
|
||||
type URLPrefix struct {
|
||||
n uint32
|
||||
urls []*url.URL
|
||||
}
|
||||
|
||||
func (up *URLPrefix) getNextURL() *url.URL {
|
||||
n := atomic.AddUint32(&up.n, 1)
|
||||
idx := n % uint32(len(up.urls))
|
||||
return up.urls[idx]
|
||||
}
|
||||
|
||||
// UnmarshalYAML unmarshals up from yaml.
|
||||
func (up *URLPrefix) UnmarshalYAML(f func(interface{}) error) error {
|
||||
var v interface{}
|
||||
if err := f(&v); err != nil {
|
||||
return err
|
||||
}
|
||||
var urls []string
|
||||
switch x := v.(type) {
|
||||
case string:
|
||||
urls = []string{x}
|
||||
case []interface{}:
|
||||
if len(x) == 0 {
|
||||
return fmt.Errorf("`url_prefix` must contain at least a single url")
|
||||
}
|
||||
us := make([]string, len(x))
|
||||
for i, xx := range x {
|
||||
s, ok := xx.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("`url_prefix` must contain array of strings; got %T", xx)
|
||||
}
|
||||
us[i] = s
|
||||
}
|
||||
urls = us
|
||||
default:
|
||||
return fmt.Errorf("unexpected type for `url_prefix`: %T; want string or []string", v)
|
||||
}
|
||||
pus := make([]*url.URL, len(urls))
|
||||
for i, u := range urls {
|
||||
pu, err := url.Parse(u)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal %q into url: %w", u, err)
|
||||
}
|
||||
pus[i] = pu
|
||||
}
|
||||
up.urls = pus
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarshalYAML marshals up to yaml.
|
||||
func (up *URLPrefix) MarshalYAML() (interface{}, error) {
|
||||
var b []byte
|
||||
if len(up.urls) == 1 {
|
||||
u := up.urls[0].String()
|
||||
b = strconv.AppendQuote(b, u)
|
||||
return string(b), nil
|
||||
}
|
||||
b = append(b, '[')
|
||||
for i, pu := range up.urls {
|
||||
u := pu.String()
|
||||
b = strconv.AppendQuote(b, u)
|
||||
if i+1 < len(up.urls) {
|
||||
b = append(b, ',')
|
||||
}
|
||||
}
|
||||
b = append(b, ']')
|
||||
return string(b), nil
|
||||
}
|
||||
|
||||
func (sp *SrcPath) match(s string) bool {
|
||||
prefix, ok := sp.re.LiteralPrefix()
|
||||
if ok {
|
||||
// Fast path - literal match
|
||||
return s == prefix
|
||||
}
|
||||
if !strings.HasPrefix(s, prefix) {
|
||||
return false
|
||||
}
|
||||
return sp.re.MatchString(s)
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements yaml.Unmarshaler
|
||||
func (sp *SrcPath) UnmarshalYAML(f func(interface{}) error) error {
|
||||
var s string
|
||||
if err := f(&s); err != nil {
|
||||
return err
|
||||
}
|
||||
sAnchored := "^(?:" + s + ")$"
|
||||
re, err := regexp.Compile(sAnchored)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot build regexp from %q: %w", s, err)
|
||||
}
|
||||
sp.sOriginal = s
|
||||
sp.re = re
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarshalYAML implements yaml.Marshaler.
|
||||
func (sp *SrcPath) MarshalYAML() (interface{}, error) {
|
||||
return sp.sOriginal, nil
|
||||
}
|
||||
|
||||
func initAuthConfig() {
|
||||
if len(*authConfigPath) == 0 {
|
||||
logger.Fatalf("missing required `-auth.config` command-line flag")
|
||||
}
|
||||
|
||||
// Register SIGHUP handler for config re-read just before readAuthConfig call.
|
||||
// This guarantees that the config will be re-read if the signal arrives during readAuthConfig call.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
|
||||
m, err := readAuthConfig(*authConfigPath)
|
||||
if err != nil {
|
||||
logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
|
||||
@@ -55,7 +175,7 @@ func initAuthConfig() {
|
||||
authConfigWG.Add(1)
|
||||
go func() {
|
||||
defer authConfigWG.Done()
|
||||
authConfigReloader()
|
||||
authConfigReloader(sighupCh)
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -64,8 +184,7 @@ func stopAuthConfig() {
|
||||
authConfigWG.Wait()
|
||||
}
|
||||
|
||||
func authConfigReloader() {
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
func authConfigReloader(sighupCh <-chan os.Signal) {
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
@@ -110,58 +229,93 @@ func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
|
||||
if len(uis) == 0 {
|
||||
return nil, fmt.Errorf("`users` section cannot be empty in AuthConfig")
|
||||
}
|
||||
m := make(map[string]*UserInfo, len(uis))
|
||||
byAuthToken := make(map[string]*UserInfo, len(uis))
|
||||
byUsername := make(map[string]bool, len(uis))
|
||||
byBearerToken := make(map[string]bool, len(uis))
|
||||
for i := range uis {
|
||||
ui := &uis[i]
|
||||
if m[ui.Username] != nil {
|
||||
if ui.BearerToken == "" && ui.Username == "" {
|
||||
return nil, fmt.Errorf("either bearer_token or username must be set")
|
||||
}
|
||||
if ui.BearerToken != "" && ui.Username != "" {
|
||||
return nil, fmt.Errorf("bearer_token=%q and username=%q cannot be set simultaneously", ui.BearerToken, ui.Username)
|
||||
}
|
||||
if byBearerToken[ui.BearerToken] {
|
||||
return nil, fmt.Errorf("duplicate bearer_token found; bearer_token: %q", ui.BearerToken)
|
||||
}
|
||||
if byUsername[ui.Username] {
|
||||
return nil, fmt.Errorf("duplicate username found; username: %q", ui.Username)
|
||||
}
|
||||
if len(ui.URLPrefix) > 0 {
|
||||
urlPrefix, err := sanitizeURLPrefix(ui.URLPrefix)
|
||||
if err != nil {
|
||||
authToken := getAuthToken(ui.BearerToken, ui.Username, ui.Password)
|
||||
if byAuthToken[authToken] != nil {
|
||||
// The arguments follow the placeholder order: bearer_token, username, then the duplicated auth token.
return nil, fmt.Errorf("duplicate auth token found for bearer_token=%q, username=%q: %q", ui.BearerToken, ui.Username, authToken)
|
||||
}
|
||||
if ui.URLPrefix != nil {
|
||||
if err := ui.URLPrefix.sanitize(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ui.URLPrefix = urlPrefix
|
||||
}
|
||||
for _, e := range ui.URLMap {
|
||||
if len(e.SrcPaths) == 0 {
|
||||
return nil, fmt.Errorf("missing `src_paths`")
|
||||
return nil, fmt.Errorf("missing `src_paths` in `url_map`")
|
||||
}
|
||||
for _, path := range e.SrcPaths {
|
||||
if !strings.HasPrefix(path, "/") {
|
||||
return nil, fmt.Errorf("`src_path`=%q must start with `/`", path)
|
||||
}
|
||||
if e.URLPrefix == nil {
|
||||
return nil, fmt.Errorf("missing `url_prefix` in `url_map`")
|
||||
}
|
||||
urlPrefix, err := sanitizeURLPrefix(e.URLPrefix)
|
||||
if err != nil {
|
||||
if err := e.URLPrefix.sanitize(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
e.URLPrefix = urlPrefix
|
||||
}
|
||||
if len(ui.URLMap) == 0 && len(ui.URLPrefix) == 0 {
|
||||
if len(ui.URLMap) == 0 && ui.URLPrefix == nil {
|
||||
return nil, fmt.Errorf("missing `url_prefix`")
|
||||
}
|
||||
ui.requests = metrics.GetOrCreateCounter(fmt.Sprintf(`vmauth_user_requests_total{username=%q}`, ui.Username))
|
||||
m[ui.Username] = ui
|
||||
if ui.BearerToken != "" {
|
||||
if ui.Password != "" {
|
||||
return nil, fmt.Errorf("password shouldn't be set for bearer_token %q", ui.BearerToken)
|
||||
}
|
||||
ui.requests = metrics.GetOrCreateCounter(`vmauth_user_requests_total{username="bearer_token"}`)
|
||||
byBearerToken[ui.BearerToken] = true
|
||||
}
|
||||
if ui.Username != "" {
|
||||
ui.requests = metrics.GetOrCreateCounter(fmt.Sprintf(`vmauth_user_requests_total{username=%q}`, ui.Username))
|
||||
byUsername[ui.Username] = true
|
||||
}
|
||||
byAuthToken[authToken] = ui
|
||||
}
|
||||
return m, nil
|
||||
return byAuthToken, nil
|
||||
}
|
||||
|
||||
func sanitizeURLPrefix(urlPrefix string) (string, error) {
|
||||
// getAuthToken builds the value used as the lookup key for a user.
//
// A non-empty bearerToken wins and yields "Bearer <token>"; otherwise
// "username:password" is base64-encoded and prefixed with "Basic ".
func getAuthToken(bearerToken, username, password string) string {
	if bearerToken == "" {
		creds := []byte(username + ":" + password)
		return "Basic " + base64.StdEncoding.EncodeToString(creds)
	}
	return "Bearer " + bearerToken
}
|
||||
|
||||
func (up *URLPrefix) sanitize() error {
|
||||
for i, pu := range up.urls {
|
||||
puNew, err := sanitizeURLPrefix(pu)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
up.urls[i] = puNew
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// sanitizeURLPrefix normalizes and validates a parsed `url_prefix` value.
//
// Trailing '/' characters are stripped from urlPrefix.Path in place, so the
// caller's URL is modified. The url must use the `http` or `https` scheme and
// must have a non-empty host; otherwise a nil URL and a descriptive error
// are returned. On success the same (modified) urlPrefix pointer is returned.
func sanitizeURLPrefix(urlPrefix *url.URL) (*url.URL, error) {
	// Remove trailing '/' from urlPrefix
	urlPrefix.Path = strings.TrimRight(urlPrefix.Path, "/")

	// Validate urlPrefix
	if urlPrefix.Scheme != "http" && urlPrefix.Scheme != "https" {
		return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, urlPrefix.Scheme)
	}
	if urlPrefix.Host == "" {
		// Report the whole url: the host is known to be empty in this branch,
		// so printing it alone would never identify the offending entry.
		return nil, fmt.Errorf("missing hostname in `url_prefix %q`", urlPrefix)
	}
	return urlPrefix, nil
}
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"bytes"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
func TestParseAuthConfigFailure(t *testing.T) {
|
||||
@@ -51,6 +56,41 @@ users:
|
||||
- username: foo
|
||||
url_prefix: http:///bar
|
||||
`)
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
url_prefix:
|
||||
bar: baz
|
||||
`)
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
url_prefix:
|
||||
- [foo]
|
||||
`)
|
||||
|
||||
// empty url_prefix
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
url_prefix: []
|
||||
`)
|
||||
|
||||
// Username and bearer_token in a single config
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
bearer_token: bbb
|
||||
url_prefix: http://foo.bar
|
||||
`)
|
||||
|
||||
// Bearer_token and password in a single config
|
||||
f(`
|
||||
users:
|
||||
- password: foo
|
||||
bearer_token: bbb
|
||||
url_prefix: http://foo.bar
|
||||
`)
|
||||
|
||||
// Duplicate users
|
||||
f(`
|
||||
@@ -63,6 +103,17 @@ users:
|
||||
url_prefix: https://sss.sss
|
||||
`)
|
||||
|
||||
// Duplicate bearer_tokens
|
||||
f(`
|
||||
users:
|
||||
- bearer_token: foo
|
||||
url_prefix: http://foo.bar
|
||||
- username: bar
|
||||
url_prefix: http://xxx.yyy
|
||||
- bearer_token: foo
|
||||
url_prefix: https://sss.sss
|
||||
`)
|
||||
|
||||
// Missing url_prefix in url_map
|
||||
f(`
|
||||
users:
|
||||
@@ -71,6 +122,24 @@ users:
|
||||
- src_paths: ["/foo/bar"]
|
||||
`)
|
||||
|
||||
// Invalid url_prefix in url_map
|
||||
f(`
|
||||
users:
|
||||
- username: a
|
||||
url_map:
|
||||
- src_paths: ["/foo/bar"]
|
||||
url_prefix: foo.bar
|
||||
`)
|
||||
|
||||
// empty url_prefix in url_map
|
||||
f(`
|
||||
users:
|
||||
- username: a
|
||||
url_map:
|
||||
- src_paths: ['/foo/bar']
|
||||
url_prefix: []
|
||||
`)
|
||||
|
||||
// Missing src_paths in url_map
|
||||
f(`
|
||||
users:
|
||||
@@ -79,12 +148,12 @@ users:
|
||||
- url_prefix: http://foobar
|
||||
`)
|
||||
|
||||
// src_path not starting with `/`
|
||||
// Invalid regexp in src_path.
|
||||
f(`
|
||||
users:
|
||||
- username: a
|
||||
url_map:
|
||||
- src_paths: [foobar]
|
||||
- src_paths: ['fo[obar']
|
||||
url_prefix: http://foobar
|
||||
`)
|
||||
}
|
||||
@@ -97,8 +166,8 @@ func TestParseAuthConfigSuccess(t *testing.T) {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
removeMetrics(m)
|
||||
if !reflect.DeepEqual(m, expectedAuthConfig) {
|
||||
t.Fatalf("unexpected auth config\ngot\n%v\nwant\n%v", m, expectedAuthConfig)
|
||||
if err := areEqualConfigs(m, expectedAuthConfig); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,10 +178,29 @@ users:
|
||||
password: bar
|
||||
url_prefix: http://aaa:343/bbb
|
||||
`, map[string]*UserInfo{
|
||||
"foo": {
|
||||
getAuthToken("", "foo", "bar"): {
|
||||
Username: "foo",
|
||||
Password: "bar",
|
||||
URLPrefix: "http://aaa:343/bbb",
|
||||
URLPrefix: mustParseURL("http://aaa:343/bbb"),
|
||||
},
|
||||
})
|
||||
|
||||
// Multiple url_prefix entries
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
password: bar
|
||||
url_prefix:
|
||||
- http://node1:343/bbb
|
||||
- http://node2:343/bbb
|
||||
`, map[string]*UserInfo{
|
||||
getAuthToken("", "foo", "bar"): {
|
||||
Username: "foo",
|
||||
Password: "bar",
|
||||
URLPrefix: mustParseURLs([]string{
|
||||
"http://node1:343/bbb",
|
||||
"http://node2:343/bbb",
|
||||
}),
|
||||
},
|
||||
})
|
||||
|
||||
@@ -124,44 +212,91 @@ users:
|
||||
- username: bar
|
||||
url_prefix: https://bar/x///
|
||||
`, map[string]*UserInfo{
|
||||
"foo": {
|
||||
getAuthToken("", "foo", ""): {
|
||||
Username: "foo",
|
||||
URLPrefix: "http://foo",
|
||||
URLPrefix: mustParseURL("http://foo"),
|
||||
},
|
||||
"bar": {
|
||||
getAuthToken("", "bar", ""): {
|
||||
Username: "bar",
|
||||
URLPrefix: "https://bar/x",
|
||||
URLPrefix: mustParseURL("https://bar/x"),
|
||||
},
|
||||
})
|
||||
|
||||
// non-empty URLMap
|
||||
f(`
|
||||
users:
|
||||
- username: foo
|
||||
- bearer_token: foo
|
||||
url_map:
|
||||
- src_paths: ["/api/v1/query","/api/v1/query_range"]
|
||||
- src_paths: ["/api/v1/query","/api/v1/query_range","/api/v1/label/[^./]+/.+"]
|
||||
url_prefix: http://vmselect/select/0/prometheus
|
||||
- src_paths: ["/api/v1/write"]
|
||||
url_prefix: http://vminsert/insert/0/prometheus
|
||||
url_prefix: ["http://vminsert1/insert/0/prometheus","http://vminsert2/insert/0/prometheus"]
|
||||
`, map[string]*UserInfo{
|
||||
"foo": {
|
||||
Username: "foo",
|
||||
getAuthToken("foo", "", ""): {
|
||||
BearerToken: "foo",
|
||||
URLMap: []URLMap{
|
||||
{
|
||||
SrcPaths: []string{"/api/v1/query", "/api/v1/query_range"},
|
||||
URLPrefix: "http://vmselect/select/0/prometheus",
|
||||
SrcPaths: getSrcPaths([]string{"/api/v1/query", "/api/v1/query_range", "/api/v1/label/[^./]+/.+"}),
|
||||
URLPrefix: mustParseURL("http://vmselect/select/0/prometheus"),
|
||||
},
|
||||
{
|
||||
SrcPaths: []string{"/api/v1/write"},
|
||||
URLPrefix: "http://vminsert/insert/0/prometheus",
|
||||
SrcPaths: getSrcPaths([]string{"/api/v1/write"}),
|
||||
URLPrefix: mustParseURLs([]string{
|
||||
"http://vminsert1/insert/0/prometheus",
|
||||
"http://vminsert2/insert/0/prometheus",
|
||||
}),
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func getSrcPaths(paths []string) []*SrcPath {
|
||||
var sps []*SrcPath
|
||||
for _, path := range paths {
|
||||
sps = append(sps, &SrcPath{
|
||||
sOriginal: path,
|
||||
re: regexp.MustCompile("^(?:" + path + ")$"),
|
||||
})
|
||||
}
|
||||
return sps
|
||||
}
|
||||
|
||||
func removeMetrics(m map[string]*UserInfo) {
|
||||
for _, info := range m {
|
||||
info.requests = nil
|
||||
}
|
||||
}
|
||||
|
||||
func areEqualConfigs(a, b map[string]*UserInfo) error {
|
||||
aData, err := yaml.Marshal(a)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot marshal a: %w", err)
|
||||
}
|
||||
bData, err := yaml.Marshal(b)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot marshal b: %w", err)
|
||||
}
|
||||
if !bytes.Equal(aData, bData) {
|
||||
return fmt.Errorf("unexpected configs;\ngot\n%s\nwant\n%s", aData, bData)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func mustParseURL(u string) *URLPrefix {
|
||||
return mustParseURLs([]string{u})
|
||||
}
|
||||
|
||||
func mustParseURLs(us []string) *URLPrefix {
|
||||
pus := make([]*url.URL, len(us))
|
||||
for i, u := range us {
|
||||
pu, err := url.Parse(u)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("BUG: cannot parse %q: %w", u, err))
|
||||
}
|
||||
pus[i] = pu
|
||||
}
|
||||
return &URLPrefix{
|
||||
urls: pus,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,30 +2,70 @@
|
||||
# Usernames must be unique.
|
||||
|
||||
users:
|
||||
# Requests with the 'Authorization: Bearer XXXX' header are proxied to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
|
||||
- bearer_token: "XXXX"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying local single-node VictoriaMetrics.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
|
||||
# will be proxied to http://localhost:8428 .
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
|
||||
# The user for querying local single-node VictoriaMetrics with extra_label team=dev.
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vmselect:8481/select/123/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
|
||||
# will be routed to http://localhost:8428 with extra_label=team=dev query arg.
|
||||
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query?extra_label=team=dev
|
||||
- username: "local-single-node"
|
||||
password: "***"
|
||||
url_prefix: "http://localhost:8428?extra_label=team=dev"
|
||||
|
||||
# The user for querying account 123 in VictoriaMetrics cluster
|
||||
# See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be load-balanced among http://vmselect1:8481/select/123/prometheus and http://vmselect2:8481/select/123/prometheus
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to the following urls in a round-robin manner:
|
||||
# - http://vmselect1:8481/select/123/prometheus/api/v1/query
|
||||
# - http://vmselect2:8481/select/123/prometheus/api/v1/query
|
||||
- username: "cluster-select-account-123"
|
||||
password: "***"
|
||||
url_prefix: "http://vmselect:8481/select/123/prometheus"
|
||||
url_prefix:
|
||||
- "http://vmselect1:8481/select/123/prometheus"
|
||||
- "http://vmselect2:8481/select/123/prometheus"
|
||||
|
||||
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
|
||||
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
|
||||
# All the reuqests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be routed to http://vminsert:8480/insert/42/prometheus .
|
||||
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
# See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
|
||||
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
|
||||
# will be load-balanced between http://vminsert1:8480/insert/42/prometheus and http://vminsert2:8480/insert/42/prometheus
|
||||
# For example, http://vmauth:8427/api/v1/write is proxied to the following urls in a round-robin manner:
|
||||
# - http://vminsert1:8480/insert/42/prometheus/api/v1/write
|
||||
# - http://vminsert2:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "cluster-insert-account-42"
|
||||
password: "***"
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
url_prefix:
|
||||
- "http://vminsert1:8480/insert/42/prometheus"
|
||||
- "http://vminsert2:8480/insert/42/prometheus"
|
||||
|
||||
|
||||
# A single user for querying and inserting data:
|
||||
# - Requests to http://vmauth:8427/api/v1/query, http://vmauth:8427/api/v1/query_range
|
||||
# and http://vmauth:8427/api/v1/label/<label_name>/values are proxied to the following urls in a round-robin manner:
|
||||
# - http://vmselect1:8481/select/42/prometheus
|
||||
# - http://vmselect2:8481/select/42/prometheus
|
||||
# For example, http://vmauth:8427/api/v1/query is proxied to http://vmselect1:8481/select/42/prometheus/api/v1/query
|
||||
# or to http://vmselect2:8481/select/42/prometheus/api/v1/query .
|
||||
# - Requests to http://vmauth:8427/api/v1/write are proxied to http://vminsert:8480/insert/42/prometheus/api/v1/write
|
||||
- username: "foobar"
|
||||
url_map:
|
||||
- src_paths:
|
||||
- "/api/v1/query"
|
||||
- "/api/v1/query_range"
|
||||
- "/api/v1/label/[^/]+/values"
|
||||
url_prefix:
|
||||
- "http://vmselect1:8481/select/42/prometheus"
|
||||
- "http://vmselect2:8481/select/42/prometheus"
|
||||
- src_paths: ["/api/v1/write"]
|
||||
url_prefix: "http://vminsert:8480/insert/42/prometheus"
|
||||
|
||||
@@ -14,10 +14,13 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections")
|
||||
httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections")
|
||||
maxIdleConnsPerBackend = flag.Int("maxIdleConnsPerBackend", 100, "The maximum number of idle connections vmauth can open per each backend host")
|
||||
reloadAuthKey = flag.String("reloadAuthKey", "", "Auth key for /-/reload http endpoint. It must be passed as authKey=...")
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -47,16 +50,28 @@ func main() {
|
||||
}
|
||||
|
||||
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
username, password, ok := r.BasicAuth()
|
||||
if !ok {
|
||||
switch r.URL.Path {
|
||||
case "/-/reload":
|
||||
authKey := r.FormValue("authKey")
|
||||
if authKey != *reloadAuthKey {
|
||||
httpserver.Errorf(w, r, "invalid authKey %q. It must match the value from -reloadAuthKey command line flag", authKey)
|
||||
return true
|
||||
}
|
||||
configReloadRequests.Inc()
|
||||
procutil.SelfSIGHUP()
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return true
|
||||
}
|
||||
authToken := r.Header.Get("Authorization")
|
||||
if authToken == "" {
|
||||
w.Header().Set("WWW-Authenticate", `Basic realm="Restricted"`)
|
||||
http.Error(w, "missing `Authorization: Basic *` header", http.StatusUnauthorized)
|
||||
http.Error(w, "missing `Authorization` request header", http.StatusUnauthorized)
|
||||
return true
|
||||
}
|
||||
ac := authConfig.Load().(map[string]*UserInfo)
|
||||
ui := ac[username]
|
||||
if ui == nil || ui.Password != password {
|
||||
httpserver.Errorf(w, r, "cannot find the provided username %q or password in config", username)
|
||||
ui := ac[authToken]
|
||||
if ui == nil {
|
||||
httpserver.Errorf(w, r, "cannot find the provided auth token %q in config", authToken)
|
||||
return true
|
||||
}
|
||||
ui.requests.Inc()
|
||||
@@ -65,15 +80,27 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
httpserver.Errorf(w, r, "cannot determine targetURL: %s", err)
|
||||
return true
|
||||
}
|
||||
if _, err := url.Parse(targetURL); err != nil {
|
||||
httpserver.Errorf(w, r, "invalid targetURL=%q: %s", targetURL, err)
|
||||
return true
|
||||
}
|
||||
r.Header.Set("vm-target-url", targetURL)
|
||||
reverseProxy.ServeHTTP(w, r)
|
||||
r.Header.Set("vm-target-url", targetURL.String())
|
||||
proxyRequest(w, r)
|
||||
return true
|
||||
}
|
||||
|
||||
func proxyRequest(w http.ResponseWriter, r *http.Request) {
|
||||
defer func() {
|
||||
err := recover()
|
||||
if err == nil || err == http.ErrAbortHandler {
|
||||
// Suppress http.ErrAbortHandler panic.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1353
|
||||
return
|
||||
}
|
||||
// Forward other panics to the caller.
|
||||
panic(err)
|
||||
}()
|
||||
reverseProxy.ServeHTTP(w, r)
|
||||
}
|
||||
|
||||
// configReloadRequests counts requests to the /-/reload http endpoint.
// The metric carries the vmauth prefix, matching the other metrics
// exported by this component (e.g. vmauth_user_requests_total).
var configReloadRequests = metrics.NewCounter(`vmauth_http_requests_total{path="/-/reload"}`)
|
||||
|
||||
var reverseProxy = &httputil.ReverseProxy{
|
||||
Director: func(r *http.Request) {
|
||||
targetURL := r.Header.Get("vm-target-url")
|
||||
@@ -89,6 +116,7 @@ var reverseProxy = &httputil.ReverseProxy{
|
||||
tr.DisableCompression = true
|
||||
// Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this).
|
||||
tr.ForceAttemptHTTP2 = false
|
||||
tr.MaxIdleConnsPerHost = *maxIdleConnsPerBackend
|
||||
return tr
|
||||
}(),
|
||||
FlushInterval: time.Second,
|
||||
@@ -99,7 +127,7 @@ func usage() {
|
||||
const s = `
|
||||
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
|
||||
|
||||
See the docs at https://victoriametrics.github.io/vmauth.html .
|
||||
See the docs at https://docs.victoriametrics.com/vmauth.html .
|
||||
`
|
||||
flagutil.Usage(s)
|
||||
}
|
||||
|
||||
@@ -7,25 +7,50 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func createTargetURL(ui *UserInfo, uOrig *url.URL) (string, error) {
|
||||
u, err := url.Parse(uOrig.String())
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("cannot make a copy of %q: %w", u, err)
|
||||
func (up *URLPrefix) mergeURLs(requestURI *url.URL) *url.URL {
|
||||
pu := up.getNextURL()
|
||||
return mergeURLs(pu, requestURI)
|
||||
}
|
||||
|
||||
// mergeURLs builds the target url for a proxied request.
//
// The result is a copy of uiURL with requestURI.Path appended to its path;
// uiURL itself is not modified. Query args from requestURI are added to the
// ones configured in uiURL, except for keys that are already present in
// uiURL: the configured value always wins, so a client cannot override
// query args pinned in the config (even when the pinned value is empty).
//
// When requestURI has no query args, the query string of uiURL is kept
// verbatim (it is not re-encoded).
func mergeURLs(uiURL, requestURI *url.URL) *url.URL {
	targetURL := *uiURL
	targetURL.Path += requestURI.Path
	requestParams := requestURI.Query()
	// fast path
	if len(requestParams) == 0 {
		return &targetURL
	}
	// merge query parameters from requests.
	uiParams := targetURL.Query()
	for k, vs := range requestParams {
		// Skip clashed query params from the original request.
		// Check for key presence rather than for a non-empty value,
		// so args configured with an empty value are protected too.
		if _, ok := uiParams[k]; ok {
			continue
		}
		for _, v := range vs {
			uiParams.Add(k, v)
		}
	}
	targetURL.RawQuery = uiParams.Encode()
	return &targetURL
}
|
||||
|
||||
func createTargetURL(ui *UserInfo, uOrig *url.URL) (*url.URL, error) {
|
||||
u := *uOrig
|
||||
// Prevent from attacks with using `..` in r.URL.Path
|
||||
u.Path = path.Clean(u.Path)
|
||||
if !strings.HasPrefix(u.Path, "/") {
|
||||
u.Path = "/" + u.Path
|
||||
}
|
||||
for _, e := range ui.URLMap {
|
||||
for _, path := range e.SrcPaths {
|
||||
if u.Path == path {
|
||||
return e.URLPrefix + u.RequestURI(), nil
|
||||
for _, sp := range e.SrcPaths {
|
||||
if sp.match(u.Path) {
|
||||
return e.URLPrefix.mergeURLs(&u), nil
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(ui.URLPrefix) > 0 {
|
||||
return ui.URLPrefix + u.RequestURI(), nil
|
||||
if ui.URLPrefix != nil {
|
||||
return ui.URLPrefix.mergeURLs(&u), nil
|
||||
}
|
||||
return "", fmt.Errorf("missing route for %q", u)
|
||||
return nil, fmt.Errorf("missing route for %q", u.String())
|
||||
}
|
||||
|
||||
@@ -16,47 +16,74 @@ func TestCreateTargetURLSuccess(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
if target != expectedTarget {
|
||||
if target.String() != expectedTarget {
|
||||
t.Fatalf("unexpected target; got %q; want %q", target, expectedTarget)
|
||||
}
|
||||
}
|
||||
// Simple routing with `url_prefix`
|
||||
f(&UserInfo{
|
||||
URLPrefix: "http://foo.bar",
|
||||
URLPrefix: mustParseURL("http://foo.bar"),
|
||||
}, "", "http://foo.bar/.")
|
||||
f(&UserInfo{
|
||||
URLPrefix: "http://foo.bar",
|
||||
URLPrefix: mustParseURL("http://foo.bar"),
|
||||
}, "/", "http://foo.bar/")
|
||||
f(&UserInfo{
|
||||
URLPrefix: "http://foo.bar",
|
||||
URLPrefix: mustParseURL("http://foo.bar"),
|
||||
}, "a/b?c=d", "http://foo.bar/a/b?c=d")
|
||||
f(&UserInfo{
|
||||
URLPrefix: "https://sss:3894/x/y",
|
||||
URLPrefix: mustParseURL("https://sss:3894/x/y"),
|
||||
}, "/z", "https://sss:3894/x/y/z")
|
||||
f(&UserInfo{
|
||||
URLPrefix: "https://sss:3894/x/y",
|
||||
URLPrefix: mustParseURL("https://sss:3894/x/y"),
|
||||
}, "/../../aaa", "https://sss:3894/x/y/aaa")
|
||||
f(&UserInfo{
|
||||
URLPrefix: "https://sss:3894/x/y",
|
||||
}, "/./asd/../../aaa?a=d&s=s/../d", "https://sss:3894/x/y/aaa?a=d&s=s/../d")
|
||||
URLPrefix: mustParseURL("https://sss:3894/x/y"),
|
||||
}, "/./asd/../../aaa?a=d&s=s/../d", "https://sss:3894/x/y/aaa?a=d&s=s%2F..%2Fd")
|
||||
|
||||
// Complex routing with `url_map`
|
||||
ui := &UserInfo{
|
||||
URLMap: []URLMap{
|
||||
{
|
||||
SrcPaths: []string{"/api/v1/query"},
|
||||
URLPrefix: "http://vmselect/0/prometheus",
|
||||
SrcPaths: getSrcPaths([]string{"/api/v1/query"}),
|
||||
URLPrefix: mustParseURL("http://vmselect/0/prometheus"),
|
||||
},
|
||||
{
|
||||
SrcPaths: []string{"/api/v1/write"},
|
||||
URLPrefix: "http://vminsert/0/prometheus",
|
||||
SrcPaths: getSrcPaths([]string{"/api/v1/write"}),
|
||||
URLPrefix: mustParseURL("http://vminsert/0/prometheus"),
|
||||
},
|
||||
},
|
||||
URLPrefix: "http://default-server",
|
||||
URLPrefix: mustParseURL("http://default-server"),
|
||||
}
|
||||
f(ui, "/api/v1/query?query=up", "http://vmselect/0/prometheus/api/v1/query?query=up")
|
||||
f(ui, "/api/v1/write", "http://vminsert/0/prometheus/api/v1/write")
|
||||
f(ui, "/api/v1/query_range", "http://default-server/api/v1/query_range")
|
||||
|
||||
// Complex routing regexp paths in `url_map`
|
||||
ui = &UserInfo{
|
||||
URLMap: []URLMap{
|
||||
{
|
||||
SrcPaths: getSrcPaths([]string{"/api/v1/query(_range)?", "/api/v1/label/[^/]+/values"}),
|
||||
URLPrefix: mustParseURL("http://vmselect/0/prometheus"),
|
||||
},
|
||||
{
|
||||
SrcPaths: getSrcPaths([]string{"/api/v1/write"}),
|
||||
URLPrefix: mustParseURL("http://vminsert/0/prometheus"),
|
||||
},
|
||||
},
|
||||
URLPrefix: mustParseURL("http://default-server"),
|
||||
}
|
||||
f(ui, "/api/v1/query?query=up", "http://vmselect/0/prometheus/api/v1/query?query=up")
|
||||
f(ui, "/api/v1/query_range?query=up", "http://vmselect/0/prometheus/api/v1/query_range?query=up")
|
||||
f(ui, "/api/v1/label/foo/values", "http://vmselect/0/prometheus/api/v1/label/foo/values")
|
||||
f(ui, "/api/v1/write", "http://vminsert/0/prometheus/api/v1/write")
|
||||
f(ui, "/api/v1/foo/bar", "http://default-server/api/v1/foo/bar")
|
||||
f(&UserInfo{
|
||||
URLPrefix: mustParseURL("http://foo.bar?extra_label=team=dev"),
|
||||
}, "/api/v1/query", "http://foo.bar/api/v1/query?extra_label=team=dev")
|
||||
f(&UserInfo{
|
||||
URLPrefix: mustParseURL("http://foo.bar?extra_label=team=mobile"),
|
||||
}, "/api/v1/query?extra_label=team=dev", "http://foo.bar/api/v1/query?extra_label=team%3Dmobile")
|
||||
|
||||
}
|
||||
|
||||
func TestCreateTargetURLFailure(t *testing.T) {
|
||||
@@ -70,7 +97,7 @@ func TestCreateTargetURLFailure(t *testing.T) {
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if target != "" {
|
||||
if target != nil {
|
||||
t.Fatalf("unexpected target=%q; want empty string", target)
|
||||
}
|
||||
}
|
||||
@@ -78,8 +105,8 @@ func TestCreateTargetURLFailure(t *testing.T) {
|
||||
f(&UserInfo{
|
||||
URLMap: []URLMap{
|
||||
{
|
||||
SrcPaths: []string{"/api/v1/query"},
|
||||
URLPrefix: "http://foobar/baz",
|
||||
SrcPaths: getSrcPaths([]string{"/api/v1/query"}),
|
||||
URLPrefix: mustParseURL("http://foobar/baz"),
|
||||
},
|
||||
},
|
||||
}, "/api/v1/write")
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
## vmbackup
|
||||
# vmbackup
|
||||
|
||||
`vmbackup` creates VictoriaMetrics data backups from [instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
|
||||
`vmbackup` creates VictoriaMetrics data backups from [instant snapshots](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
|
||||
|
||||
Supported storage systems for backups:
|
||||
|
||||
@@ -15,7 +15,7 @@ data between the existing backup and new backup. It saves time and costs on data
|
||||
|
||||
Backup process can be interrupted at any time. It is automatically resumed from the interruption point when restarting `vmbackup` with the same args.
|
||||
|
||||
Backed up data can be restored with [vmrestore](https://victoriametrics.github.io/vmrestore.html).
|
||||
Backed up data can be restored with [vmrestore](https://docs.victoriametrics.com/vmrestore.html).
|
||||
|
||||
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
|
||||
|
||||
@@ -34,8 +34,8 @@ vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-
|
||||
```
|
||||
|
||||
* `</path/to/victoria-metrics-data>` - path to VictoriaMetrics data pointed by `-storageDataPath` command-line flag in single-node VictoriaMetrics or in cluster `vmstorage`.
|
||||
There is no need to stop VictoriaMetrics for creating backups, since they are performed from immutable [instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
|
||||
* `<local-snapshot>` is the snapshot to back up. See [how to create instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
|
||||
There is no need to stop VictoriaMetrics for creating backups, since they are performed from immutable [instant snapshots](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
|
||||
* `<local-snapshot>` is the snapshot to back up. See [how to create instant snapshots](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots). `vmbackup` can create the snapshot on itself if `-snapshot.createURL` command-line flag is set to an url for creating snapshots. In this case `-snapshotName` flag isn't needed.
|
||||
* `<bucket>` is an already existing name for [GCS bucket](https://cloud.google.com/storage/docs/creating-buckets).
|
||||
* `<path/to/new/backup>` is the destination path where new backup will be placed.
|
||||
|
||||
@@ -72,7 +72,7 @@ Smart backups mean storing full daily backups into `YYYYMMDD` folders and creati
|
||||
vmbackup -snapshotName=<latest-snapshot> -dst=gcs://<bucket>/latest
|
||||
```
|
||||
|
||||
Where `<latest-snapshot>` is the latest [snapshot](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
|
||||
Where `<latest-snapshot>` is the latest [snapshot](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
|
||||
The command will upload only changed data to `gcs://<bucket>/latest`.
|
||||
|
||||
* Run the following command once a day:
|
||||
@@ -123,8 +123,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
|
||||
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
|
||||
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
|
||||
* Backups created from [single-node VictoriaMetrics](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html) cannot be restored
|
||||
at [cluster VictoriaMetrics](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html) and vice versa.
|
||||
* Backups created from [single-node VictoriaMetrics](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html) cannot be restored
|
||||
at [cluster VictoriaMetrics](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) and vice versa.
|
||||
|
||||
|
||||
## Advanced usage
|
||||
@@ -194,7 +194,7 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
@@ -204,23 +204,23 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
|
||||
-maxBytesPerSecond value
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-maxBytesPerSecond size
|
||||
The maximum upload speed. There is no limit if it is set to 0
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedBytes value
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
|
||||
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-origin string
|
||||
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
|
||||
-snapshot.createURL string
|
||||
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snaphsot/create
|
||||
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create . There is no need in setting -snapshotName if -snapshot.createURL is set
|
||||
-snapshot.deleteURL string
|
||||
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snaphosts will be automatically deleted. Example: http://victoriametrics:8428/snaphsot/delete
|
||||
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete
|
||||
-snapshotName string
|
||||
Name for the snapshot to backup. See https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots
|
||||
Name for the snapshot to backup. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots. There is no need in setting -snapshotName if -snapshot.createURL is set
|
||||
-storageDataPath string
|
||||
Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage (default "victoria-metrics-data")
|
||||
-version
|
||||
@@ -235,14 +235,14 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmbackup` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make vmbackup` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmbackup` binary and puts it into the `bin` folder.
|
||||
|
||||
### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmbackup-prod` from the root folder of the repository.
|
||||
2. Run `make vmbackup-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmbackup-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
### Building docker images
|
||||
|
||||
@@ -19,11 +19,11 @@ import (
|
||||
|
||||
var (
|
||||
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
|
||||
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots")
|
||||
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots. There is no need in setting -snapshotName if -snapshot.createURL is set")
|
||||
snapshotCreateURL = flag.String("snapshot.createURL", "", "VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. "+
|
||||
"Example: http://victoriametrics:8428/snaphsot/create")
|
||||
"Example: http://victoriametrics:8428/snapshot/create . There is no need in setting -snapshotName if -snapshot.createURL is set")
|
||||
snapshotDeleteURL = flag.String("snapshot.deleteURL", "", "VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. "+
|
||||
"All created snaphosts will be automatically deleted. Example: http://victoriametrics:8428/snaphsot/delete")
|
||||
"All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete")
|
||||
dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
|
||||
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir\n"+
|
||||
"-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded")
|
||||
@@ -41,7 +41,9 @@ func main() {
|
||||
logger.Init()
|
||||
|
||||
if len(*snapshotCreateURL) > 0 {
|
||||
logger.Infof("Snapshots enabled")
|
||||
if len(*snapshotName) > 0 {
|
||||
logger.Fatalf("-snapshotName shouldn't be set if -snapshot.createURL is set, since snapshots are created automatically in this case")
|
||||
}
|
||||
logger.Infof("Snapshot create url %s", *snapshotCreateURL)
|
||||
if len(*snapshotDeleteURL) <= 0 {
|
||||
err := flag.Set("snapshot.deleteURL", strings.Replace(*snapshotCreateURL, "/create", "/delete", 1))
|
||||
@@ -99,7 +101,7 @@ func usage() {
|
||||
vmbackup performs backups for VictoriaMetrics data from instant snapshots to gcs, s3
|
||||
or local filesystem. Backed up data can be restored with vmrestore.
|
||||
|
||||
See the docs at https://victoriametrics.github.io/vbackup.html .
|
||||
See the docs at https://docs.victoriametrics.com/vmbackup.html .
|
||||
`
|
||||
flagutil.Usage(s)
|
||||
}
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
## Victoria Metrics Backup Manager
|
||||
## vmbackupmanager
|
||||
|
||||
This service automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed.
|
||||
***vmbackupmanager is a part of [enterprise package](https://victoriametrics.com/enterprise.html)***
|
||||
|
||||
The VictoriaMetrics backup manager automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed.
|
||||
The backup service makes a backup every hour and puts it to the latest folder and then copies data to the folders which represent the backup intervals (hourly, daily, weekly and monthly)
|
||||
|
||||
The required flags for running the service are as follows:
|
||||
|
||||
* -eula - should be true and means that you have the legal right to run a backup manager. That can either be a signed contract or an email with confirmation to run the service in a trial period
|
||||
* -storageDataPath - path to VictoriaMetrics or vmstorage data path to make backup from
|
||||
* -snapshot.createURL - VictoriaMetrics creates snapshot URL which will automatically be created during backup. Example: http://victoriametrics:8428/snaphsot/create
|
||||
* -snapshot.createURL - VictoriaMetrics create snapshot URL. When it is given, a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create
|
||||
* -dst - backup destination at s3, gcs or local filesystem
|
||||
* -credsFilePath - path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set. See [https://cloud.google.com/iam/docs/creating-managing-service-account-keys](https://cloud.google.com/iam/docs/creating-managing-service-account-keys) and [https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html](https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html)
|
||||
|
||||
@@ -47,7 +49,7 @@ There are two flags which could help with performance tuning:
|
||||
* -concurrency - The number of concurrent workers. Higher concurrency may improve upload speed (default 10)
|
||||
|
||||
|
||||
### Example of Usage
|
||||
## Example of Usage
|
||||
|
||||
GCS and cluster version. You need to have a credentials file in json format with following structure
|
||||
|
||||
|
||||
@@ -71,3 +71,9 @@ vmctl-local-with-goarch:
|
||||
|
||||
vmctl-pure:
|
||||
APP_NAME=vmctl $(MAKE) app-local-pure
|
||||
|
||||
vmctl-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmctl $(MAKE) app-local-windows-with-goarch
|
||||
|
||||
vmctl-windows-amd64-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
@@ -1,41 +1,15 @@
|
||||
# vmctl
|
||||
|
||||
Victoria metrics command-line tool
|
||||
VictoriaMetrics command-line tool
|
||||
|
||||
Features:
|
||||
- [x] Prometheus: migrate data from Prometheus to VictoriaMetrics using snapshot API
|
||||
- [x] Thanos: migrate data from Thanos to VictoriaMetrics
|
||||
- [ ] ~~Prometheus: migrate data from Prometheus to VictoriaMetrics by query~~(discarded)
|
||||
- [x] InfluxDB: migrate data from InfluxDB to VictoriaMetrics
|
||||
- [x] OpenTSDB: migrate data from OpenTSDB to VictoriaMetrics
|
||||
- [ ] Storage Management: data re-balancing between nodes
|
||||
|
||||
# Table of contents
|
||||
|
||||
* [Articles](#articles)
|
||||
* [How to build](#how-to-build)
|
||||
* [Migrating data from InfluxDB 1.x](#migrating-data-from-influxdb-1x)
|
||||
* [Data mapping](#data-mapping)
|
||||
* [Configuration](#configuration)
|
||||
* [Filtering](#filtering)
|
||||
* [Migrating data from InfluxDB 2.x](#migrating-data-from-influxdb-2x)
|
||||
* [Migrating data from Prometheus](#migrating-data-from-prometheus)
|
||||
* [Data mapping](#data-mapping-1)
|
||||
* [Configuration](#configuration-1)
|
||||
* [Filtering](#filtering-1)
|
||||
* [Migrating data from Thanos](#migrating-data-from-thanos)
|
||||
* [Current data](#current-data)
|
||||
* [Historical data](#historical-data)
|
||||
* [Migrating data from VictoriaMetrics](#migrating-data-from-victoriametrics)
|
||||
* [Native protocol](#native-protocol)
|
||||
* [Tuning](#tuning)
|
||||
* [Influx mode](#influx-mode)
|
||||
* [Prometheus mode](#prometheus-mode)
|
||||
* [VictoriaMetrics importer](#victoriametrics-importer)
|
||||
* [Importer stats](#importer-stats)
|
||||
* [Significant figures](#significant-figures)
|
||||
* [Adding extra labels](#adding-extra-labels)
|
||||
|
||||
|
||||
## Articles
|
||||
|
||||
* [How to migrate data from Prometheus](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043)
|
||||
@@ -48,14 +22,14 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
|
||||
|
||||
### Development build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmctl` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make vmctl` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmctl` binary and puts it into the `bin` folder.
|
||||
|
||||
### Production build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmctl-prod` from the root folder of the repository.
|
||||
2. Run `make vmctl-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmctl-prod` binary and puts it into the `bin` folder.
|
||||
|
||||
### Building docker images
|
||||
@@ -77,16 +51,121 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
|
||||
|
||||
#### Development ARM build
|
||||
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
|
||||
2. Run `make vmctl-arm` or `make vmctl-arm64` from the root folder of the repository.
|
||||
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
|
||||
2. Run `make vmctl-arm` or `make vmctl-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmctl-arm` or `vmctl-arm64` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
#### Production ARM build
|
||||
|
||||
1. [Install docker](https://docs.docker.com/install/).
|
||||
2. Run `make vmctl-arm-prod` or `make vmctl-arm64-prod` from the root folder of the repository.
|
||||
2. Run `make vmctl-arm-prod` or `make vmctl-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
|
||||
It builds `vmctl-arm-prod` or `vmctl-arm64-prod` binary respectively and puts it into the `bin` folder.
|
||||
|
||||
## Migrating data from OpenTSDB
|
||||
|
||||
`vmctl` supports the `opentsdb` mode to migrate data from OpenTSDB to VictoriaMetrics time-series database.
|
||||
|
||||
See `./vmctl opentsdb --help` for details and full list of flags.
|
||||
|
||||
*OpenTSDB migration is not possible without a functioning [meta](http://opentsdb.net/docs/build/html/user_guide/metadata.html) table to search for metrics/series.*
|
||||
|
||||
OpenTSDB migration works like so:
|
||||
|
||||
1. Find metrics based on selected filters (or the default filter set ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'])
|
||||
* e.g. `curl -Ss "http://opentsdb:4242/api/suggest?type=metrics&q=sys"`
|
||||
2. Find series associated with each returned metric
|
||||
* e.g. `curl -Ss "http://opentsdb:4242/api/search/lookup?m=system.load5&limit=1000000"`
|
||||
3. Download data for each series in chunks defined in the CLI switches
|
||||
* e.g. `-retention=sum-1m-avg:1h:90d` ==
|
||||
* `curl -Ss "http://opentsdb:4242/api/query?start=1h-ago&end=now&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
* `curl -Ss "http://opentsdb:4242/api/query?start=2h-ago&end=1h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
* `curl -Ss "http://opentsdb:4242/api/query?start=3h-ago&end=2h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
* ...
|
||||
* `curl -Ss "http://opentsdb:4242/api/query?start=2160h-ago&end=2159h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
|
||||
|
||||
This means that we must stream data from OpenTSDB to VictoriaMetrics in chunks. This is where concurrency for OpenTSDB comes in. We can query multiple chunks at once, but we shouldn't perform too many chunks at a time to avoid overloading the OpenTSDB cluster.
|
||||
|
||||
```
|
||||
$ bin/vmctl opentsdb --otsdb-addr http://opentsdb:4242/ --otsdb-retentions sum-1m-avg:1h:1d --otsdb-filters system --otsdb-normalize --vm-addr http://victoria/
|
||||
OpenTSDB import mode
|
||||
2021/04/09 11:52:50 Will collect data starting at TS 1617990770
|
||||
2021/04/09 11:52:50 Loading all metrics from OpenTSDB for filters: [system]
|
||||
Found 9 metrics to import. Continue? [Y/n]
|
||||
2021/04/09 11:52:51 Starting work on system.load1
|
||||
23 / 402200 [>____________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________] 0.01% 2 p/s
|
||||
```
|
||||
|
||||
### Retention strings
|
||||
|
||||
Starting with a relatively simple retention string (`sum-1m-avg:1h:30d`), let's describe how this is converted into actual queries.
|
||||
|
||||
There are two essential parts of a retention string:
|
||||
1. [aggregation](#aggregation)
|
||||
2. [windows/time ranges](#windows)
|
||||
|
||||
#### Aggregation
|
||||
|
||||
Retention strings essentially define the two levels of aggregation for our collected series.
|
||||
|
||||
`sum-1m-avg` would become:
|
||||
* First order: `sum`
|
||||
* Second order: `1m-avg-none`
|
||||
|
||||
##### First Order Aggregations
|
||||
|
||||
First-order aggregation addresses how to aggregate any un-mentioned tags.
|
||||
|
||||
This is, conceptually, directly opposite to how PromQL deals with tags. In OpenTSDB, if a tag isn't explicitly mentioned, all values associated with that tag will be aggregated.
|
||||
|
||||
It is recommended to use `sum` for the first aggregation because it is relatively quick and should not cause any changes to the incoming data (because we collect each individual series).
|
||||
|
||||
##### Second Order Aggregations
|
||||
|
||||
Second-order aggregation (`1m-avg` in our example) defines any windowing that should occur before returning the data.
|
||||
|
||||
It is recommended to match the stat collection interval so we again avoid transforming incoming data.
|
||||
|
||||
We do not allow for defining the "null value" portion of the rollup window (e.g. in the aggregation, `1m-avg-none`, the user cannot change `none`), as the goal of this tool is to avoid modifying incoming data.
|
||||
|
||||
#### Windows
|
||||
|
||||
There are two important windows we define in a retention string:
|
||||
1. The "chunk" range of each query
|
||||
2. The time range we will be querying on with that "chunk"
|
||||
|
||||
From our example, our windows are `1h:30d`.
|
||||
|
||||
##### Window "chunks"
|
||||
|
||||
The window `1h` means that each individual query to OpenTSDB should only span 1 hour of time (e.g. `start=2h-ago&end=1h-ago`).
|
||||
|
||||
It is important to ensure this window somewhat matches the row size in HBase to help improve query times.
|
||||
|
||||
For example, if the query is hitting a rollup table with a 4 hour row size, we should set a chunk size of a multiple of 4 hours (e.g. `4h`, `8h`, etc.) to avoid requesting data across row boundaries. Landing on row boundaries allows for more consistent request times to HBase.
|
||||
|
||||
The default table created in HBase for OpenTSDB has a 1 hour row size, so if you aren't sure on a correct row size to use, `1h` is a reasonable choice.
|
||||
|
||||
##### Time range
|
||||
|
||||
The time range `30d` simply means we are asking for the last 30 days of data. This time range can be written using `h`, `d`, `w`, or `y`. (We can't use `m` for month because it already means `minute` in time parsing).
|
||||
|
||||
#### Results of retention string
|
||||
|
||||
The resultant queries that will be created, based on our example retention string of `sum-1m-avg:1h:30d` look like this:
|
||||
|
||||
```
|
||||
http://opentsdb:4242/api/query?start=1h-ago&end=now&m=sum:1m-avg-none:<series>
|
||||
http://opentsdb:4242/api/query?start=2h-ago&end=1h-ago&m=sum:1m-avg-none:<series>
|
||||
http://opentsdb:4242/api/query?start=3h-ago&end=2h-ago&m=sum:1m-avg-none:<series>
|
||||
...
|
||||
http://opentsdb:4242/api/query?start=721h-ago&end=720h-ago&m=sum:1m-avg-none:<series>
|
||||
```
|
||||
|
||||
Chunking the data like this means each individual query returns faster, so we can start populating data into VictoriaMetrics quicker.
|
||||
|
||||
### Restarting OpenTSDB migrations
|
||||
|
||||
One important note for OpenTSDB migration: Queries/HBase scans can "get stuck" within OpenTSDB itself. This can cause instability and performance issues within an OpenTSDB cluster, so stopping the migrator to deal with it may be necessary. Because of this, we provide the timestamp we started collecting data from at the beginning of the run. You can stop and restart the importer using this "hard timestamp" to ensure you collect data from the same time range over multiple runs.
|
||||
|
||||
## Migrating data from InfluxDB (1.x)
|
||||
|
||||
@@ -96,7 +175,7 @@ See `./vmctl influx --help` for details and full list of flags.
|
||||
|
||||
To use migration tool please specify the InfluxDB address `--influx-addr`, the database `--influx-database` and VictoriaMetrics address `--vm-addr`.
|
||||
Flag `--vm-addr` for single-node VM is usually equal to `--httpListenAddr`, and for cluster version
|
||||
is equal to `--httpListenAddr` flag of VMInsert component. Please note, that vmctl performs initial readiness check for the given address
|
||||
is equal to `--httpListenAddr` flag of vminsert component. Please note, that vmctl performs initial readiness check for the given address
|
||||
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
|
||||
See more details for cluster version [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
@@ -191,14 +270,14 @@ You may find useful a 3rd party solution for this - https://github.com/jonppe/in
|
||||
## Migrating data from Prometheus
|
||||
|
||||
`vmctl` supports the `prometheus` mode for migrating data from Prometheus to VictoriaMetrics time-series database.
|
||||
Migration is based on reading Prometheus snapshot, which is basically a hard-link to Prometheus data files.
|
||||
Migration is based on reading Prometheus snapshot, which is basically a hard-link to Prometheus data files.
|
||||
|
||||
See `./vmctl prometheus --help` for details and full list of flags.
|
||||
See `./vmctl prometheus --help` for details and full list of flags. Also see Prometheus related articles [here](#articles).
|
||||
|
||||
To use migration tool please specify the path to Prometheus snapshot `--prom-snapshot` and VictoriaMetrics address `--vm-addr`.
|
||||
More about Prometheus snapshots may be found [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data).
|
||||
To use migration tool please specify the file path to Prometheus snapshot `--prom-snapshot` (see how to make a snapshot [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data)) and VictoriaMetrics address `--vm-addr`.
|
||||
Please note, that `vmctl` *does not make a snapshot from Prometheus*, it uses an already prepared snapshot. More about Prometheus snapshots may be found [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data) and [here](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043).
|
||||
Flag `--vm-addr` for single-node VM is usually equal to `--httpListenAddr`, and for cluster version
|
||||
is equal to `--httpListenAddr` flag of VMInsert component. Please note, that vmctl performs initial readiness check for the given address
|
||||
is equal to `--httpListenAddr` flag of vminsert component. Please note, that vmctl performs initial readiness check for the given address
|
||||
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
|
||||
See more details for cluster version [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
|
||||
|
||||
@@ -359,7 +438,7 @@ then import it into VM using `vmctl` in `prometheus` mode.
|
||||
|
||||
### Native protocol
|
||||
|
||||
The [native binary protocol](https://victoriametrics.github.io/#how-to-export-data-in-native-format)
|
||||
The [native binary protocol](https://docs.victoriametrics.com/#how-to-export-data-in-native-format)
|
||||
was introduced in [1.42.0 release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.42.0)
|
||||
and provides the most efficient way to migrate data between VM instances: single to single, cluster to cluster,
|
||||
single to cluster and vice versa. Please note that both instances (source and destination) should be of v1.42.0
|
||||
|
||||
@@ -39,7 +39,7 @@ var (
|
||||
Name: vmAddr,
|
||||
Value: "http://localhost:8428",
|
||||
Usage: "VictoriaMetrics address to perform import requests. \n" +
|
||||
"Should be the same as --httpListenAddr value for single-node version or VMInsert component. \n" +
|
||||
"Should be the same as --httpListenAddr value for single-node version or vminsert component. \n" +
|
||||
"Please note, that `vmctl` performs initial readiness check for the given address by checking `/health` endpoint.",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
@@ -96,6 +96,78 @@ var (
|
||||
}
|
||||
)
|
||||
|
||||
const (
|
||||
otsdbAddr = "otsdb-addr"
|
||||
otsdbConcurrency = "otsdb-concurrency"
|
||||
otsdbQueryLimit = "otsdb-query-limit"
|
||||
otsdbOffsetDays = "otsdb-offset-days"
|
||||
otsdbHardTSStart = "otsdb-hard-ts-start"
|
||||
otsdbRetentions = "otsdb-retentions"
|
||||
otsdbFilters = "otsdb-filters"
|
||||
otsdbNormalize = "otsdb-normalize"
|
||||
otsdbMsecsTime = "otsdb-msecstime"
|
||||
)
|
||||
|
||||
var (
|
||||
otsdbFlags = []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: otsdbAddr,
|
||||
Value: "http://localhost:4242",
|
||||
Required: true,
|
||||
Usage: "OpenTSDB server addr",
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: otsdbConcurrency,
|
||||
Usage: "Number of concurrently running fetch queries to OpenTSDB per metric",
|
||||
Value: 1,
|
||||
},
|
||||
&cli.StringSliceFlag{
|
||||
Name: otsdbRetentions,
|
||||
Value: nil,
|
||||
Required: true,
|
||||
Usage: "Retentions patterns to collect on. Each pattern should describe the aggregation performed " +
|
||||
"for the query, the row size (in HBase) that will define how long each individual query is, " +
|
||||
"and the time range to query for. e.g. sum-1m-avg:1h:3d. " +
|
||||
"The first time range defined should be a multiple of the row size in HBase. " +
|
||||
"e.g. if the row size is 2 hours, 4h is good, 5h less so. We want each query to land on unique rows.",
|
||||
},
|
||||
&cli.StringSliceFlag{
|
||||
Name: otsdbFilters,
|
||||
Value: cli.NewStringSlice("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"),
|
||||
Usage: "Filters to process for discovering metrics in OpenTSDB",
|
||||
},
|
||||
&cli.Int64Flag{
|
||||
Name: otsdbOffsetDays,
|
||||
Usage: "Days to offset our 'starting' point for collecting data from OpenTSDB",
|
||||
Value: 0,
|
||||
},
|
||||
&cli.Int64Flag{
|
||||
Name: otsdbHardTSStart,
|
||||
Usage: "A specific timestamp to start from, will override using an offset",
|
||||
Value: 0,
|
||||
},
|
||||
/*
|
||||
because the defaults are set *extremely* low in OpenTSDB (10-25 results), we will
|
||||
set a larger default limit, but still allow a user to increase/decrease it
|
||||
*/
|
||||
&cli.IntFlag{
|
||||
Name: otsdbQueryLimit,
|
||||
Usage: "Result limit on meta queries to OpenTSDB (affects both metric name and tag value queries, recommended to use a value exceeding your largest series)",
|
||||
Value: 100e3,
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: otsdbMsecsTime,
|
||||
Value: false,
|
||||
Usage: "Whether OpenTSDB is writing values in milliseconds or seconds",
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: otsdbNormalize,
|
||||
Value: false,
|
||||
Usage: "Whether to normalize all data received to lower case before forwarding to VictoriaMetrics",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
const (
|
||||
influxAddr = "influx-addr"
|
||||
influxUser = "influx-user"
|
||||
@@ -243,8 +315,8 @@ var (
|
||||
&cli.StringFlag{
|
||||
Name: vmNativeSrcAddr,
|
||||
Usage: "VictoriaMetrics address to perform export from. \n" +
|
||||
" Should be the same as --httpListenAddr value for single-node version or VMSelect component." +
|
||||
" If exporting from cluster version - include the tenet token in address.",
|
||||
" Should be the same as --httpListenAddr value for single-node version or vmselect component." +
|
||||
" If exporting from cluster version see https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format",
|
||||
Required: true,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
@@ -260,8 +332,8 @@ var (
|
||||
&cli.StringFlag{
|
||||
Name: vmNativeDstAddr,
|
||||
Usage: "VictoriaMetrics address to perform import to. \n" +
|
||||
" Should be the same as --httpListenAddr value for single-node version or VMInsert component." +
|
||||
" If importing into cluster version - include the tenet token in address.",
|
||||
" Should be the same as --httpListenAddr value for single-node version or vminsert component." +
|
||||
" If importing into cluster version see https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format",
|
||||
Required: true,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
|
||||
@@ -61,7 +61,7 @@ func (s Series) fetchQuery(timeFilter string) string {
|
||||
}
|
||||
for i, pair := range s.LabelPairs {
|
||||
pairV := valueEscaper.Replace(pair.Value)
|
||||
fmt.Fprintf(f, " %q='%s'", pair.Name, pairV)
|
||||
fmt.Fprintf(f, " %q::tag='%s'", pair.Name, pairV)
|
||||
if i != len(s.LabelPairs)-1 {
|
||||
f.WriteString(" and")
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ func TestFetchQuery(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: `select "value" from "cpu" where "foo"='bar'`,
|
||||
expected: `select "value" from "cpu" where "foo"::tag='bar'`,
|
||||
},
|
||||
{
|
||||
s: Series{
|
||||
@@ -36,7 +36,7 @@ func TestFetchQuery(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: `select "value" from "cpu" where "foo"='bar' and "baz"='qux'`,
|
||||
expected: `select "value" from "cpu" where "foo"::tag='bar' and "baz"::tag='qux'`,
|
||||
},
|
||||
{
|
||||
s: Series{
|
||||
@@ -50,7 +50,7 @@ func TestFetchQuery(t *testing.T) {
|
||||
},
|
||||
},
|
||||
timeFilter: "time >= now()",
|
||||
expected: `select "value" from "cpu" where "foo"='b\'ar' and time >= now()`,
|
||||
expected: `select "value" from "cpu" where "foo"::tag='b\'ar' and time >= now()`,
|
||||
},
|
||||
{
|
||||
s: Series{
|
||||
@@ -68,7 +68,7 @@ func TestFetchQuery(t *testing.T) {
|
||||
},
|
||||
},
|
||||
timeFilter: "time >= now()",
|
||||
expected: `select "value" from "cpu" where "name"='dev-mapper-centos\\x2dswap.swap' and "state"='dev-mapp\'er-c\'en\'tos' and time >= now()`,
|
||||
expected: `select "value" from "cpu" where "name"::tag='dev-mapper-centos\\x2dswap.swap' and "state"::tag='dev-mapp\'er-c\'en\'tos' and time >= now()`,
|
||||
},
|
||||
{
|
||||
s: Series{
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/influx"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/opentsdb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/prometheus"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
@@ -20,9 +21,41 @@ func main() {
|
||||
start := time.Now()
|
||||
app := &cli.App{
|
||||
Name: "vmctl",
|
||||
Usage: "Victoria metrics command-line tool",
|
||||
Usage: "VictoriaMetrics command-line tool",
|
||||
Version: buildinfo.Version,
|
||||
Commands: []*cli.Command{
|
||||
{
|
||||
Name: "opentsdb",
|
||||
Usage: "Migrate timeseries from OpenTSDB",
|
||||
Flags: mergeFlags(globalFlags, otsdbFlags, vmFlags),
|
||||
Action: func(c *cli.Context) error {
|
||||
fmt.Println("OpenTSDB import mode")
|
||||
|
||||
oCfg := opentsdb.Config{
|
||||
Addr: c.String(otsdbAddr),
|
||||
Limit: c.Int(otsdbQueryLimit),
|
||||
Offset: c.Int64(otsdbOffsetDays),
|
||||
HardTS: c.Int64(otsdbHardTSStart),
|
||||
Retentions: c.StringSlice(otsdbRetentions),
|
||||
Filters: c.StringSlice(otsdbFilters),
|
||||
Normalize: c.Bool(otsdbNormalize),
|
||||
MsecsTime: c.Bool(otsdbMsecsTime),
|
||||
}
|
||||
otsdbClient, err := opentsdb.NewClient(oCfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create opentsdb client: %s", err)
|
||||
}
|
||||
|
||||
vmCfg := initConfigVM(c)
|
||||
importer, err := vm.NewImporter(vmCfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create VM importer: %s", err)
|
||||
}
|
||||
|
||||
otsdbProcessor := newOtsdbProcessor(otsdbClient, importer, c.Int(otsdbConcurrency))
|
||||
return otsdbProcessor.run(c.Bool(globalSilent))
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "influx",
|
||||
Usage: "Migrate timeseries from InfluxDB",
|
||||
|
||||
164
app/vmctl/opentsdb.go
Normal file
164
app/vmctl/opentsdb.go
Normal file
@@ -0,0 +1,164 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/opentsdb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
)
|
||||
|
||||
type otsdbProcessor struct {
|
||||
oc *opentsdb.Client
|
||||
im *vm.Importer
|
||||
otsdbcc int
|
||||
}
|
||||
|
||||
type queryObj struct {
|
||||
Series opentsdb.Meta
|
||||
Rt opentsdb.RetentionMeta
|
||||
Tr opentsdb.TimeRange
|
||||
StartTime int64
|
||||
}
|
||||
|
||||
func newOtsdbProcessor(oc *opentsdb.Client, im *vm.Importer, otsdbcc int) *otsdbProcessor {
|
||||
if otsdbcc < 1 {
|
||||
otsdbcc = 1
|
||||
}
|
||||
return &otsdbProcessor{
|
||||
oc: oc,
|
||||
im: im,
|
||||
otsdbcc: otsdbcc,
|
||||
}
|
||||
}
|
||||
|
||||
func (op *otsdbProcessor) run(silent bool) error {
|
||||
log.Println("Loading all metrics from OpenTSDB for filters: ", op.oc.Filters)
|
||||
var metrics []string
|
||||
for _, filter := range op.oc.Filters {
|
||||
q := fmt.Sprintf("%s/api/suggest?type=metrics&q=%s&max=%d", op.oc.Addr, filter, op.oc.Limit)
|
||||
m, err := op.oc.FindMetrics(q)
|
||||
if err != nil {
|
||||
return fmt.Errorf("metric discovery failed for %q: %s", q, err)
|
||||
}
|
||||
metrics = append(metrics, m...)
|
||||
}
|
||||
if len(metrics) < 1 {
|
||||
return fmt.Errorf("found no timeseries to import with filters %q", op.oc.Filters)
|
||||
}
|
||||
|
||||
question := fmt.Sprintf("Found %d metrics to import. Continue?", len(metrics))
|
||||
if !silent && !prompt(question) {
|
||||
return nil
|
||||
}
|
||||
op.im.ResetStats()
|
||||
var startTime int64
|
||||
if op.oc.HardTS != 0 {
|
||||
startTime = op.oc.HardTS
|
||||
} else {
|
||||
startTime = time.Now().Unix()
|
||||
}
|
||||
queryRanges := 0
|
||||
// pre-calculate the number of query ranges we'll be processing
|
||||
for _, rt := range op.oc.Retentions {
|
||||
queryRanges += len(rt.QueryRanges)
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
log.Println(fmt.Sprintf("Starting work on %s", metric))
|
||||
serieslist, err := op.oc.FindSeries(metric)
|
||||
if err != nil {
|
||||
return fmt.Errorf("couldn't retrieve series list for %s : %s", metric, err)
|
||||
}
|
||||
/*
|
||||
Create channels for collecting/processing series and errors
|
||||
We'll create them per metric to reduce pressure against OpenTSDB
|
||||
|
||||
Limit the size of seriesCh so we can't get too far ahead of actual processing
|
||||
*/
|
||||
seriesCh := make(chan queryObj, op.otsdbcc)
|
||||
errCh := make(chan error)
|
||||
// we're going to make serieslist * queryRanges queries, so we should represent that in the progress bar
|
||||
bar := pb.StartNew(len(serieslist) * queryRanges)
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(op.otsdbcc)
|
||||
for i := 0; i < op.otsdbcc; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for s := range seriesCh {
|
||||
if err := op.do(s); err != nil {
|
||||
errCh <- fmt.Errorf("couldn't retrieve series for %s : %s", metric, err)
|
||||
return
|
||||
}
|
||||
bar.Increment()
|
||||
}
|
||||
}()
|
||||
}
|
||||
/*
|
||||
Loop through all series for this metric, processing all retentions and time ranges
|
||||
requested. This loop is our primary "collect data from OpenTSDB loop" and should
|
||||
be async, sending data to VictoriaMetrics over time.
|
||||
|
||||
The idea with having the select at the inner-most loop is to ensure quick
|
||||
short-circuiting on error.
|
||||
*/
|
||||
for _, series := range serieslist {
|
||||
for _, rt := range op.oc.Retentions {
|
||||
for _, tr := range rt.QueryRanges {
|
||||
select {
|
||||
case otsdbErr := <-errCh:
|
||||
return fmt.Errorf("opentsdb error: %s", otsdbErr)
|
||||
case vmErr := <-op.im.Errors():
|
||||
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
|
||||
case seriesCh <- queryObj{
|
||||
Tr: tr, StartTime: startTime,
|
||||
Series: series, Rt: opentsdb.RetentionMeta{
|
||||
FirstOrder: rt.FirstOrder, SecondOrder: rt.SecondOrder, AggTime: rt.AggTime}}:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Drain channels per metric
|
||||
close(seriesCh)
|
||||
wg.Wait()
|
||||
close(errCh)
|
||||
// check for any lingering errors on the query side
|
||||
for otsdbErr := range errCh {
|
||||
return fmt.Errorf("Import process failed: \n%s", otsdbErr)
|
||||
}
|
||||
bar.Finish()
|
||||
log.Print(op.im.Stats())
|
||||
}
|
||||
op.im.Close()
|
||||
for vmErr := range op.im.Errors() {
|
||||
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
|
||||
}
|
||||
log.Println("Import finished!")
|
||||
log.Print(op.im.Stats())
|
||||
return nil
|
||||
}
|
||||
|
||||
func (op *otsdbProcessor) do(s queryObj) error {
|
||||
start := s.StartTime - s.Tr.Start
|
||||
end := s.StartTime - s.Tr.End
|
||||
data, err := op.oc.GetData(s.Series, s.Rt, start, end)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to collect data for %v in %v:%v :: %v", s.Series, s.Rt, s.Tr, err)
|
||||
}
|
||||
if len(data.Timestamps) < 1 || len(data.Values) < 1 {
|
||||
return nil
|
||||
}
|
||||
labels := make([]vm.LabelPair, len(data.Tags))
|
||||
for k, v := range data.Tags {
|
||||
labels = append(labels, vm.LabelPair{Name: k, Value: v})
|
||||
}
|
||||
op.im.Input() <- &vm.TimeSeries{
|
||||
Name: data.Metric,
|
||||
LabelPairs: labels,
|
||||
Timestamps: data.Timestamps,
|
||||
Values: data.Values,
|
||||
}
|
||||
return nil
|
||||
}
|
||||
349
app/vmctl/opentsdb/opentsdb.go
Normal file
349
app/vmctl/opentsdb/opentsdb.go
Normal file
@@ -0,0 +1,349 @@
|
||||
package opentsdb
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Retention objects contain meta data about what to query for our run
|
||||
type Retention struct {
|
||||
/*
|
||||
OpenTSDB has two levels of aggregation,
|
||||
First, we aggregate any un-mentioned tags into the last result
|
||||
Second, we aggregate into buckets over time
|
||||
To simulate this with config, we have
|
||||
FirstOrder (e.g. sum/avg/max/etc.)
|
||||
SecondOrder (e.g. sum/avg/max/etc.)
|
||||
AggTime (e.g. 1m/10m/1d/etc.)
|
||||
This will build into m=<FirstOrder>:<AggTime>-<SecondOrder>-none:
|
||||
Or an example: m=sum:1m-avg-none
|
||||
*/
|
||||
FirstOrder string
|
||||
SecondOrder string
|
||||
AggTime string
|
||||
// The actual ranges will will attempt to query (as offsets from now)
|
||||
QueryRanges []TimeRange
|
||||
}
|
||||
|
||||
// RetentionMeta is the subset of a Retention that is passed around with a
// single query: the aggregation settings without the list of time ranges.
type RetentionMeta struct {
	FirstOrder, SecondOrder, AggTime string
}
|
||||
|
||||
// Client object holds general config about how queries should be performed
|
||||
type Client struct {
|
||||
Addr string
|
||||
// The meta query limit for series returned
|
||||
Limit int
|
||||
Retentions []Retention
|
||||
Filters []string
|
||||
Normalize bool
|
||||
HardTS int64
|
||||
}
|
||||
|
||||
// Config contains the fields required to configure a Client.
type Config struct {
	Addr  string
	Limit int
	// Offset is the number of days to step back before starting to scan for data.
	Offset int64
	// HardTS is a specific timestamp to start at; it is used when greater than zero.
	HardTS int64
	// Retentions are retention strings such as "sum-1m-avg:1h:30d".
	Retentions []string
	Filters    []string
	Normalize  bool
	// MsecsTime selects millisecond instead of second resolution for time values.
	MsecsTime bool
}
|
||||
|
||||
// TimeRange describes one time range to query through its two boundaries.
type TimeRange struct {
	Start, End int64
}
|
||||
|
||||
// MetaResults contains return data from search series lookup queries
|
||||
type MetaResults struct {
|
||||
Type string `json:"type"`
|
||||
Results []Meta `json:"results"`
|
||||
//metric string
|
||||
//tags interface{}
|
||||
//limit int
|
||||
//time int
|
||||
//startIndex int
|
||||
//totalResults int
|
||||
}
|
||||
|
||||
// Meta is the metadata of one series: its metric name and tags, without any
// datapoints. The tsuid field of the response is not decoded.
type Meta struct {
	Metric string            `json:"metric"`
	Tags   map[string]string `json:"tags"`
}
|
||||
|
||||
// Metric holds the data of one time series: name, tags, and the timestamps
// and values stored as two separate slices.
type Metric struct {
	Metric     string
	Tags       map[string]string
	Timestamps []int64
	Values     []float64
}
|
||||
|
||||
// ExpressionOutput contains results from actual data queries
|
||||
type ExpressionOutput struct {
|
||||
Outputs []qoObj `json:"outputs"`
|
||||
Query interface{} `json:"query"`
|
||||
}
|
||||
|
||||
// qoObj contains the actual timeseries data from the returned data query.
// The dpsMeta and meta members of the response are not decoded.
type qoObj struct {
	ID    string `json:"id"`
	Alias string `json:"alias"`
	// Dps holds the datapoints as arrays of numbers.
	Dps [][]float64 `json:"dps"`
}
|
||||
|
||||
// Expression objects format our data queries
|
||||
/*
|
||||
All of the following structs are to build a OpenTSDB expression object
|
||||
*/
|
||||
type Expression struct {
|
||||
Time timeObj `json:"time"`
|
||||
Filters []filterObj `json:"filters"`
|
||||
Metrics []metricObj `json:"metrics"`
|
||||
// this just needs to be an empty object, so the value doesn't matter
|
||||
Expressions []int `json:"expressions"`
|
||||
Outputs []outputObj `json:"outputs"`
|
||||
}
|
||||
|
||||
type timeObj struct {
|
||||
Start int64 `json:"start"`
|
||||
End int64 `json:"end"`
|
||||
Aggregator string `json:"aggregator"`
|
||||
Downsampler dSObj `json:"downsampler"`
|
||||
}
|
||||
|
||||
type dSObj struct {
|
||||
Interval string `json:"interval"`
|
||||
Aggregator string `json:"aggregator"`
|
||||
FillPolicy fillObj `json:"fillPolicy"`
|
||||
}
|
||||
|
||||
// fillObj is a "fillPolicy" member. Only the policy name is modelled: it is
// always hard-coded to NaN, so no value field is needed.
type fillObj struct {
	Policy string `json:"policy"`
}
|
||||
|
||||
type filterObj struct {
|
||||
Tags []tagObj `json:"tags"`
|
||||
ID string `json:"id"`
|
||||
}
|
||||
|
||||
// tagObj is a single tag filter inside a filterObj.
type tagObj struct {
	Type    string `json:"type"`
	Tagk    string `json:"tagk"`
	Filter  string `json:"filter"`
	GroupBy bool   `json:"groupBy"`
}
|
||||
|
||||
type metricObj struct {
|
||||
ID string `json:"id"`
|
||||
Metric string `json:"metric"`
|
||||
Filter string `json:"filter"`
|
||||
FillPolicy fillObj `json:"fillPolicy"`
|
||||
}
|
||||
|
||||
// outputObj is one entry of the "outputs" list of an Expression.
type outputObj struct {
	ID    string `json:"id"`
	Alias string `json:"alias"`
}
|
||||
|
||||
/* End expression object structs */
|
||||
|
||||
var (
|
||||
exprOutput = outputObj{ID: "a", Alias: "query"}
|
||||
exprFillPolicy = fillObj{Policy: "nan"}
|
||||
)
|
||||
|
||||
// FindMetrics discovers all metrics that OpenTSDB knows about (given a filter)
|
||||
// e.g. /api/suggest?type=metrics&q=system&max=100000
|
||||
func (c Client) FindMetrics(q string) ([]string, error) {
|
||||
resp, err := http.Get(q)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to send GET request to %q: %s", q, err)
|
||||
}
|
||||
if resp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("Bad return from OpenTSDB: %q: %v", resp.StatusCode, resp)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not retrieve metric data from %q: %s", q, err)
|
||||
}
|
||||
var metriclist []string
|
||||
err = json.Unmarshal(body, &metriclist)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response from %q: %s", q, err)
|
||||
}
|
||||
return metriclist, nil
|
||||
}
|
||||
|
||||
// FindSeries discovers all series associated with a metric
|
||||
// e.g. /api/search/lookup?m=system.load5&limit=1000000
|
||||
func (c Client) FindSeries(metric string) ([]Meta, error) {
|
||||
q := fmt.Sprintf("%s/api/search/lookup?m=%s&limit=%d", c.Addr, metric, c.Limit)
|
||||
resp, err := http.Get(q)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to set GET request to %q: %s", q, err)
|
||||
}
|
||||
if resp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("Bad return from OpenTSDB: %q: %v", resp.StatusCode, resp)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not retrieve series data from %q: %s", q, err)
|
||||
}
|
||||
var results MetaResults
|
||||
err = json.Unmarshal(body, &results)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response from %q: %s", q, err)
|
||||
}
|
||||
return results.Results, nil
|
||||
}
|
||||
|
||||
// GetData actually retrieves data for a series at a specified time range
|
||||
func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64) (Metric, error) {
|
||||
/*
|
||||
Here we build the actual exp query we'll send to OpenTSDB
|
||||
|
||||
This is comprised of a number of different settings. We hard-code
|
||||
a few to simplify the JSON object creation.
|
||||
There are examples queries available, so not too much detail here...
|
||||
*/
|
||||
expr := Expression{}
|
||||
expr.Outputs = []outputObj{exprOutput}
|
||||
expr.Metrics = append(expr.Metrics, metricObj{ID: "a", Metric: series.Metric,
|
||||
Filter: "f1", FillPolicy: exprFillPolicy})
|
||||
expr.Time = timeObj{Start: start, End: end, Aggregator: rt.FirstOrder,
|
||||
Downsampler: dSObj{Interval: rt.AggTime,
|
||||
Aggregator: rt.SecondOrder,
|
||||
FillPolicy: exprFillPolicy}}
|
||||
var TagList []tagObj
|
||||
for k, v := range series.Tags {
|
||||
/*
|
||||
every tag should be a literal_or because that's the closest to a full "==" that
|
||||
this endpoint allows for
|
||||
*/
|
||||
TagList = append(TagList, tagObj{Type: "literal_or", Tagk: k,
|
||||
Filter: v, GroupBy: true})
|
||||
}
|
||||
expr.Filters = append(expr.Filters, filterObj{ID: "f1", Tags: TagList})
|
||||
// "expressions" is required in the query object or we get a 5xx, so force it to exist
|
||||
expr.Expressions = make([]int, 0)
|
||||
inputData, err := json.Marshal(expr)
|
||||
if err != nil {
|
||||
return Metric{}, fmt.Errorf("failed to marshal query JSON %s", err)
|
||||
}
|
||||
|
||||
q := fmt.Sprintf("%s/api/query/exp", c.Addr)
|
||||
resp, err := http.Post(q, "application/json", bytes.NewBuffer(inputData))
|
||||
if err != nil {
|
||||
return Metric{}, fmt.Errorf("failed to send GET request to %q: %s", q, err)
|
||||
}
|
||||
if resp.StatusCode != 200 {
|
||||
return Metric{}, fmt.Errorf("Bad return from OpenTSDB: %q: %v", resp.StatusCode, resp)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return Metric{}, fmt.Errorf("could not retrieve series data from %q: %s", q, err)
|
||||
}
|
||||
var output ExpressionOutput
|
||||
err = json.Unmarshal(body, &output)
|
||||
if err != nil {
|
||||
return Metric{}, fmt.Errorf("failed to unmarshal response from %q: %s", q, err)
|
||||
}
|
||||
if len(output.Outputs) < 1 {
|
||||
// no results returned...return an empty object without error
|
||||
return Metric{}, nil
|
||||
}
|
||||
data := Metric{}
|
||||
data.Metric = series.Metric
|
||||
data.Tags = series.Tags
|
||||
/*
|
||||
We evaluate data for correctness before formatting the actual values
|
||||
to skip a little bit of time if the series has invalid formatting
|
||||
|
||||
First step is to enforce Prometheus' data model
|
||||
*/
|
||||
data, err = modifyData(data, c.Normalize)
|
||||
if err != nil {
|
||||
return Metric{}, fmt.Errorf("invalid series data from %q: %s", q, err)
|
||||
}
|
||||
/*
|
||||
Convert data from OpenTSDB's output format ([[ts,val],[ts,val]...])
|
||||
to VictoriaMetrics format: {"timestamps": [ts,ts,ts...], "values": [val,val,val...]}
|
||||
The nasty part here is that because an object in each array
|
||||
can be a float64, we have to initially cast _all_ objects that way
|
||||
then convert the timestamp back to something reasonable.
|
||||
*/
|
||||
for _, tsobj := range output.Outputs[0].Dps {
|
||||
data.Timestamps = append(data.Timestamps, int64(tsobj[0]))
|
||||
data.Values = append(data.Values, tsobj[1])
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// NewClient creates and returns OpenTSDB client
|
||||
// configured with passed Config
|
||||
func NewClient(cfg Config) (*Client, error) {
|
||||
var retentions []Retention
|
||||
offsetPrint := int64(time.Now().Unix())
|
||||
if cfg.MsecsTime {
|
||||
// 1000000 == Nanoseconds -> Milliseconds difference
|
||||
offsetPrint = int64(time.Now().UnixNano() / 1000000)
|
||||
}
|
||||
if cfg.HardTS > 0 {
|
||||
/*
|
||||
HardTS is a specific timestamp we'll be starting at.
|
||||
Just present that if it is defined
|
||||
*/
|
||||
offsetPrint = cfg.HardTS
|
||||
} else if cfg.Offset > 0 {
|
||||
/*
|
||||
Our "offset" is the number of days we should step
|
||||
back before starting to scan for data
|
||||
*/
|
||||
if cfg.MsecsTime {
|
||||
offsetPrint = offsetPrint - (cfg.Offset * 24 * 60 * 60 * 1000)
|
||||
} else {
|
||||
offsetPrint = offsetPrint - (cfg.Offset * 24 * 60 * 60)
|
||||
}
|
||||
}
|
||||
log.Println(fmt.Sprintf("Will collect data starting at TS %v", offsetPrint))
|
||||
for _, r := range cfg.Retentions {
|
||||
ret, err := convertRetention(r, cfg.Offset, cfg.MsecsTime)
|
||||
if err != nil {
|
||||
return &Client{}, fmt.Errorf("Couldn't parse retention %q :: %v", r, err)
|
||||
}
|
||||
retentions = append(retentions, ret)
|
||||
}
|
||||
client := &Client{
|
||||
Addr: strings.Trim(cfg.Addr, "/"),
|
||||
Retentions: retentions,
|
||||
Limit: cfg.Limit,
|
||||
Filters: cfg.Filters,
|
||||
Normalize: cfg.Normalize,
|
||||
HardTS: cfg.HardTS,
|
||||
}
|
||||
return client, nil
|
||||
}
|
||||
173
app/vmctl/opentsdb/parser.go
Normal file
173
app/vmctl/opentsdb/parser.go
Normal file
@@ -0,0 +1,173 @@
|
||||
package opentsdb
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Patterns used to bring OpenTSDB names in line with the Prometheus data model.
var (
	// allowedNames matches metric names that need no rewriting.
	allowedNames = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9_:]*$`)
	// allowedFirstChar matches names that start with a letter.
	allowedFirstChar = regexp.MustCompile(`^[a-zA-Z]`)
	// replaceChars matches every character that gets replaced with "_".
	replaceChars = regexp.MustCompile(`[^a-zA-Z0-9_:]`)
	// allowedTagKeys matches tag keys that need no rewriting.
	allowedTagKeys = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9_]*$`)
)
|
||||
|
||||
// convertDuration parses a duration string into a time.Duration.
//
// Golang's time library doesn't support year, week and day units because they
// aren't consistent ranges, but Java's library does. Those units are handled
// here by converting them to hours with fixed assumptions (1y = 365d, 1w = 7d,
// 1d = 24h); they aren't *good* assumptions, but they're reasonable for this
// function. Strings ending in "h", "m" or "s" (which includes "ms") are handed
// to time.ParseDuration unchanged.
//
// For y/w/d the part before the single unit letter must be a plain integer,
// so malformed input such as "1dd" or "d1d" is rejected. Any unparsable input
// or unknown unit yields an error.
func convertDuration(duration string) (time.Duration, error) {
	var hoursPerUnit int
	switch {
	case strings.HasSuffix(duration, "y"):
		hoursPerUnit = 365 * 24
	case strings.HasSuffix(duration, "w"):
		hoursPerUnit = 7 * 24
	case strings.HasSuffix(duration, "d"):
		hoursPerUnit = 24
	case strings.HasSuffix(duration, "h"), strings.HasSuffix(duration, "m"), strings.HasSuffix(duration, "s"):
		parsed, err := time.ParseDuration(duration)
		if err != nil {
			return 0, fmt.Errorf("invalid time range: %q", duration)
		}
		return parsed, nil
	default:
		return 0, fmt.Errorf("invalid time duration string: %q", duration)
	}

	// Strip exactly one trailing unit letter; trimming by cutset would also
	// swallow repeated or leading unit letters.
	count, err := strconv.Atoi(duration[:len(duration)-1])
	if err != nil {
		return 0, fmt.Errorf("invalid time range: %q", duration)
	}
	// Going through ParseDuration keeps its overflow detection for huge values.
	parsed, err := time.ParseDuration(strconv.Itoa(count*hoursPerUnit) + "h")
	if err != nil {
		return 0, fmt.Errorf("invalid time range: %q", duration)
	}
	return parsed, nil
}
|
||||
|
||||
// Convert an incoming retention "string" into the component parts
|
||||
func convertRetention(retention string, offset int64, msecTime bool) (Retention, error) {
|
||||
/*
|
||||
A retention string coming in looks like
|
||||
sum-1m-avg:1h:30d
|
||||
So we:
|
||||
1. split on the :
|
||||
2. split on the - in slice 0
|
||||
3. create the time ranges we actually need
|
||||
*/
|
||||
chunks := strings.Split(retention, ":")
|
||||
if len(chunks) != 3 {
|
||||
return Retention{}, fmt.Errorf("invalid retention string: %q", retention)
|
||||
}
|
||||
rowLengthDuration, err := convertDuration(chunks[1])
|
||||
if err != nil {
|
||||
return Retention{}, fmt.Errorf("invalid row length (first order) duration string: %q: %s", chunks[1], err)
|
||||
}
|
||||
// set length of each row in milliseconds, unless we aren't using millisecond time in OpenTSDB...then use seconds
|
||||
rowLength := rowLengthDuration.Milliseconds()
|
||||
if !msecTime {
|
||||
rowLength = rowLength / 1000
|
||||
}
|
||||
ttlDuration, err := convertDuration(chunks[2])
|
||||
if err != nil {
|
||||
return Retention{}, fmt.Errorf("invalid ttl (second order) duration string: %q: %s", chunks[2], err)
|
||||
}
|
||||
// set ttl in milliseconds, unless we aren't using millisecond time in OpenTSDB...then use seconds
|
||||
ttl := ttlDuration.Milliseconds()
|
||||
if !msecTime {
|
||||
ttl = ttl / 1000
|
||||
}
|
||||
// bump by the offset so we don't look at empty ranges any time offset > ttl
|
||||
ttl += offset
|
||||
var timeChunks []TimeRange
|
||||
var i int64
|
||||
for i = offset; i <= ttl; i = i + rowLength {
|
||||
timeChunks = append(timeChunks, TimeRange{Start: i + rowLength, End: i})
|
||||
}
|
||||
// first/second order aggregations for queries defined in chunk 0...
|
||||
aggregates := strings.Split(chunks[0], "-")
|
||||
if len(aggregates) != 3 {
|
||||
return Retention{}, fmt.Errorf("invalid aggregation string: %q", chunks[0])
|
||||
}
|
||||
|
||||
ret := Retention{FirstOrder: aggregates[0],
|
||||
SecondOrder: aggregates[2],
|
||||
AggTime: aggregates[1],
|
||||
QueryRanges: timeChunks}
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
// This ensures any incoming data from OpenTSDB matches the Prometheus data model
|
||||
// https://prometheus.io/docs/concepts/data_model
|
||||
func modifyData(msg Metric, normalize bool) (Metric, error) {
|
||||
finalMsg := Metric{
|
||||
Metric: "", Tags: make(map[string]string),
|
||||
Timestamps: msg.Timestamps, Values: msg.Values,
|
||||
}
|
||||
// if the metric name has invalid characters, the data model says to drop it
|
||||
if !allowedFirstChar.MatchString(msg.Metric) {
|
||||
return Metric{}, fmt.Errorf("%s has a bad first character", msg.Metric)
|
||||
}
|
||||
name := msg.Metric
|
||||
// if normalization requested, lowercase the name
|
||||
if normalize {
|
||||
name = strings.ToLower(name)
|
||||
}
|
||||
/*
|
||||
replace bad characters in metric name with _ per the data model
|
||||
only replace if needed to reduce string processing time
|
||||
*/
|
||||
if !allowedNames.MatchString(name) {
|
||||
finalMsg.Metric = replaceChars.ReplaceAllString(name, "_")
|
||||
} else {
|
||||
finalMsg.Metric = name
|
||||
}
|
||||
// replace bad characters in tag keys with _ per the data model
|
||||
for key, value := range msg.Tags {
|
||||
// if normalization requested, lowercase the key and value
|
||||
if normalize {
|
||||
key = strings.ToLower(key)
|
||||
value = strings.ToLower(value)
|
||||
}
|
||||
/*
|
||||
replace all explicitly bad characters with _
|
||||
only replace if needed to reduce string processing time
|
||||
*/
|
||||
if !allowedTagKeys.MatchString(key) {
|
||||
key = replaceChars.ReplaceAllString(key, "_")
|
||||
}
|
||||
// tags that start with __ are considered custom stats for internal prometheus stuff, we should drop them
|
||||
if !strings.HasPrefix(key, "__") {
|
||||
finalMsg.Tags[key] = value
|
||||
}
|
||||
}
|
||||
return finalMsg, nil
|
||||
}
|
||||
217
app/vmctl/opentsdb/parser_test.go
Normal file
217
app/vmctl/opentsdb/parser_test.go
Normal file
@@ -0,0 +1,217 @@
|
||||
package opentsdb
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestConvertRetention(t *testing.T) {
|
||||
/*
|
||||
2592000 seconds in 30 days
|
||||
3600 in one hour
|
||||
2592000 / 3600 = 720 individual query "ranges" should exist, plus one because time ranges can be weird
|
||||
First order should == "sum"
|
||||
Second order should == "avg"
|
||||
AggTime should == "1m"
|
||||
*/
|
||||
res, err := convertRetention("sum-1m-avg:1h:30d", 0, false)
|
||||
if err != nil {
|
||||
t.Fatalf("Error parsing valid retention string: %v", err)
|
||||
}
|
||||
if len(res.QueryRanges) != 721 {
|
||||
t.Fatalf("Found %v query ranges. Should have found 720", len(res.QueryRanges))
|
||||
}
|
||||
if res.FirstOrder != "sum" {
|
||||
t.Fatalf("Incorrect first order aggregation %q. Should have been 'sum'", res.FirstOrder)
|
||||
}
|
||||
if res.SecondOrder != "avg" {
|
||||
t.Fatalf("Incorrect second order aggregation %q. Should have been 'avg'", res.SecondOrder)
|
||||
}
|
||||
if res.AggTime != "1m" {
|
||||
t.Fatalf("Incorrect aggregation time length %q. Should have been '1m'", res.AggTime)
|
||||
}
|
||||
/*
|
||||
Invalid retention string
|
||||
*/
|
||||
res, err = convertRetention("sum-1m-avg:30d", 0, false)
|
||||
if err == nil {
|
||||
t.Fatalf("Bad retention string (sum-1m-avg:30d) didn't fail: %v", res)
|
||||
}
|
||||
/*
|
||||
Invalid aggregation string
|
||||
*/
|
||||
res, err = convertRetention("sum-1m:1h:30d", 0, false)
|
||||
if err == nil {
|
||||
t.Fatalf("Bad aggregation string (sum-1m:1h:30d) didn't fail: %v", res)
|
||||
}
|
||||
}
|
||||
|
||||
func TestModifyData(t *testing.T) {
|
||||
/*
|
||||
Good metric metadata
|
||||
*/
|
||||
m := Metric{
|
||||
Metric: "cpu",
|
||||
Tags: map[string]string{
|
||||
"core": "0",
|
||||
},
|
||||
Values: []float64{
|
||||
0,
|
||||
},
|
||||
Timestamps: []int64{
|
||||
0,
|
||||
},
|
||||
}
|
||||
res, err := modifyData(m, false)
|
||||
if err != nil {
|
||||
t.Fatalf("Valid metric %v failed to parse: %v", m, err)
|
||||
}
|
||||
if res.Metric != "cpu" {
|
||||
t.Fatalf("Valid metric name %q was converted: %q", m.Metric, res.Metric)
|
||||
}
|
||||
found := false
|
||||
for k := range res.Tags {
|
||||
if k == "core" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("Valid metric tag name 'core' missing: %v", res.Tags)
|
||||
}
|
||||
|
||||
/*
|
||||
Bad first character in metric name
|
||||
metric names cannot start with _, so this
|
||||
metric should fail entirely
|
||||
*/
|
||||
m = Metric{
|
||||
Metric: "_cpu",
|
||||
Tags: map[string]string{
|
||||
"core": "0",
|
||||
},
|
||||
Values: []float64{
|
||||
0,
|
||||
},
|
||||
Timestamps: []int64{
|
||||
0,
|
||||
},
|
||||
}
|
||||
res, err = modifyData(m, false)
|
||||
if err == nil {
|
||||
t.Fatalf("Invalid metric %v parsed?", m)
|
||||
}
|
||||
|
||||
/*
|
||||
Bad character in metric name
|
||||
metric names cannot have `.`, so this
|
||||
should be converted to `_`
|
||||
*/
|
||||
m = Metric{
|
||||
Metric: "cpu.name",
|
||||
Tags: map[string]string{
|
||||
"core": "0",
|
||||
},
|
||||
Values: []float64{
|
||||
0,
|
||||
},
|
||||
Timestamps: []int64{
|
||||
0,
|
||||
},
|
||||
}
|
||||
res, err = modifyData(m, false)
|
||||
if err != nil {
|
||||
t.Fatalf("Valid metric failed to parse? %v", err)
|
||||
}
|
||||
if res.Metric != "cpu_name" {
|
||||
t.Fatalf("Metric name not correctly converted from 'cpu.name' to 'cpu_name': %q", res.Metric)
|
||||
}
|
||||
|
||||
/*
|
||||
bad tag prefix (__)
|
||||
Prometheus considers tags beginning with __
|
||||
to be internal use only. They should not show up in incoming data.
|
||||
this tag should be dropped from the result
|
||||
*/
|
||||
m = Metric{
|
||||
Metric: "cpu",
|
||||
Tags: map[string]string{
|
||||
"__core": "0",
|
||||
},
|
||||
Values: []float64{
|
||||
0,
|
||||
},
|
||||
Timestamps: []int64{
|
||||
0,
|
||||
},
|
||||
}
|
||||
res, err = modifyData(m, false)
|
||||
if err != nil {
|
||||
t.Fatalf("Valid metric failed to parse? %v", err)
|
||||
}
|
||||
found = false
|
||||
for k := range res.Tags {
|
||||
if k == "__core" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if found {
|
||||
t.Fatalf("Bad tag key prefix (__) found")
|
||||
}
|
||||
|
||||
/*
|
||||
bad tag key
|
||||
tag keys cannot contain `.`, this should be
|
||||
replaced with `_`
|
||||
*/
|
||||
m = Metric{
|
||||
Metric: "cpu",
|
||||
Tags: map[string]string{
|
||||
"core.name": "0",
|
||||
},
|
||||
Values: []float64{
|
||||
0,
|
||||
},
|
||||
Timestamps: []int64{
|
||||
0,
|
||||
},
|
||||
}
|
||||
res, err = modifyData(m, false)
|
||||
if err != nil {
|
||||
t.Fatalf("Valid metric failed to parse? %v", err)
|
||||
}
|
||||
found = false
|
||||
for k := range res.Tags {
|
||||
if k == "core.name" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if found {
|
||||
t.Fatalf("Bad tag key 'core.name' not converted")
|
||||
}
|
||||
|
||||
/*
|
||||
test normalize
|
||||
All characters should be returned lowercase
|
||||
*/
|
||||
m = Metric{
|
||||
Metric: "CPU",
|
||||
Tags: map[string]string{
|
||||
"core": "0",
|
||||
},
|
||||
Values: []float64{
|
||||
0,
|
||||
},
|
||||
Timestamps: []int64{
|
||||
0,
|
||||
},
|
||||
}
|
||||
res, err = modifyData(m, true)
|
||||
if err != nil {
|
||||
t.Fatalf("Valid metric failed to parse? %v", err)
|
||||
}
|
||||
if res.Metric != "cpu" {
|
||||
t.Fatalf("Normalization of metric name didn't happen!")
|
||||
}
|
||||
}
|
||||
398
app/vmctl/opentsdb/testdata/exampleOutput.json
vendored
Normal file
398
app/vmctl/opentsdb/testdata/exampleOutput.json
vendored
Normal file
@@ -0,0 +1,398 @@
|
||||
{
|
||||
"outputs": [
|
||||
{
|
||||
"id": "a",
|
||||
"alias": "query",
|
||||
"dps": [
|
||||
[
|
||||
1614099600000,
|
||||
0.28
|
||||
],
|
||||
[
|
||||
1614099660000,
|
||||
0.22
|
||||
],
|
||||
[
|
||||
1614099720000,
|
||||
0.18
|
||||
],
|
||||
[
|
||||
1614099780000,
|
||||
0.14
|
||||
],
|
||||
[
|
||||
1614099840000,
|
||||
0.24
|
||||
],
|
||||
[
|
||||
1614099900000,
|
||||
0.19
|
||||
],
|
||||
[
|
||||
1614099960000,
|
||||
0.22
|
||||
],
|
||||
[
|
||||
1614100020000,
|
||||
0.2
|
||||
],
|
||||
[
|
||||
1614100080000,
|
||||
0.18
|
||||
],
|
||||
[
|
||||
1614100140000,
|
||||
0.22
|
||||
],
|
||||
[
|
||||
1614100200000,
|
||||
0.17
|
||||
],
|
||||
[
|
||||
1614100260000,
|
||||
0.16
|
||||
],
|
||||
[
|
||||
1614100320000,
|
||||
0.22
|
||||
],
|
||||
[
|
||||
1614100380000,
|
||||
0.3
|
||||
],
|
||||
[
|
||||
1614100440000,
|
||||
0.28
|
||||
],
|
||||
[
|
||||
1614100500000,
|
||||
0.27
|
||||
],
|
||||
[
|
||||
1614100560000,
|
||||
0.26
|
||||
],
|
||||
[
|
||||
1614100620000,
|
||||
0.23
|
||||
],
|
||||
[
|
||||
1614100680000,
|
||||
0.18
|
||||
],
|
||||
[
|
||||
1614100740000,
|
||||
0.3
|
||||
],
|
||||
[
|
||||
1614100800000,
|
||||
0.24
|
||||
],
|
||||
[
|
||||
1614100860000,
|
||||
0.19
|
||||
],
|
||||
[
|
||||
1614100920000,
|
||||
0.16
|
||||
],
|
||||
[
|
||||
1614100980000,
|
||||
0.19
|
||||
],
|
||||
[
|
||||
1614101040000,
|
||||
0.23
|
||||
],
|
||||
[
|
||||
1614101100000,
|
||||
0.18
|
||||
],
|
||||
[
|
||||
1614101160000,
|
||||
0.15
|
||||
],
|
||||
[
|
||||
1614101220000,
|
||||
0.12
|
||||
],
|
||||
[
|
||||
1614101280000,
|
||||
0.1
|
||||
],
|
||||
[
|
||||
1614101340000,
|
||||
0.24
|
||||
],
|
||||
[
|
||||
1614101400000,
|
||||
0.19
|
||||
],
|
||||
[
|
||||
1614101460000,
|
||||
0.16
|
||||
],
|
||||
[
|
||||
1614101520000,
|
||||
0.14
|
||||
],
|
||||
[
|
||||
1614101580000,
|
||||
0.12
|
||||
],
|
||||
[
|
||||
1614101640000,
|
||||
0.14
|
||||
],
|
||||
[
|
||||
1614101700000,
|
||||
0.12
|
||||
],
|
||||
[
|
||||
1614101760000,
|
||||
0.13
|
||||
],
|
||||
[
|
||||
1614101820000,
|
||||
0.12
|
||||
],
|
||||
[
|
||||
1614101880000,
|
||||
0.11
|
||||
],
|
||||
[
|
||||
1614101940000,
|
||||
0.36
|
||||
],
|
||||
[
|
||||
1614102000000,
|
||||
0.35
|
||||
],
|
||||
[
|
||||
1614102060000,
|
||||
0.3
|
||||
],
|
||||
[
|
||||
1614102120000,
|
||||
0.32
|
||||
],
|
||||
[
|
||||
1614102180000,
|
||||
0.27
|
||||
],
|
||||
[
|
||||
1614102240000,
|
||||
0.26
|
||||
],
|
||||
[
|
||||
1614102300000,
|
||||
0.21
|
||||
],
|
||||
[
|
||||
1614102360000,
|
||||
0.18
|
||||
],
|
||||
[
|
||||
1614102420000,
|
||||
0.15
|
||||
],
|
||||
[
|
||||
1614102480000,
|
||||
0.12
|
||||
],
|
||||
[
|
||||
1614102540000,
|
||||
0.24
|
||||
],
|
||||
[
|
||||
1614102600000,
|
||||
0.2
|
||||
],
|
||||
[
|
||||
1614102660000,
|
||||
0.17
|
||||
],
|
||||
[
|
||||
1614102720000,
|
||||
0.18
|
||||
],
|
||||
[
|
||||
1614102780000,
|
||||
0.14
|
||||
],
|
||||
[
|
||||
1614102840000,
|
||||
0.39
|
||||
],
|
||||
[
|
||||
1614102900000,
|
||||
0.31
|
||||
],
|
||||
[
|
||||
1614102960000,
|
||||
0.3
|
||||
],
|
||||
[
|
||||
1614103020000,
|
||||
0.24
|
||||
],
|
||||
[
|
||||
1614103080000,
|
||||
0.26
|
||||
],
|
||||
[
|
||||
1614103140000,
|
||||
0.21
|
||||
],
|
||||
[
|
||||
1614103200000,
|
||||
0.17
|
||||
],
|
||||
[
|
||||
1614103260000,
|
||||
0.15
|
||||
],
|
||||
[
|
||||
1614103320000,
|
||||
0.2
|
||||
],
|
||||
[
|
||||
1614103380000,
|
||||
0.2
|
||||
],
|
||||
[
|
||||
1614103440000,
|
||||
0.22
|
||||
],
|
||||
[
|
||||
1614103500000,
|
||||
0.19
|
||||
],
|
||||
[
|
||||
1614103560000,
|
||||
0.22
|
||||
],
|
||||
[
|
||||
1614103620000,
|
||||
0.29
|
||||
],
|
||||
[
|
||||
1614103680000,
|
||||
0.31
|
||||
],
|
||||
[
|
||||
1614103740000,
|
||||
0.28
|
||||
],
|
||||
[
|
||||
1614103800000,
|
||||
0.23
|
||||
]
|
||||
],
|
||||
"dpsMeta": {
|
||||
"firstTimestamp": 1614099600000,
|
||||
"lastTimestamp": 1614103800000,
|
||||
"setCount": 71,
|
||||
"series": 1
|
||||
},
|
||||
"meta": [
|
||||
{
|
||||
"index": 0,
|
||||
"metrics": [
|
||||
"timestamp"
|
||||
]
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"metrics": [
|
||||
"system.load5"
|
||||
],
|
||||
"commonTags": {
|
||||
"rack": "undef",
|
||||
"host": "use1-mon-metrics-1",
|
||||
"row": "undef",
|
||||
"dc": "us-east-1",
|
||||
"group": "monitoring"
|
||||
},
|
||||
"aggregatedTags": []
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"query": {
|
||||
"name": null,
|
||||
"time": {
|
||||
"start": "1h-ago",
|
||||
"end": null,
|
||||
"timezone": null,
|
||||
"downsampler": {
|
||||
"interval": "1m",
|
||||
"aggregator": "avg",
|
||||
"fillPolicy": {
|
||||
"policy": "nan",
|
||||
"value": "NaN"
|
||||
}
|
||||
},
|
||||
"aggregator": "sum",
|
||||
"rate": false
|
||||
},
|
||||
"filters": [
|
||||
{
|
||||
"id": "f1",
|
||||
"tags": [
|
||||
{
|
||||
"tagk": "host",
|
||||
"filter": "use1-mon-metrics-1",
|
||||
"group_by": true,
|
||||
"type": "literal_or"
|
||||
},
|
||||
{
|
||||
"tagk": "group",
|
||||
"filter": "monitoring",
|
||||
"group_by": true,
|
||||
"type": "literal_or"
|
||||
},
|
||||
{
|
||||
"tagk": "dc",
|
||||
"filter": "us-east-1",
|
||||
"group_by": true,
|
||||
"type": "literal_or"
|
||||
},
|
||||
{
|
||||
"tagk": "rack",
|
||||
"filter": "undef",
|
||||
"group_by": true,
|
||||
"type": "literal_or"
|
||||
},
|
||||
{
|
||||
"tagk": "row",
|
||||
"filter": "undef",
|
||||
"group_by": true,
|
||||
"type": "literal_or"
|
||||
}
|
||||
],
|
||||
"explicitTags": false
|
||||
}
|
||||
],
|
||||
"metrics": [
|
||||
{
|
||||
"metric": "system.load5",
|
||||
"id": "a",
|
||||
"filter": "f1",
|
||||
"aggregator": null,
|
||||
"timeOffset": null,
|
||||
"fillPolicy": {
|
||||
"policy": "nan",
|
||||
"value": "NaN"
|
||||
}
|
||||
}
|
||||
],
|
||||
"expressions": [],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "a",
|
||||
"alias": "query"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
62
app/vmctl/opentsdb/testdata/exampleQuery.json
vendored
Normal file
62
app/vmctl/opentsdb/testdata/exampleQuery.json
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
{
|
||||
"time": {
|
||||
"start": "1h-ago",
|
||||
"aggregator":"sum",
|
||||
"downsampler": {
|
||||
"interval": "1m",
|
||||
"aggregator": "avg",
|
||||
"fillPolicy": {
|
||||
"policy": "nan"
|
||||
}
|
||||
}
|
||||
},
|
||||
"filters": [
|
||||
{
|
||||
"tags": [
|
||||
{
|
||||
"type": "literal_or",
|
||||
"tagk": "host",
|
||||
"filter": "use1-mon-metrics-1",
|
||||
"groupBy": true
|
||||
},
|
||||
{
|
||||
"type": "literal_or",
|
||||
"tagk": "group",
|
||||
"filter": "monitoring",
|
||||
"groupBy": true
|
||||
},
|
||||
{
|
||||
"type": "literal_or",
|
||||
"tagk": "dc",
|
||||
"filter": "us-east-1",
|
||||
"groupBy": true
|
||||
},
|
||||
{
|
||||
"type": "literal_or",
|
||||
"tagk": "rack",
|
||||
"filter": "undef",
|
||||
"groupBy": true
|
||||
},
|
||||
{
|
||||
"type": "literal_or",
|
||||
"tagk": "row",
|
||||
"filter": "undef",
|
||||
"groupBy": true
|
||||
}
|
||||
],
|
||||
"id": "f1"
|
||||
}
|
||||
],
|
||||
"metrics": [
|
||||
{
|
||||
"id": "a",
|
||||
"metric": "system.load5",
|
||||
"filter": "f1",
|
||||
"fillPolicy":{"policy":"nan"}
|
||||
}
|
||||
],
|
||||
"expressions": [],
|
||||
"outputs":[
|
||||
{"id":"a", "alias":"query"}
|
||||
]
|
||||
}
|
||||
@@ -56,27 +56,37 @@ func (cw *cWriter) printf(format string, args ...interface{}) {
|
||||
|
||||
//"{"metric":{"__name__":"cpu_usage_guest","arch":"x64","hostname":"host_19",},"timestamps":[1567296000000,1567296010000],"values":[1567296000000,66]}
|
||||
func (ts *TimeSeries) write(w io.Writer) (int, error) {
|
||||
pointsCount := len(ts.Timestamps)
|
||||
if pointsCount == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
timestamps := ts.Timestamps
|
||||
values := ts.Values
|
||||
cw := &cWriter{w: w}
|
||||
cw.printf(`{"metric":{"__name__":%q`, ts.Name)
|
||||
if len(ts.LabelPairs) > 0 {
|
||||
for len(timestamps) > 0 {
|
||||
// Split long lines with more than 10K samples into multiple JSON lines.
|
||||
// This should limit memory usage at VictoriaMetrics during data ingestion,
|
||||
// since it allocates memory for the whole JSON line and processes it in one go.
|
||||
batchSize := 10000
|
||||
if batchSize > len(timestamps) {
|
||||
batchSize = len(timestamps)
|
||||
}
|
||||
timestampsBatch := timestamps[:batchSize]
|
||||
valuesBatch := values[:batchSize]
|
||||
timestamps = timestamps[batchSize:]
|
||||
values = values[batchSize:]
|
||||
|
||||
cw.printf(`{"metric":{"__name__":%q`, ts.Name)
|
||||
for _, lp := range ts.LabelPairs {
|
||||
cw.printf(",%q:%q", lp.Name, lp.Value)
|
||||
}
|
||||
}
|
||||
|
||||
cw.printf(`},"timestamps":[`)
|
||||
for i := 0; i < pointsCount-1; i++ {
|
||||
cw.printf(`%d,`, ts.Timestamps[i])
|
||||
pointsCount := len(timestampsBatch)
|
||||
cw.printf(`},"timestamps":[`)
|
||||
for i := 0; i < pointsCount-1; i++ {
|
||||
cw.printf(`%d,`, timestampsBatch[i])
|
||||
}
|
||||
cw.printf(`%d],"values":[`, timestampsBatch[pointsCount-1])
|
||||
for i := 0; i < pointsCount-1; i++ {
|
||||
cw.printf(`%v,`, valuesBatch[i])
|
||||
}
|
||||
cw.printf("%v]}\n", valuesBatch[pointsCount-1])
|
||||
}
|
||||
cw.printf(`%d],"values":[`, ts.Timestamps[pointsCount-1])
|
||||
for i := 0; i < pointsCount-1; i++ {
|
||||
cw.printf(`%v,`, ts.Values[i])
|
||||
}
|
||||
cw.printf("%v]}\n", ts.Values[pointsCount-1])
|
||||
return cw.n, cw.err
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@ import (
|
||||
type Config struct {
|
||||
// VictoriaMetrics address to perform import requests
|
||||
// --httpListenAddr value for single node version
|
||||
// --httpListenAddr value of VMSelect component for cluster version
|
||||
// --httpListenAddr value of vmselect component for cluster version
|
||||
Addr string
|
||||
// Concurrency defines number of worker
|
||||
// performing the import requests concurrently
|
||||
@@ -51,7 +51,7 @@ type Config struct {
|
||||
|
||||
// Importer performs insertion of timeseries
|
||||
// via VictoriaMetrics import protocol
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master#how-to-import-time-series-data
|
||||
// see https://docs.victoriametrics.com/#how-to-import-time-series-data
|
||||
type Importer struct {
|
||||
addr string
|
||||
importPath string
|
||||
@@ -105,11 +105,11 @@ func NewImporter(cfg Config) (*Importer, error) {
|
||||
|
||||
addr := strings.TrimRight(cfg.Addr, "/")
|
||||
// if single version
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master#how-to-import-time-series-data
|
||||
// see https://docs.victoriametrics.com/#how-to-import-time-series-data
|
||||
importPath := addr + "/api/v1/import"
|
||||
if cfg.AccountID != "" {
|
||||
// if cluster version
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster#url-format
|
||||
// see https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
|
||||
importPath = fmt.Sprintf("%s/insert/%s/prometheus/api/v1/import", addr, cfg.AccountID)
|
||||
}
|
||||
importPath, err := AddExtraLabelsToImportPath(importPath, cfg.ExtraLabels)
|
||||
|
||||
289
app/vmgateway/README.md
Normal file
289
app/vmgateway/README.md
Normal file
@@ -0,0 +1,289 @@
|
||||
# vmgateway
|
||||
|
||||
***vmgateway is a part of [enterprise package](https://victoriametrics.com/enterprise.html)***
|
||||
|
||||
|
||||
<img alt="vmgateway" src="vmgateway-overview.jpeg">
|
||||
|
||||
`vmgateway` is a proxy for the VictoriaMetrics Time Series Database (TSDB). It provides the following features:
|
||||
|
||||
* Rate Limiter
|
||||
* Based on cluster tenant's utilization, it supports multiple time interval limits for both the ingestion and retrieval of metrics
|
||||
* Token Access Control
|
||||
* Supports additional per-label access control for both the Single and Cluster versions of the VictoriaMetrics TSDB
|
||||
* Provides access by tenantID in the Cluster version
|
||||
* Allows for separate write/read/admin access to data
|
||||
|
||||
`vmgateway` is included in our [enterprise packages](https://victoriametrics.com/enterprise.html).
|
||||
|
||||
|
||||
## Access Control
|
||||
|
||||
<img alt="vmgateway-ac" src="vmgateway-access-control.jpg">
|
||||
|
||||
`vmgateway` supports jwt based authentication. The jwt payload can be configured to give access to specific tenants and labels, as well as to restrict access to read or write operations.
|
||||
|
||||
jwt token must be in the following format:
|
||||
```json
|
||||
{
|
||||
"exp": 1617304574,
|
||||
"vm_access": {
|
||||
"tenant_id": {
|
||||
"account_id": 1,
|
||||
"project_id": 5
|
||||
},
|
||||
"extra_labels": {
|
||||
"team": "dev",
|
||||
"project": "mobile"
|
||||
},
|
||||
"mode": 1
|
||||
}
|
||||
}
|
||||
```
|
||||
Where:
|
||||
- `exp` - required, expire time in unix_timestamp. If the token expires then `vmgateway` rejects the request.
|
||||
- `vm_access` - required, dict with claim info, minimum form: `{"vm_access": {"tenant_id": {}}}`
|
||||
- `tenant_id` - optional, for cluster mode, routes requests to the corresponding tenant.
|
||||
- `extra_labels` - optional, key-value pairs for label filters added to the ingested or selected metrics.
|
||||
- `mode` - optional, access mode for api - read, write, or full. Supported values: 0 - full (default value), 1 - read, 2 - write.
|
||||
|
||||
## QuickStart
|
||||
|
||||
Start the single version of VictoriaMetrics
|
||||
|
||||
```bash
|
||||
# single
|
||||
# start node
|
||||
./bin/victoria-metrics --selfScrapeInterval=10s
|
||||
```
|
||||
|
||||
Start vmgateway
|
||||
|
||||
```bash
|
||||
./bin/vmgateway -eula -enable.auth -read.url http://localhost:8428 --write.url http://localhost:8428
|
||||
```
|
||||
|
||||
Retrieve data from the database
|
||||
```bash
|
||||
curl 'http://localhost:8431/api/v1/series/count' -H 'Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ2bV9hY2Nlc3MiOnsidGVuYW50X2lkIjp7fSwicm9sZSI6MX0sImV4cCI6MTkzOTM0NjIxMH0.5WUxEfdcV9hKo4CtQdtuZYOGpGXWwaqM9VuVivMMrVg'
|
||||
```
|
||||
|
||||
A request with an incorrect token or without any token will be rejected:
|
||||
```bash
|
||||
curl 'http://localhost:8431/api/v1/series/count'
|
||||
|
||||
curl 'http://localhost:8431/api/v1/series/count' -H 'Authorization: Bearer incorrect-token'
|
||||
```
|
||||
|
||||
|
||||
## Rate Limiter
|
||||
|
||||
<img alt="vmgateway-rl" src="vmgateway-rate-limiting.jpg">
|
||||
|
||||
Limits incoming requests by given, pre-configured limits. It supports read and write limiting by tenant.
|
||||
|
||||
`vmgateway` needs a datasource for rate limit queries. It can be either single-node or cluster version of `victoria-metrics`.
|
||||
The metrics that you want to rate limit must be scraped from the cluster.
|
||||
|
||||
List of supported limit types:
|
||||
- `queries` - count of api requests made at tenant to read the api, such as `/api/v1/query`, `/api/v1/series` and others.
|
||||
- `active_series` - count of current active series at any given tenant.
|
||||
- `new_series` - count of created series; aka churn rate
|
||||
- `rows_inserted` - count of inserted rows per tenant.
|
||||
|
||||
List of supported time windows:
|
||||
- `minute`
|
||||
- `hour`
|
||||
|
||||
Limits can be specified per tenant or at a global level if you omit `project_id` and `account_id`.
|
||||
|
||||
Example of configuration file:
|
||||
|
||||
```yaml
|
||||
limits:
|
||||
- type: queries
|
||||
value: 1000
|
||||
resolution: minute
|
||||
- type: queries
|
||||
value: 10000
|
||||
resolution: hour
|
||||
- type: queries
|
||||
value: 10
|
||||
resolution: minute
|
||||
project_id: 5
|
||||
account_id: 1
|
||||
```
|
||||
|
||||
## QuickStart
|
||||
|
||||
The cluster version of VictoriaMetrics is required for rate limiting.
|
||||
```bash
|
||||
# start datasource for cluster metrics
|
||||
|
||||
cat << EOF > cluster.yaml
|
||||
scrape_configs:
|
||||
- job_name: cluster
|
||||
scrape_interval: 5s
|
||||
static_configs:
|
||||
- targets: ['127.0.0.1:8481','127.0.0.1:8482','127.0.0.1:8480']
|
||||
EOF
|
||||
|
||||
./bin/victoria-metrics --promscrape.config cluster.yaml
|
||||
|
||||
# start cluster
|
||||
|
||||
# start vmstorage, vmselect and vminsert
|
||||
./bin/vmstorage -eula
|
||||
./bin/vmselect -eula -storageNode 127.0.0.1:8401
|
||||
./bin/vminsert -eula -storageNode 127.0.0.1:8400
|
||||
|
||||
# create base rate limiting config:
|
||||
cat << EOF > limit.yaml
|
||||
limits:
|
||||
- type: queries
|
||||
value: 100
|
||||
- type: rows_inserted
|
||||
value: 100000
|
||||
- type: new_series
|
||||
value: 1000
|
||||
- type: active_series
|
||||
value: 100000
|
||||
- type: queries
|
||||
value: 1
|
||||
account_id: 15
|
||||
EOF
|
||||
|
||||
# start gateway with clusterMode
|
||||
./bin/vmgateway -eula -enable.rateLimit -ratelimit.config limit.yaml -datasource.url http://localhost:8428 -enable.auth -clusterMode -write.url=http://localhost:8480 --read.url=http://localhost:8481
|
||||
|
||||
# ingest simple metric to the tenant with account_id 15
|
||||
curl 'http://localhost:8431/api/v1/import/prometheus' -X POST -d 'foo{bar="baz1"} 123' -H 'Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2MjAxNjIwMDAwMDAsInZtX2FjY2VzcyI6eyJ0ZW5hbnRfaWQiOnsiYWNjb3VudF9pZCI6MTV9fX0.PB1_KXDKPUp-40pxOGk6lt_jt9Yq80PIMpWVJqSForQ'
|
||||
# read metric from the tenant with account_id 15
|
||||
curl 'http://localhost:8431/api/v1/labels' -H 'Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2MjAxNjIwMDAwMDAsInZtX2FjY2VzcyI6eyJ0ZW5hbnRfaWQiOnsiYWNjb3VudF9pZCI6MTV9fX0.PB1_KXDKPUp-40pxOGk6lt_jt9Yq80PIMpWVJqSForQ'
|
||||
|
||||
# check rate limit
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
The shortlist of configuration flags include the following:
|
||||
```console
|
||||
-clusterMode
|
||||
enable this for the cluster version
|
||||
-datasource.appendTypePrefix
|
||||
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.
|
||||
-datasource.basicAuth.password string
|
||||
Optional basic auth password for -datasource.url
|
||||
-datasource.basicAuth.username string
|
||||
Optional basic auth username for -datasource.url
|
||||
-datasource.lookback duration
|
||||
Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
|
||||
-datasource.maxIdleConnections int
|
||||
Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
|
||||
-datasource.queryStep duration
|
||||
queryStep defines how far a value can fallback to when evaluating queries. For example, if datasource.queryStep=15s then param "step" with value "15s" will be added to every query.
|
||||
-datasource.tlsCAFile string
|
||||
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
|
||||
-datasource.tlsCertFile string
|
||||
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
|
||||
-datasource.tlsInsecureSkipVerify
|
||||
Whether to skip tls verification when connecting to -datasource.url
|
||||
-datasource.tlsKeyFile string
|
||||
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
|
||||
-datasource.tlsServerName string
|
||||
Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used
|
||||
-datasource.url string
|
||||
VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428
|
||||
-enable.auth
|
||||
enables auth with jwt token
|
||||
-enable.rateLimit
|
||||
enables rate limiter
|
||||
-enableTCP6
|
||||
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
|
||||
-envflag.enable
|
||||
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
|
||||
-envflag.prefix string
|
||||
Prefix for environment variables if -envflag.enable is set
|
||||
-eula
|
||||
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf
|
||||
-fs.disableMmap
|
||||
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches as they cannot read data files larger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
|
||||
-http.connTimeout duration
|
||||
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
|
||||
-http.disableResponseCompression
|
||||
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
|
||||
-http.idleConnTimeout duration
|
||||
Timeout for incoming idle http connections (default 1m0s)
|
||||
-http.maxGracefulShutdownDuration duration
|
||||
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
|
||||
-http.pathPrefix string
|
||||
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
|
||||
-http.shutdownDelay duration
|
||||
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
|
||||
-httpAuth.password string
|
||||
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
|
||||
-httpAuth.username string
|
||||
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
|
||||
-httpListenAddr string
|
||||
TCP address to listen for http connections (default ":8431")
|
||||
-loggerDisableTimestamps
|
||||
Whether to disable writing timestamps in logs
|
||||
-loggerErrorsPerSecondLimit int
|
||||
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
|
||||
-loggerFormat string
|
||||
Format for logs. Possible values: default, json (default "default")
|
||||
-loggerLevel string
|
||||
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
|
||||
-loggerOutput string
|
||||
Output for the logs. Supported values: stderr, stdout (default "stderr")
|
||||
-loggerTimezone string
|
||||
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
|
||||
-loggerWarnsPerSecondLimit int
|
||||
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
|
||||
-memory.allowedBytes size
|
||||
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
|
||||
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
|
||||
-memory.allowedPercent float
|
||||
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
|
||||
-metricsAuthKey string
|
||||
Auth key for /metrics. It overrides httpAuth settings
|
||||
-pprofAuthKey string
|
||||
Auth key for /debug/pprof. It overrides httpAuth settings
|
||||
-ratelimit.config string
|
||||
path for configuration file
|
||||
-ratelimit.extraLabels array
|
||||
additional labels, that will be applied to fetchdata from datasource
|
||||
Supports an array of values separated by comma or specified via multiple flags.
|
||||
-ratelimit.refreshInterval duration
|
||||
(default 5s)
|
||||
-read.url string
|
||||
read access url address, example: http://vmselect:8481
|
||||
-tls
|
||||
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
|
||||
-tlsCertFile string
|
||||
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
|
||||
-tlsKeyFile string
|
||||
Path to file with TLS key. Used only if -tls is set
|
||||
-version
|
||||
Show VictoriaMetrics version
|
||||
-write.url string
|
||||
write access url address, example: http://vminsert:8480
|
||||
|
||||
```
|
||||
|
||||
## TroubleShooting
|
||||
|
||||
* Access control:
|
||||
* incorrect `jwt` format, try https://jwt.io/#debugger-io with our tokens
|
||||
* expired token, check `exp` field.
|
||||
* Rate Limiting:
|
||||
* `scrape_interval` at datasource, reduce it to apply limits faster.
|
||||
|
||||
|
||||
## Limitations
|
||||
|
||||
* Access Control:
|
||||
* `jwt` token must be validated by external system, currently `vmgateway` can't validate the signature.
|
||||
* RateLimiting:
|
||||
* limits applied based on queries to `datasource.url`
|
||||
* only cluster version can be rate-limited.
|
||||
BIN
app/vmgateway/vmgateway-access-control.jpg
Normal file
BIN
app/vmgateway/vmgateway-access-control.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 40 KiB |
BIN
app/vmgateway/vmgateway-overview.jpeg
Normal file
BIN
app/vmgateway/vmgateway-overview.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 48 KiB |
BIN
app/vmgateway/vmgateway-rate-limiting.jpg
Normal file
BIN
app/vmgateway/vmgateway-rate-limiting.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 35 KiB |
BIN
app/vmgateway/vmgateway.png
Normal file
BIN
app/vmgateway/vmgateway.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 48 KiB |
@@ -14,7 +14,7 @@ import (
|
||||
|
||||
// InsertCtx contains common bits for data points insertion.
|
||||
type InsertCtx struct {
|
||||
Labels []prompb.Label
|
||||
Labels sortedLabels
|
||||
|
||||
mrs []storage.MetricRow
|
||||
metricNamesBuf []byte
|
||||
|
||||
32
app/vminsert/common/sort_labels.go
Normal file
32
app/vminsert/common/sort_labels.go
Normal file
@@ -0,0 +1,32 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"sort"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
)
|
||||
|
||||
var sortLabels = flag.Bool("sortLabels", false, `Whether to sort labels for incoming samples before writing them to storage. `+
|
||||
`This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. `+
|
||||
`For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. `+
|
||||
`Enabled sorting for labels can slow down ingestion performance a bit`)
|
||||
|
||||
// SortLabelsIfNeeded sorts labels if -sortLabels command-line flag is set
|
||||
func (ctx *InsertCtx) SortLabelsIfNeeded() {
|
||||
if *sortLabels {
|
||||
sort.Sort(&ctx.Labels)
|
||||
}
|
||||
}
|
||||
|
||||
type sortedLabels []prompb.Label
|
||||
|
||||
func (sl *sortedLabels) Len() int { return len(*sl) }
|
||||
func (sl *sortedLabels) Less(i, j int) bool {
|
||||
a := *sl
|
||||
return string(a[i].Name) < string(a[j].Name)
|
||||
}
|
||||
func (sl *sortedLabels) Swap(i, j int) {
|
||||
a := *sl
|
||||
a[i], a[j] = a[j], a[i]
|
||||
}
|
||||
@@ -55,6 +55,7 @@ func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
|
||||
// Skip metric without labels.
|
||||
continue
|
||||
}
|
||||
ctx.SortLabelsIfNeeded()
|
||||
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@ func insertRows(rows []parser.Row) error {
|
||||
// Skip metric without labels.
|
||||
continue
|
||||
}
|
||||
ctx.SortLabelsIfNeeded()
|
||||
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -96,7 +96,8 @@ func insertRows(db string, rows []parser.Row, extraLabels []prompbmarshal.Label)
|
||||
if !*skipMeasurement {
|
||||
ctx.metricGroupBuf = append(ctx.metricGroupBuf, r.Measurement...)
|
||||
}
|
||||
skipFieldKey := len(r.Fields) == 1 && *skipSingleField
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1139
|
||||
skipFieldKey := len(r.Measurement) > 0 && len(r.Fields) == 1 && *skipSingleField
|
||||
if len(ctx.metricGroupBuf) > 0 && !skipFieldKey {
|
||||
ctx.metricGroupBuf = append(ctx.metricGroupBuf, *measurementFieldSeparator...)
|
||||
}
|
||||
@@ -116,11 +117,13 @@ func insertRows(db string, rows []parser.Row, extraLabels []prompbmarshal.Label)
|
||||
// Skip metric without labels.
|
||||
continue
|
||||
}
|
||||
ic.SortLabelsIfNeeded()
|
||||
if err := ic.WriteDataPoint(nil, ic.Labels, r.Timestamp, f.Value); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ic.SortLabelsIfNeeded()
|
||||
ctx.metricNameBuf = storage.MarshalMetricNameRaw(ctx.metricNameBuf[:0], ic.Labels)
|
||||
labelsLen := len(ic.Labels)
|
||||
for j := range r.Fields {
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/csvimport"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite"
|
||||
@@ -19,6 +20,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/relabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/vmimport"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/influxutils"
|
||||
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
|
||||
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
|
||||
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
|
||||
@@ -34,7 +36,7 @@ import (
|
||||
var (
|
||||
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
|
||||
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. "+
|
||||
"This flag isn't needed when ingesting data over HTTP - just send it to `http://<victoriametrics>:8428/write`")
|
||||
"This flag isn't needed when ingesting data over HTTP - just send it to http://<victoriametrics>:8428/write")
|
||||
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpentTSDB metrics. "+
|
||||
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
|
||||
"Usually :4242 must be set. Doesn't work if empty")
|
||||
@@ -43,8 +45,8 @@ var (
|
||||
)
|
||||
|
||||
var (
|
||||
influxServer *influxserver.Server
|
||||
graphiteServer *graphiteserver.Server
|
||||
influxServer *influxserver.Server
|
||||
opentsdbServer *opentsdbserver.Server
|
||||
opentsdbhttpServer *opentsdbhttpserver.Server
|
||||
)
|
||||
@@ -55,12 +57,12 @@ func Init() {
|
||||
storage.SetMaxLabelsPerTimeseries(*maxLabelsPerTimeseries)
|
||||
common.StartUnmarshalWorkers()
|
||||
writeconcurrencylimiter.Init()
|
||||
if len(*influxListenAddr) > 0 {
|
||||
influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
|
||||
}
|
||||
if len(*graphiteListenAddr) > 0 {
|
||||
graphiteServer = graphiteserver.MustStart(*graphiteListenAddr, graphite.InsertHandler)
|
||||
}
|
||||
if len(*influxListenAddr) > 0 {
|
||||
influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
|
||||
}
|
||||
if len(*opentsdbListenAddr) > 0 {
|
||||
opentsdbServer = opentsdbserver.MustStart(*opentsdbListenAddr, opentsdb.InsertHandler, opentsdbhttp.InsertHandler)
|
||||
}
|
||||
@@ -73,12 +75,12 @@ func Init() {
|
||||
// Stop stops vminsert.
|
||||
func Stop() {
|
||||
promscrape.Stop()
|
||||
if len(*influxListenAddr) > 0 {
|
||||
influxServer.MustStop()
|
||||
}
|
||||
if len(*graphiteListenAddr) > 0 {
|
||||
graphiteServer.MustStop()
|
||||
}
|
||||
if len(*influxListenAddr) > 0 {
|
||||
influxServer.MustStop()
|
||||
}
|
||||
if len(*opentsdbListenAddr) > 0 {
|
||||
opentsdbServer.MustStop()
|
||||
}
|
||||
@@ -90,13 +92,16 @@ func Stop() {
|
||||
|
||||
// RequestHandler is a handler for Prometheus remote storage write API
|
||||
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
startTime := time.Now()
|
||||
defer requestDuration.UpdateDuration(startTime)
|
||||
|
||||
path := strings.Replace(r.URL.Path, "//", "/", -1)
|
||||
switch path {
|
||||
case "/prometheus/api/v1/write", "/api/v1/write":
|
||||
prometheusWriteRequests.Inc()
|
||||
if err := promremotewrite.InsertHandler(r); err != nil {
|
||||
prometheusWriteErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -105,7 +110,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
vmimportRequests.Inc()
|
||||
if err := vmimport.InsertHandler(r); err != nil {
|
||||
vmimportErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -114,7 +119,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
csvimportRequests.Inc()
|
||||
if err := csvimport.InsertHandler(r); err != nil {
|
||||
csvimportErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -123,7 +128,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
prometheusimportRequests.Inc()
|
||||
if err := prometheusimport.InsertHandler(r); err != nil {
|
||||
prometheusimportErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -132,7 +137,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
nativeimportRequests.Inc()
|
||||
if err := native.InsertHandler(r); err != nil {
|
||||
nativeimportErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
@@ -141,16 +146,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
influxWriteRequests.Inc()
|
||||
if err := influx.InsertHandlerForHTTP(r); err != nil {
|
||||
influxWriteErrors.Inc()
|
||||
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
return true
|
||||
case "/influx/query", "/query":
|
||||
// Emulate fake response for influx query.
|
||||
// This is required for TSBS benchmark.
|
||||
influxQueryRequests.Inc()
|
||||
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
|
||||
influxutils.WriteDatabaseNames(w)
|
||||
return true
|
||||
case "/prometheus/targets", "/targets":
|
||||
promscrapeTargetsRequests.Inc()
|
||||
@@ -184,6 +187,8 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
}
|
||||
|
||||
var (
|
||||
requestDuration = metrics.NewHistogram(`vminsert_request_duration_seconds`)
|
||||
|
||||
prometheusWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/write", protocol="promremotewrite"}`)
|
||||
prometheusWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/write", protocol="promremotewrite"}`)
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ func insertRows(block *parser.Block, extraLabels []prompbmarshal.Label) error {
|
||||
// Skip metric without labels.
|
||||
return nil
|
||||
}
|
||||
ic.SortLabelsIfNeeded()
|
||||
ctx.metricNameBuf = storage.MarshalMetricNameRaw(ctx.metricNameBuf[:0], ic.Labels)
|
||||
values := block.Values
|
||||
timestamps := block.Timestamps
|
||||
|
||||
@@ -45,6 +45,7 @@ func insertRows(rows []parser.Row) error {
|
||||
// Skip metric without labels.
|
||||
continue
|
||||
}
|
||||
ctx.SortLabelsIfNeeded()
|
||||
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -63,6 +63,7 @@ func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
|
||||
// Skip metric without labels.
|
||||
continue
|
||||
}
|
||||
ctx.SortLabelsIfNeeded()
|
||||
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -60,6 +60,7 @@ func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
|
||||
// Skip metric without labels.
|
||||
continue
|
||||
}
|
||||
ctx.SortLabelsIfNeeded()
|
||||
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user