Compare commits

...

599 Commits

Author SHA1 Message Date
Aliaksandr Valialkin
61cc13c16f docs/CHANGELOG.md: cut v1.63.0 2021-07-15 14:02:13 +03:00
Aliaksandr Valialkin
b060d8bf53 vendor: make vendor-update 2021-07-15 12:55:40 +03:00
Aliaksandr Valialkin
d472b03e34 lib/storage: make sure the second call to DeduplicateSamples and deduplicateSamplesDuringMerge doesnt change samples 2021-07-15 12:17:45 +03:00
Aliaksandr Valialkin
682662b2ae lib/storage: remove cache directory if it contains reset_cache_on_startup file
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1447
2021-07-13 17:58:51 +03:00
Aliaksandr Valialkin
2b0e3efa5c .github/workflows/wiki.yml: properly copy subdirectories 2021-07-13 17:35:02 +03:00
Aliaksandr Valialkin
8d99e94a52 docs: clarify why spare CPU and RAM resources are needed in capacity planning 2021-07-13 15:48:25 +03:00
Aliaksandr Valialkin
8d0ec47be9 docs/Cluster-VictoriaMetrics.md: clarify the docs about the needed values for -dedup.minScrapeInterval at vmselect during replication when the data is pushed from HA pair 2021-07-13 15:43:02 +03:00
Aliaksandr Valialkin
2df66dad7b lib/httpserver: add is_set label to flag metrics
This label allows determining the set flags with the query `flag{is_set="true"}`
2021-07-13 15:10:13 +03:00
Aliaksandr Valialkin
5bc240bffe vendor: make vendor-update 2021-07-13 14:30:19 +03:00
Aliaksandr Valialkin
244d0fe5d7 deployment/docker: update Go builder from v1.16.5 to v1.16.6
Ths Go release has the following bugfixes: https://github.com/golang/go/issues?q=milestone%3AGo1.16.6+label%3ACherryPickApproved
2021-07-13 14:25:41 +03:00
Aliaksandr Valialkin
a925d5a3e1 app/vmselect/promql: duration handling improvements in MetricsQL queries
- Support durations anywhere in MetricsQL queries. E.g. sum_over_time(m[1h])/1h is equivalent to sum_over_time(m[1h])/3600
- Support durations without suffix. E.g. rate(m[300]) is equivalent to rate(m[5m])
2021-07-12 17:16:41 +03:00
Aliaksandr Valialkin
f9de546139 lib/storage: reset perKeyMisses stats less frequently
This should reduce CPU usage for queries executed with intervals higher than 30 seconds
2021-07-12 14:33:42 +03:00
Aliaksandr Valialkin
4f80b2f230 lib/storage: properly limit the size of storage/date_metricID cache 2021-07-12 14:25:44 +03:00
Aliaksandr Valialkin
f3a5465ece docs/CHANGELOG.md: document the change from bfba4c28a4
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1444
2021-07-12 12:44:06 +03:00
Aliaksandr Valialkin
bfba4c28a4 app/vmalert: accept Prometheus-like durations in interval config option inside group section 2021-07-12 12:35:17 +03:00
Aliaksandr Valialkin
a684408e27 docs: update http://slack.victoriametrics.com to https://slack.victoriametrics.com 2021-07-12 10:57:16 +03:00
Aliaksandr Valialkin
8ca2799478 lib/storage: properly determine when the deduplication is needed in needsDedup
Previously needsDedup() could return true if the de-duplication wasn't needed for the following case:

         d < interval
           /     \
   |        v | v        |
     interval   interval

Now it properly returns false for this case
2021-07-12 10:53:30 +03:00
Aliaksandr Valialkin
f539772ca6 docs: sync with the cluster branch 2021-07-10 12:45:38 +03:00
Aliaksandr Valialkin
8c764e88f0 app/vmui: move source code from https://github.com/VictoriaMetrics/vmui to app/vmui
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1413
2021-07-09 17:15:23 +03:00
Aliaksandr Valialkin
2be340a4c9 docs: clarify what does "workload" mean in capacity planning docs 2021-07-09 12:49:53 +03:00
Aliaksandr Valialkin
c5f0b454f0 app/vmselect: follow-up after aa11ef6d3b
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1413
2021-07-07 17:43:35 +03:00
tony
e9e35a7d6a add vmui for vmselect component (#1431)
Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2021-07-07 17:33:02 +03:00
Aliaksandr Valialkin
92b8380fdf docs/Cluster-VictoriaMetrics.md: improve capacity planning recommendations 2021-07-07 16:22:51 +03:00
Aliaksandr Valialkin
1152de4321 vendor: make vendor-update 2021-07-07 16:05:04 +03:00
Aliaksandr Valialkin
7e77980608 vendor: update github.com/VictoriaMetrics/metrics from v1.17.2 to v1.17.3 2021-07-07 16:00:25 +03:00
Aliaksandr Valialkin
6e0553c92e lib/mergeset: cache indexBlock items only on the second request
This should reduce the indexdb/indexBlocks cache size, since it won't contain one-time-wonders items.
2021-07-07 15:23:06 +03:00
Aliaksandr Valialkin
ed944313b0 app/{vminsert,vmselect}: export vminsert_request_duration_seconds and vmselect_request_duration_seconds histograms 2021-07-07 13:25:21 +03:00
Aliaksandr Valialkin
766edbc421 lib/httpserver: print full requestURI in httpserver.Errorf
This should simplify debugging.
2021-07-07 13:09:40 +03:00
Aliaksandr Valialkin
c55a64ba08 docs: clarify capacity planning docs 2021-07-07 12:46:55 +03:00
Aliaksandr Valialkin
e843bd7bd7 lib/storage: do not cache inmemoryBlock entries requested only once (aka one-time-wonder items)
This should reduce the cache size and memory usage for the indexdb/dataBlocks cache
2021-07-07 10:58:51 +03:00
Aliaksandr Valialkin
8b262d4ba7 lib/storage: periodically reset prefetchedMetricIDs cache in order to limit its size under high churn rate 2021-07-07 10:58:51 +03:00
Roman Khavronenko
2f54559c89 alerts: sync alert expression for DiskRunsOutOfSpaceIn3Days with dashboard (#1436) 2021-07-07 10:31:09 +03:00
Aliaksandr Valialkin
a7694092b8 Revert "lib/uint64set: allow reusing bucket16 structs inside uint64set.Set via uint64set.Release method"
This reverts commit 7c6d3981bf.

Reason for revert: high contention at bucket16Pool on systems with big number of CPU cores.
This slows down query processing significantly.
2021-07-06 18:21:35 +03:00
Aliaksandr Valialkin
8aa9bba9bd lib/{mergeset,storage}: switch from sync.Pool to chan-based pool for inmemoryPart objects
This should reduce memory usage on systems with big number of CPU cores,
since every inmemoryPart object occupies at least 64KB of memory and sync.Pool maintains
a separate pool inmemoryPart objects per each CPU core.

Though the new scheme for the pool worsens per-cpu cache locality, this should be amortized
by big sizes of inmemoryPart objects.
2021-07-06 16:28:41 +03:00
Aliaksandr Valialkin
7c6d3981bf lib/uint64set: allow reusing bucket16 structs inside uint64set.Set via uint64set.Release method
This reduces the load on memory allocator in Go runtime in production workload.
2021-07-06 15:35:03 +03:00
Aliaksandr Valialkin
78c9174682 lib/mergeset: increase pool capacity for inmemoryBlock according to collected profiles from production workload
CPU and memory profiles show that the pool capacity for inmemoryBlock objects is too small.
This results in the increased load on memory allocation code in Go runtime.
Increase the pool capacity in order to reduce the load on Go runtime.
2021-07-06 13:41:34 +03:00
Aliaksandr Valialkin
f71e4d1853 lib/mergeset: limit the frequency for flushCallback calls to once per 10 seconds
This should improve hit ratio for tagFiltersCache when big number of new time series are constantly registered
(aka high churn rate). This, in turn, should reduce CPU usage for queries over such time series.
2021-07-06 12:17:17 +03:00
Aliaksandr Valialkin
f3acf065c9 lib/storage: consistency renaming: tagCache -> tagFiltersCache
This improves code readability
2021-07-06 11:03:51 +03:00
Aliaksandr Valialkin
0020b9f904 lib/workingsetcache: properly update stats for requests and cache misses
Previously the stats for cache misses could be improperly counted, because it had inflated cache misses
if the entry was missing in the curr cache, but was existing in the prev cache.

The same applies to cache requests - they were inflated if the entry was missing in the curr cache.
2021-07-06 10:53:32 +03:00
Roman Khavronenko
75f12bfe78 add option to add Copy button for code snippets (#1433)
To add a Copy button wrap code snippet with the following element:
```
<div class="with-copy" markdown="1">

<your-code-snippet>

</div>
```

See the changes to `Kubernetes monitoring with VictoriaMetrics Single` for details.
2021-07-06 08:23:39 +03:00
Aliaksandr Valialkin
4cf47163c1 lib/workingsetcache: fix cache capacity calculations after 4f0003f182 2021-07-05 17:11:57 +03:00
Aliaksandr Valialkin
4f0003f182 lib/workingsetcache: typo fixes after d0c830039d 2021-07-05 15:35:37 +03:00
Aliaksandr Valialkin
d0c830039d lib/storage: tune cache sizes according to production workload 2021-07-05 15:16:11 +03:00
Aliaksandr Valialkin
8f973e34fb lib/workingsetcache: properly switch to whole mode
Previously the switch from `split` to `whole` mode had been performed too early,
e.g. when the current cache size became bigger than 1/4 of the allowed cache size.

Now it is performed when the current cache size becomes bigger than 1/2 of the allowed cache size.

This change can reduce memory usage for data ingestion path when big number of active time series are ingested.
2021-07-05 15:16:11 +03:00
Aliaksandr Valialkin
43103be011 lib/{storage,mergeset}: increase cache timeout for data and index blocks from a minute to two minutes
One minute cache timeout result in slower queries in some production workloads where the interval
between query execution is in the range 1 minute - 2 minutes.
2021-07-05 15:16:11 +03:00
Aliaksandr Valialkin
54b9e1d3cb lib/cgroup: set GOGC to 50 by default if it isn't set
This should reduce memory usage for typical VictoriaMetrics workloads by up to 50%
2021-07-05 15:16:11 +03:00
Roman Khavronenko
bd6b8f7e31 move github-pages docs to the main repo (#1432)
* move github-pages docs to the main repo

* rm github actions for copying docs to VictoriaMetrics/VictoriaMetrics.github.io
2021-07-05 14:34:10 +03:00
Aliaksandr Valialkin
888d62e40c docs/CHANGELOG.md: document the bugfix for vm_merge_need_free_disk_space metric at 9a83e9018d 2021-07-05 12:01:08 +03:00
Aliaksandr Valialkin
6bef227388 docs/Articles.md: add an url to https://medium.com/ibm-garage/monitoring-of-multiple-openshift-clusters-with-victoriametrics-d4f0979e2544 2021-07-05 11:51:37 +03:00
Aliaksandr Valialkin
9a83e9018d lib/storage: properly detect free disk space shortage during data merge
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1373
2021-07-02 17:40:54 +03:00
Aliaksandr Valialkin
f6bb130898 app/{vmselect,vmstorage}: clarify the description for -dedup.minScrapeInterval command-line flag
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1426
2021-07-02 15:05:57 +03:00
Aliaksandr Valialkin
7088f17494 lib/promscrape/discovery/consul: use case-insensitive comparison for service names
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1424
2021-07-02 14:49:27 +03:00
Aliaksandr Valialkin
6e406083f2 lib/promauth: cache the client TLS certificate for up to a second
This should reduce CPU usage when TLS connections are established at a high rate.
2021-07-02 13:21:51 +03:00
Aliaksandr Valialkin
a93da746c0 vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.15 to v1.0.16
This should help with proper copying of tls.Config struct after it has been set up in lib/promauth.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1420
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/470
2021-07-02 12:03:32 +03:00
Aliaksandr Valialkin
1e52f11e87 docs/Cluster-VictoriaMetrics.md: typo fix: siplify -> simplify 2021-07-02 10:50:55 +03:00
Aliaksandr Valialkin
5beb099ff9 docs/Cluster-VictoriaMetrics.md: add a chapter describing a toy cluster setup on a single host
While at it, refer to available tools, which can simplify cluster setup
2021-07-02 10:48:56 +03:00
Aliaksandr Valialkin
158c50c0ee lib/promauth: reload TLS certificates from disk on every mTLS connection as Prometheus does
This allows updating client certificates without the need to restart vmagent and/or single-node VictoriaMetrics.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1420
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/470
2021-07-01 15:27:47 +03:00
Aliaksandr Valialkin
02ed4b8b46 docs/CHANGELOG.md: document ae485c2bfd 2021-07-01 11:51:19 +03:00
Aliaksandr Valialkin
c25b839078 lib/workingsetcache: reset the cache mode when the cache is reset
This should reduce memory usage if the working set is reduced after the cache reset.
2021-07-01 11:50:11 +03:00
Nikolay
ae485c2bfd fixes /targets button style (#1423)
* fixes /targets button style
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1422

* updates boostrap version
2021-07-01 11:48:07 +03:00
Aliaksandr Valialkin
00bbe1608b deployment/docker: upgrade alpine image from v3.13.5 to v3.14.0 2021-07-01 10:56:56 +03:00
Roman Khavronenko
a38a6fe8ad dashboard: move panel Disk writes/reads to Resource usage row (#1417)
* dashboard: move panel `Disk writes/reads` to `Resource usage` row

* dashboard: make Stats panel consistent with Cluster dashboard
2021-07-01 05:46:26 +03:00
Aliaksandr Valialkin
c93cee8de8 lib/{mergeset,storage}: reduce the maximum lifetime for cached indexdb and data blocks from 2 minutes to a minute
This should reduce memory usage on a system with high number of active time series and a high churn rate.
One minute is enough for caching the blocks needed for repeated queries (e.g. alerting rules, recording rules and dashboard refreshes).
2021-06-29 19:57:07 +03:00
Aliaksandr Valialkin
fc12484734 lib/mergeset: switch from sync.Pool to a channel for a pool for inmemoryBlock structs
This should reduce memory usage for the pool on systems with big number of CPU cores.

The sync.Pool maintains per-CPU pools, so the total number of objects in the pool
is proportional to the number of available CPU cores. The channel limits the number
of pooled objects by its own capacity. This means smaller number of pooled objects on average.
2021-06-29 19:56:59 +03:00
Aliaksandr Valialkin
9ce211a514 lib/promscrape/discovery/docker: fix golint warning: struct field Id should be ID 2021-06-29 13:12:28 +03:00
Aliaksandr Valialkin
5506cff76e lib/storage: put indexDBName into the key for dateTagFilter cache and for uselessTagFilters cache
This should prevent from stats overwriting when the previous indexdb is queried.
2021-06-29 12:40:05 +03:00
Aliaksandr Valialkin
1b0501a09e lib/promscrape: typo fix in /targets output
The typo has been introduced in fb72a2133f

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1408
2021-06-28 21:26:37 +03:00
Aliaksandr Valialkin
3af2162085 docs/vmagent.md: mention about docker_sd_config support 2021-06-25 20:52:15 +03:00
Aliaksandr Valialkin
008033a374 docs/CHANGELOG.md: cut v1.62.0 2021-06-25 13:29:38 +03:00
Aliaksandr Valialkin
cb5453953f lib/promscrape: split docker and dockerswarm service discovery code bases, since they have very little in common
This is a follow up after c85a5b7fcb
2021-06-25 13:20:20 +03:00
Aliaksandr Valialkin
a69045e440 lib/promscrape: consistently sort service discovery routines
This should simplify further maintenance of the code
2021-06-25 12:10:46 +03:00
Lu Jiajing
c85a5b7fcb Support Docker ServiceDiscovery (#1402)
* add docker discovery

* add test

* add labels test and add scrape work

* remove TODO

* refactor to merge apiConfig and sdConfig

* apply suggestion
2021-06-25 11:42:47 +03:00
Nikolay
434e33da9b adds missing MustStop call to do and http sd (#1404) 2021-06-25 11:39:18 +03:00
Aliaksandr Valialkin
b50e2ec88c vendor: make vendor-update 2021-06-24 17:33:42 +03:00
Aliaksandr Valialkin
4345c07777 docs: consistently put the link to articles and slides about VictoriaMetrics after the links to case studies 2021-06-24 15:37:38 +03:00
Aliaksandr Valialkin
b2fca1ab22 docs/CaseStudies.md: add a case study for DFKI 2021-06-24 15:24:39 +03:00
Aliaksandr Valialkin
906fca9e88 docs/CaseStudies.md: add Groove X case study 2021-06-24 15:04:49 +03:00
Aliaksandr Valialkin
3c3694f72a README.md: add missing linke to Sensedia case study 2021-06-24 14:37:49 +03:00
Aliaksandr Valialkin
d25a161579 docs/CaseStudies.md: add Sensedia case study 2021-06-24 14:36:13 +03:00
Aliaksandr Valialkin
ca42410afd docs/CHANGELOG.md: document the bugfix in increase_pure() function from the commit fb4f758715 2021-06-24 12:05:39 +03:00
Aliaksandr Valialkin
5d64ed73c5 lib/protoparser/clusternative: do not pool unmarshalWork structs, since they can occupy big amounts of memory (more than 100MB per each struct)
This should reduce memory usage for vmstorage under high ingestion rate when the vmstorage runs on a system with big number of CPU cores
2021-06-23 15:46:50 +03:00
Aliaksandr Valialkin
cdfae0117a app/vmselect/promql: return the last timestamp for the max / min value from tmax_over_time() and tmin_over_time() function as most users expect 2021-06-23 14:19:00 +03:00
Aliaksandr Valialkin
70e2852376 docs/CHANGELOG.md: document the bugfix for incorrect stats collection for concurrently executed tag filter
Follow up for c22114c6f0
2021-06-23 14:05:28 +03:00
Aliaksandr Valialkin
94f3e40ab3 app/vminsert/netstorage: sort the -storageNode list passed to vminsert nodes
This should reduce resource usage (CPU, RAM, disk IO) at vmstorage nodes
if the addresses of vmstorage nodes are passed in random order to vminsert nodes.
2021-06-23 14:01:41 +03:00
Aliaksandr Valialkin
c22114c6f0 lib/storage: tune tag filters search logic
Tune the logic according to the logs provided at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338#issuecomment-864293624

The previous logic had a race when multiple concurrent queries execute the same tag filter without prior stats.
This could result in incorrectly stored stats for such tag filter, which then could result in non-optimal sorting of tag filters
for further queries.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338
2021-06-23 13:29:39 +03:00
Aliaksandr Valialkin
e8a5bb92b7 lib/promscrape/discovery/consul: properly pass namespace to Consul watcher
Follow-up for 58a2989fe7
2021-06-22 17:42:41 +03:00
Aliaksandr Valialkin
ac54f34f9e lib/promscrape/discovery/http: follow up after e307bbb29a 2021-06-22 13:40:33 +03:00
Aliaksandr Valialkin
755040a171 lib/promscrape/discovery: support generic auth configs in Consul service discovery in the same way as Prometheus 2.28 does 2021-06-22 13:34:02 +03:00
Aliaksandr Valialkin
59e7755df9 docs/CHANGELOG.md: document the support for Consul namepsace
See 58a2989fe7
2021-06-22 13:34:02 +03:00
Nikolay
e307bbb29a adds http_sd (#1399)
* adds http_sd

* adds X-Prometheus-Refresh-Interval-Seconds header

* Update lib/promscrape/discovery/http/api.go

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2021-06-22 13:33:37 +03:00
Nikolay
58a2989fe7 adds consul enterprise namespace support (#1400)
* adds consul enterprise namespace support

* Update lib/promscrape/discovery/consul/consul.go

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2021-06-22 12:49:44 +03:00
Aliaksandr Valialkin
3bc83d3b17 docs/PerTenantStatistic.md: document that the per-tenant statistic is a part of cluster version of VictoriaMetrics 2021-06-22 12:43:01 +03:00
Roman Khavronenko
da8c901fab vmctl: add more context to flags description in vm-native mode (#1395) 2021-06-18 19:20:01 +03:00
Aliaksandr Valialkin
a3262daac0 docs/CHANGELOG.md: typo fixes 2021-06-18 19:14:53 +03:00
Aliaksandr Valialkin
83a4db813e app/vmselect: log slow requests to all the /api/v1/* handlers if their execution time exceeds -search.logSlowQueryDuration 2021-06-18 19:04:42 +03:00
Aliaksandr Valialkin
4cfedc5931 docs/Single-server-VictoriaMetrics.md: mention that it is recommended to use a single scrape_interval across all the scrape targets 2021-06-18 15:39:33 +03:00
Aliaksandr Valialkin
570f36b344 app/vmctl: limit JSON line size by 10K samples (#1394)
This should reduce the maximum memory usage at VictoriaMetrics when importing time series with big number of samples.
2021-06-18 15:26:47 +03:00
Aliaksandr Valialkin
eb1af09a04 docs/Cluster-VictoriaMetrics.md: clarify docs about VictoriaMetrics cluster architecture 2021-06-18 14:36:39 +03:00
Aliaksandr Valialkin
cbd9159a22 docs/CHANGELOG.md: document the reduced disk write IO usage
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338
2021-06-18 14:02:59 +03:00
Aliaksandr Valialkin
fb72a2133f lib/promscrape: show jobs with empty scrape targets on /targets page 2021-06-18 10:53:52 +03:00
Aliaksandr Valialkin
d8ab409418 docs/{vmgateway,vmbackupmanager}: explicitly mention that these components are a part of an enterprise package 2021-06-17 17:19:49 +03:00
Nikolay
6c434b260e fixes DO service discovery labels (#1389)
adds test for digitalocean sd
2021-06-17 15:12:20 +03:00
Aliaksandr Valialkin
dcbc22552f lib/storage: fix infinite loop introduced in aa9b56a046 2021-06-17 14:28:10 +03:00
Aliaksandr Valialkin
aa9b56a046 lib/{mergeset,storage}: reduce the number of fsync calls on data ingestion path on systems with many cpu cores
VictoriaMetrics maintains a buffer per CPU core for the ingested data. These buffers are flushed to disk every second.
These buffers are flushed to disk in parallel starting from the commit 56b6b893ce .
This resulted in increased write disk IO usage on systems with many cpu cores
as described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338#issuecomment-863046999 .

This commit merges the per-CPU buffers into bigger in-memory buffers before flushing them to disk.
This should reduce the rate of fsync syscalls and, consequently, the write disk IO on systems with many CPU cores.

This should help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338
See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1244
2021-06-17 13:52:08 +03:00
Aliaksandr Valialkin
12a83d25bf app/vmagent/remotewrite: go fmt after 0a796f7c3a 2021-06-17 13:52:06 +03:00
Aliaksandr Valialkin
9eb3fc346f docs/vmagent.md: sync with app/vmagent/README.md via make docs-sync 2021-06-16 12:36:49 +03:00
Aliaksandr Valialkin
6d17a4e12d docs/CHANGELOG.md: document the changed -remoteWrite.queues value
This is a follow-up for 0a796f7c3a

See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1385
2021-06-16 12:35:46 +03:00
Zongyang
0a796f7c3a Change default value of '-remoteWrite.queues' to cgroup.AvailableCPUs * 2 (#1385)
* Change default value of '-remoteWrite.queues' to cgroup.AvailableCPUS() * 2 to reduce scrape interval

Default value of vmagent option '-remotewrite.queues' is 4 and default
size of vmagent ScheudleUnmarshalWorkers is number of CPUs, when available
CPUs is much greater than 4, e.g 32, worker are competing push queues
which will increase scrape interval and may cause scrape timeout.

* Update README and flag description

Co-authored-by: xiaozy <xiaozy01@fenbi.com>
2021-06-16 12:16:44 +03:00
Roman Khavronenko
fb4f758715 promql: fix increase_pure calculation for cases with stale series (#1381)
Due to staleness handling, increase_pure were using incorrect previous value
during calculation in cases where series disappears for period longer
than staleness period and then returns back. The fix suppose to account
for a real datapoint value before staleness takes place. The fix should
remove unexpected spikes while using `increase_pure` for staled series.
2021-06-15 17:37:19 +03:00
Aliaksandr Valialkin
6a8369f0fc docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics works great with APM workloads (aka Application Performance Monitoring) 2021-06-15 17:32:52 +03:00
Aliaksandr Valialkin
84fb59b0ba lib/storage: move deletedMetricIDs set from indexDB to Storage
This makes consitent the list of deleted metricIDs when it is used from both the current indexDB and the previous indexDB (aka extDB).
This should fix the issue, which could lead to storing new samples under deleted metricIDs after indexDB rotation.
See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1347#issuecomment-861232136 .

Thanks to @tangqipengleoo for the initial analysis and the pull request - https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1383 .

This commit resolves the issue in more generic way compared to https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1383 .

The downside of the commit is the deletedMetricIDs set isn't cleaned from the metricIDs outside the retention. It needs app restart.
This should be OK in most cases.
2021-06-15 15:04:30 +03:00
Aliaksandr Valialkin
e028ad241a lib/protoparser: stop reading the input stream as soon as the callback provided by the caller returns error
This is a follow-up for af90c3c43b
2021-06-14 15:18:49 +03:00
faceair
af90c3c43b lib/protoparser: stop read when callback error (#1380) 2021-06-14 15:10:58 +03:00
Aliaksandr Valialkin
36d55bff66 lib/promscrape: show the number of samples collected during the last scrape at /targets and /api/v1/targets pages
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1377
2021-06-14 14:04:00 +03:00
Aliaksandr Valialkin
7b283ee91c vendor: update github.com/klauspost/compress from v1.13.0 to v1.13.1 2021-06-14 13:42:46 +03:00
Roman Khavronenko
a90012ef26 dashboard: bump version requirements (#1378) 2021-06-14 13:31:59 +03:00
Aliaksandr Valialkin
05bc9667c1 docs/CHANGELOG.md: document the addition of DigitalOcean service discovery
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367
2021-06-14 13:18:19 +03:00
Nikolay
729c4eeb9c adds digital ocean sd (#1376)
* adds digital ocean sd config

* adds digital ocean sd
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367

* typo fix
2021-06-14 13:15:04 +03:00
Roman Khavronenko
b8526e88d3 Dashboard single (#1374)
* dashboard: update single version dash

The update contains the following changes:
* display anonymous memory usage metric. This metric suppose to reflect
memory usage of the process which can't be freed by OS;
* add legends to all panels. This is important for cases when users share
the screenshots;
* modify panels for Grafana v8.0.0

* dashboard: update single version dash tags

* dashboard: update vmagent dash

The update contains the following changes:
* display anonymous memory usage metric. This metric suppose to reflect
memory usage of the process which can't be freed by OS;
* add legends to all panels. This is important for cases when users share
the screenshots;
* modify panels for Grafana v8.0.0
2021-06-14 13:03:23 +03:00
Aliaksandr Valialkin
06b8e7d148 lib/promscrape: increase the duration for reading the full response in stream parsing mode
Increase the duration from 10x to 30x of the configured `scrape_interval'.

This should help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1365
2021-06-14 12:28:09 +03:00
Aliaksandr Valialkin
48210130ac lib/protoparser: measure the duration for reading the whole block of data instead of a single read operation
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1365
2021-06-14 12:25:52 +03:00
Aliaksandr Valialkin
3e5b6bae66 docs/Cluster-VictoriaMetrics.md: add lists for command-line flags for cluster components 2021-06-14 12:22:05 +03:00
Aliaksandr Valialkin
3c4366806c lib/protoparser/common: log the duration for reading a block of data in ReadLinesBlockExt on error
This may help debugging issues like https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1365
2021-06-14 12:22:04 +03:00
Aliaksandr Valialkin
8a519f1518 docs/vmalert.md: follow-up after 6d5a8c28cd 2021-06-14 11:37:26 +03:00
Roman Khavronenko
6d5a8c28cd Vmalert docs (#1372)
* vmalert: mention what happens if `for` is set to 0 or omitted

* vmalert: add more context to docs
2021-06-11 13:25:53 +03:00
Aliaksandr Valialkin
2ef2e3d6f8 docs/CHANGELOG.md: cut v1.61.1 2021-06-11 13:01:31 +03:00
Aliaksandr Valialkin
ed83558646 app/vmauth: properly handle http.ErrAbortHandler panic
This panic can be raised by the reverseProxy on aborted request to the backend.
So handle it (e.g. suppress) at reverseProxy.ServeHTTP call.

Do not suppress the panic at lib/httpserver generic HTTP handler,
since it may result in an inconsistent state left after the panicking handler.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1353
2021-06-11 12:50:25 +03:00
Aliaksandr Valialkin
c4f3fbfa5d lib/storage: reset cache on disk during series deletion and during indexdb rotation
This should prevent from inconsistent behavior (aka partially missing data for some time series) after unclean shutdown.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1347
2021-06-11 12:42:28 +03:00
Aliaksandr Valialkin
d979e14da2 docs/CHANGELOG.md: document the bugfix from 7adfe878e1 2021-06-11 11:28:10 +03:00
Roman Khavronenko
7adfe878e1 vmalert: fix mistake with object reuse while parsing response (#1370)
* vmalert: fix mistake with object reuse while parsing response

During the refactoring, the wrong optimisations was applied in
parse function which caused metric fields reset. The change removes
optimisation.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1369

* vmalert: add test to cover multiple metrics in one response
2021-06-11 11:22:05 +03:00
Aliaksandr Valialkin
69b1482bdb lib/storage: consistency renaming: getMaxRawRowsPerPartition -> getMaxRawRowsPerShard 2021-06-11 10:57:23 +03:00
Aliaksandr Valialkin
044ab46824 lib/storage: reduce the amounts of memory which can be occupied by rawRow items during data ingestion on a system with many CPU cores 2021-06-11 10:57:23 +03:00
John Belmonte
67b17cdd68 spelling fix: synonym (#1363) 2021-06-10 08:32:52 +03:00
Aliaksandr Valialkin
b4008c1e65 vendor: make vendor-update 2021-06-09 20:40:50 +03:00
Aliaksandr Valialkin
b83a51366e docs/CHANGELOG.md: cut v1.61.0 2021-06-09 19:04:44 +03:00
Aliaksandr Valialkin
10d105ee25 docs/FAQ.md: add a chapter comparing VictoriaMetrics to QuestDB 2021-06-09 19:03:23 +03:00
Aliaksandr Valialkin
d0dca62026 app/vmselect/promql: typo fix in the comment 2021-06-09 18:34:31 +03:00
Aliaksandr Valialkin
46d2c7d640 docs/Articles.md: update the broken link to https://nordicapis.com/api-monitoring-with-prometheus-grafana-alertmanager-and-victoriametrics/ 2021-06-09 16:39:56 +03:00
Aliaksandr Valialkin
2422b5091f app/vmauth: improve readability for a config with multiple src_paths 2021-06-09 15:39:32 +03:00
Aliaksandr Valialkin
329b6cd146 docs/CHANGELOG.md: document the enterprise bugfix for the target property in Graphite Render API 2021-06-09 13:50:52 +03:00
Aliaksandr Valialkin
db1c548cb4 docs/CHANGELOG.md: document improvements in re-routing handling in vminsert
See the following commits:

* 1c09e71f5b
* 0d067eb112
* 2c6b917749
2021-06-09 13:40:37 +03:00
Aliaksandr Valialkin
f93a31b490 docs/vmagent.md: mention that vmagent supports scrape targets sharding 2021-06-09 12:29:34 +03:00
Aliaksandr Valialkin
ab15bf8c90 docs: document rules replay feature for vmalert
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/836

This is a follow-up for 2a259ef5e7
2021-06-09 12:27:34 +03:00
Roman Khavronenko
2a259ef5e7 vmalert: support rules backfilling (aka replay) (#1358)
* vmalert: support rules backfilling (aka `replay`)

vmalert can `replay` configured rules in the past
and backfill results via remote write protocol.
It supports MetricsQL/PromQL storage as data source,
and can backfill data to remote write compatible
storage.

Supports recording and alerting rules `replay`. See more
details in README.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/836

* vmalert: review fixes

* vmalert: readme fixes
2021-06-09 12:20:38 +03:00
dependabot[bot]
408fc90b40 build(deps): bump codecov/codecov-action from 1.5.0 to 1.5.2 (#1362)
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 1.5.0 to 1.5.2.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/master/CHANGELOG.md)
- [Commits](https://github.com/codecov/codecov-action/compare/v1.5.0...v1.5.2)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2021-06-09 12:16:46 +03:00
Roman Khavronenko
5e9f3777bf alerts: add new alert LabelsLimitExceededOnIngestion (#1359) 2021-06-09 12:15:36 +03:00
Aliaksandr Valialkin
c16edf8287 docs/CHANGELOG.md: document the bugfix, which prevents panics for aborted http requests in vmauth
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1353

This is a follow-up for 6b29b955c0
2021-06-09 12:11:53 +03:00
Nikolay
6b29b955c0 disables panic for net/httpAbortHandler (#1355) 2021-06-09 12:08:58 +03:00
Aliaksandr Valialkin
28c44ef065 deployment/docker/docker-compose.yml: update Grafana from v7.5.2 to v8.0.0
See https://github.com/grafana/grafana/releases/tag/v8.0.0
2021-06-09 02:25:24 +03:00
k1rk
668165f53d rename serviceHealth group name to vm-health (#1360)
this causes conflicts in `victoria-metrics-k8s-stack` chart =)
2021-06-08 23:34:38 +03:00
Aliaksandr Valialkin
c0feafefc8 vendor: make vendor-update 2021-06-08 15:49:30 +03:00
Aliaksandr Valialkin
8a7e6ad5cc deployment/docker: update Go builder from v1.16.4 to v1.16.5
See the fixed isses at https://github.com/golang/go/issues?q=milestone%3AGo1.16.5+label%3ACherryPickApproved
2021-06-08 15:45:44 +03:00
Aliaksandr Valialkin
645e18dd88 vendor: update github.com/klauspost/compress from v1.12.3 to v1.13.0 2021-06-08 15:45:43 +03:00
Aliaksandr Valialkin
96b691a0ab lib/storage: properly account the number of loops spent when matching for or suffixes
This may help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338
2021-06-08 13:06:12 +03:00
Aliaksandr Valialkin
661d2668f8 lib/promrelabel: add tests for labelsToString() function 2021-06-04 20:42:46 +03:00
Aliaksandr Valialkin
78f83dc5ad app/{vmagent,vminsert}: follow-up after 2fe045e2a4
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1343
2021-06-04 20:27:58 +03:00
jelmd
2fe045e2a4 new feature: debug relabeling (#1344)
* new feature: relabel logging

Use scrape_configs[x].relabel_debug = true to log metric names inkl.
labels before and after relabeling. After relabeling related metrics
get dropped, i.e. not submitted to servers.

* vminsert wants relabel logging, too.
2021-06-04 17:50:23 +03:00
Aliaksandr Valialkin
5ac25d2585 docs/CHANGELOG.md: document the bugfix from 6f19bb23a1 2021-06-04 11:53:00 +03:00
Hason Chan
6f19bb23a1 fix eureka_sd_configs HTTPClientConfig incorrect parsing (#1350) 2021-06-04 11:47:17 +03:00
Aliaksandr Valialkin
f963f04d3d app/vminsert: add -disableRerouting command-line flag for disabling re-routing if some vmstorage nodes have lower performance than the others 2021-06-04 04:42:01 +03:00
Aliaksandr Valialkin
2d8bd41f8a lib/storage: reduce memory allocations when syncing dateMetricIDCache 2021-06-03 16:20:42 +03:00
Aliaksandr Valialkin
d2d746c4fc docs/CHANGELOG.md: document that it is possible to build VictoriaMetrics components for Solaris
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1322

This is a follow-up for ddc8022702
2021-05-31 09:33:29 +03:00
Nikolay
ddc8022702 fixes solaris build (#1345) 2021-05-31 09:21:23 +03:00
Aliaksandr Valialkin
de8e4b6223 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.8 to v1.6.0 2021-05-31 09:14:25 +03:00
Aliaksandr Valialkin
b22e380a34 app/vmauth: allow balancing the load among multiple backend nodes by specifying multiple urls in url_prefix config 2021-05-29 01:03:37 +03:00
Aliaksandr Valialkin
ffa4cd6fa5 docs/Articles.md: add a link to https://www.percona.com/blog/2021/05/26/compiling-a-percona-monitoring-and-management-v2-client-in-arm-raspberry-pi-3/ 2021-05-28 14:34:31 +03:00
Aliaksandr Valialkin
9315e3ded3 vendor: make vendor-update 2021-05-28 13:18:37 +03:00
Aliaksandr Valialkin
cecbdee00d docs/MetricsQL.md: add a link to technical details about rate() and increase() calculations in Prometheus and VictoriaMetrics 2021-05-28 13:13:50 +03:00
Aliaksandr Valialkin
634eeac804 docs/Single-server-VictoriaMetrics.md: remove misleading wording about querying Graphite metrics with MetricsQL 2021-05-28 02:39:14 +03:00
Aliaksandr Valialkin
a52a20659a lib/promscrape: fix tests after f0c21b6300 2021-05-28 01:32:50 +03:00
Aliaksandr Valialkin
d088923aef Revert "lib/mergeset: remove a pool for inmemoryBlock structs"
This reverts commit 793fe39921.

Reason to revert: production testing revealed possible slowdown when registering big number of new time series
2021-05-28 01:09:32 +03:00
Aliaksandr Valialkin
793fe39921 lib/mergeset: remove a pool for inmemoryBlock structs
The pool for inmemoryBlock struct doesn't give any performance gains in production workloads,
while it may result in excess memory usage for inmemoryBlock structs inside the pool during
background merge of indexdb.
2021-05-27 21:57:33 +03:00
Aliaksandr Valialkin
60341722d5 docs: document f0c21b6300 2021-05-27 15:03:30 +03:00
faceair
f0c21b6300 lib/promscrape: apply body size & sample limit to stream parse (#1331)
* lib/promscrape: apply body size limit to stream parse

Signed-off-by: faceair <git@faceair.me>

* lib/promscrape: apply sample limit to stream parse

Signed-off-by: faceair <git@faceair.me>
2021-05-27 14:52:44 +03:00
Aliaksandr Valialkin
eda6ee40b6 docs: make docs-sync after 2bbb1cc7c1 2021-05-26 12:32:16 +03:00
Roman Khavronenko
2bbb1cc7c1 Docs review (#1330)
* re-order components by prioritizing Cluster-VictoriaMetrics.md

* drop Home.md since it just duplicates other links
2021-05-26 12:28:58 +03:00
Roman Khavronenko
7ecaa2fe2c update the issue template (#1329)
The main changes are:
* ask for Grafana's dashboard screenshots;
* ask only for non-default cmd-line flags;
* explicitly ask about logs;
2021-05-26 12:26:45 +03:00
Aliaksandr Valialkin
7f531e3a60 docs/CHANGELOG.md: document changes from 2233d6ed8a and d210958fd0 2021-05-26 12:23:22 +03:00
Roman Khavronenko
d210958fd0 vmalert: automatically reload configuration on file change (#1326)
New flag `-rule.configCheckInterval` defines how often `vmalert` will re-read
config file. If it detects any changes, the config will be reloaded.
This behaviour is turned off by default.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/512
2021-05-25 14:27:22 +01:00
Aliaksandr Valialkin
2233d6ed8a lib/uint64set: store pointers to bucket16 instead of bucket16 objects in bucket32
This speeds up bucket32.addBucketAtPos() when bucket32.buckets contains big number of items,
since the copying of bucket16 pointers is much faster than the copying of bucket16 objects.

This is a cpu profile for copying bucket16 objects:

      10ms     13.43s (flat, cum) 32.01% of Total
      10ms      120ms    650:	b.b16his = append(b.b16his[:pos+1], b.b16his[pos:]...)
         .          .    651:	b.b16his[pos] = hi
         .     13.31s    652:	b.buckets = append(b.buckets[:pos+1], b.buckets[pos:]...)
         .          .    653:	b16 := &b.buckets[pos]
         .          .    654:	*b16 = bucket16{}
         .          .    655:	return b16
         .          .    656:}

This is a cpu profile for copying pointers to bucket16:

      10ms      1.14s (flat, cum)  2.19% of Total
         .      100ms    647:	b.b16his = append(b.b16his[:pos+1], b.b16his[pos:]...)
         .          .    648:	b.b16his[pos] = hi
      10ms      700ms    649:	b.buckets = append(b.buckets[:pos+1], b.buckets[pos:]...)
         .      330ms    650:	b16 := &bucket16{}
         .          .    651:	b.buckets[pos] = b16
         .          .    652:	return b16
         .          .    653:}
2021-05-25 14:20:52 +03:00
Aliaksandr Valialkin
fd264477bf snap: update Go builder from Go 1.15 to Go 1.16 2021-05-25 12:12:37 +03:00
Dan Fredell
c78d7dde92 Fix quote difference on label_move example (#1321)
Fix quote difference on label_move example
2021-05-24 23:20:38 +03:00
Aliaksandr Valialkin
08234aa7a0 docs/CHANGELOG.md: cut v1.60.0 2021-05-24 15:55:08 +03:00
Aliaksandr Valialkin
a47d4927d2 docs/Single-server-VictoriaMetrics.md: clarify that the storage size depends on the number of samples per series 2021-05-24 15:47:44 +03:00
Aliaksandr Valialkin
b1e8d92577 docs/vmalert.md: sync with app/vmalert/README.md via make docs-sync 2021-05-24 15:47:20 +03:00
Aliaksandr Valialkin
8e7d1f8824 app/vmagent/remotewrite: use WARN level instead of ERROR level for couldnt send a block with size ... bytes to ... log message
This is really warning, since vmagent re-tries sending the data block until success.
2021-05-24 15:43:59 +03:00
Aliaksandr Valialkin
39ef1e7a51 lib/storage: do not stop data ingestion on the first error in Storage.AddRows
Continue data ingestion for the rest of blocks.
2021-05-24 15:32:47 +03:00
Aliaksandr Valialkin
4b01c9fb2e lib/storage: limit the number of rows per each block in Storage.AddRows()
This should reduce memory usage when ingesting big blocks or rows.
2021-05-24 15:24:07 +03:00
Aliaksandr Valialkin
a4ff4b8e65 lib/storage: allow filling all the rows up to their capacity in rawRowsShard.addRows
This should reduce memory usage a bit on data ingestion path
2021-05-24 15:22:59 +03:00
Aliaksandr Valialkin
a46551245c lib/bloomfilter: fix TestLimiterConcurrent 2021-05-24 05:17:36 +03:00
Aliaksandr Valialkin
eafed5335d vendor: update github.com/valyala/gozstd from v1.10.0 to v1.11.0 2021-05-24 04:59:55 +03:00
Aliaksandr Valialkin
93d81b486d lib/fs: do not pass done callback to tryRemoveAll() func
This improves code readability a bit.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1313
2021-05-24 04:51:57 +03:00
Aliaksandr Valialkin
f54133b200 lib/storage: do not populate MetricID->MetricName cache during data ingestion
This cache isn't needed during data ingestion, so there is no need in spending RAM on it.

This reduces RAM usage on data ingestion path by 30%
2021-05-24 03:02:46 +03:00
Aliaksandr Valialkin
24858820b5 docs/CHANGELOG.md: small typo fix 2021-05-23 14:15:01 +03:00
Aliaksandr Valialkin
04eb37a590 docs/CHANGELOG.md: document the addition of extra_filter_labels at 84cc0513e1 2021-05-23 14:10:33 +03:00
Aliaksandr Valialkin
ec79abc382 lib/{mergeset,storage}: reduce the number of IFNO log messages like merged ... items across ... blocks in ... seconds
Log these messages if the merge takes more than 30 seconds instead of 10 seconds.
2021-05-23 14:03:21 +03:00
Roman Khavronenko
84cc0513e1 vmalert: support extra_filter_labels setting per-group (#1319)
The new setting `extra_filter_labels` may be assigned to group.
If it is, then all rules within a group will automatically filter
for configured labels. The feature is well-described here
https://docs.victoriametrics.com#prometheus-querying-api-enhancements

New setting is compatible only with VM datasource.
2021-05-23 00:26:01 +03:00
Aliaksandr Valialkin
78dddfb98f lib/promauth: follow-up after 5b8176c68e 2021-05-22 18:01:11 +03:00
Nikolay
5b8176c68e basic OAuth2 support for remoteWrite and scrape targets (#1316)
* adds OAuth2 support for remoteWrite and scrapping

* adds tests
changes init
2021-05-22 16:20:18 +03:00
Aliaksandr Valialkin
e05dd475f0 lib/fs: concurrently remove up to 1024 blocked NFS directories
Previously the blocked directories were removed sequentially by a single goroutine.
This can be not enough for highly loaded VictoriaMetrics that accepts millions of sample per second,
when big number of LSM parts are created and removed at high rate.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1313
2021-05-21 17:57:46 +03:00
Aliaksandr Valialkin
8e2985b53d lib/fs: wait for a while before giving up on NFS file removal if the removal queue is full
This should reduce the probability of the panic on a highly loaded VictoriaMetrics
accepting millions of samples per second.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1313
2021-05-21 17:21:00 +03:00
Aliaksandr Valialkin
d173a9348c docs/MetricsQL.md: add a link to a list of supported timezones that can be passed to timezone_offset() function
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1306
2021-05-21 16:55:24 +03:00
Aliaksandr Valialkin
14ec2b9f26 docs/CHANGELOG.md: mention the bugfix from d626c5c2a9
Updates https://github.com/VictoriaMetrics/operator/issues/243
2021-05-21 16:37:14 +03:00
Aliaksandr Valialkin
c54bb73867 all: do not skip SIGHUP signal during service initialization
This can lead to stale or incomplete configs like in the https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-05-21 16:34:06 +03:00
Nikolay
d626c5c2a9 changes vmalert query function (#1307)
* changes vmalert query function
for prometheus rules compatibility its better to use labels as map.
it simplifies template evaluation and allow to ignore can't evaluate field error
because map will return default value.
fixes https://github.com/VictoriaMetrics/operator/issues/243
2021-05-21 13:55:43 +03:00
Aliaksandr Valialkin
060664e221 docs/FAQ.md: re-order questions to be more attractive to visitors 2021-05-20 19:49:34 +03:00
Aliaksandr Valialkin
49ecbc765d app/vmauth: add ability to protect /-/reload endpoint with authKey 2021-05-20 18:47:01 +03:00
Aliaksandr Valialkin
362a49bdd1 vendor: make vendor-update 2021-05-20 18:46:26 +03:00
Aliaksandr Valialkin
4c7bb75fa2 Makefile: update golangci-lint from v1.29.0 to v1.40.1 2021-05-20 18:27:10 +03:00
Aliaksandr Valialkin
0d3e78b9ee docs/CHANGELOG.md: move tip to proper place 2021-05-20 17:56:03 +03:00
Aliaksandr Valialkin
480087944a docs/FAQ.md: add a question on how to run VictoriaMetrics on FreeBSD
The question has been extracted from https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1284
2021-05-20 16:16:35 +03:00
Aliaksandr Valialkin
49c39ab388 docs/FAQ.md: add can I use VictoriaMetrics instead of Prometheus?
The question has been extracted from https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1284
2021-05-20 16:10:36 +03:00
Aliaksandr Valialkin
6cc6a032cd docs/FAQ.md: add a question about memory limits for VictoriaMetrics components
The question has been extracted from https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1284
2021-05-20 16:09:41 +03:00
Aliaksandr Valialkin
d06aae9454 docs/FAQ.md: add a question about multi-tenancy
The question has been extracted from https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1284
2021-05-20 15:52:40 +03:00
Aliaksandr Valialkin
e394ff6466 app/vmagent/remotewrite: expose metrics with the current number of active series per day and per hour
These numbers are exposed via the following metrics:

- vmagent_hourly_series_limit_current_series
- vmagent_daily_series_limit_current_series

Expose also the limits via the following metrics:

- vmagent_hourly_series_limit_max_series
- vmagent_daily_series_limit_max_series
2021-05-20 15:28:09 +03:00
Aliaksandr Valialkin
ad73f226ff app/vmstorage: add ability to limit series cardinality via -storage.maxHourlySeries and -storage.maxDailySeries command-line flags 2021-05-20 14:15:19 +03:00
Aliaksandr Valialkin
7e526effaa app/vmagent: add ability to limit series cardinality on a per-hour and per-day basis 2021-05-20 13:13:40 +03:00
Aliaksandr Valialkin
3cd8606abd docs/CHANGELOG.md: document the bugfix in vmctl import for InfluxDB lines with identical names for field and tag
See dcf8803bbd

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1299
2021-05-20 12:06:16 +03:00
Aliaksandr Valialkin
98e425ee09 docs/CHANGELOG.md: refer to the issue related to timezone_offset() function 2021-05-20 12:03:39 +03:00
Roman Khavronenko
dcf8803bbd vmctl: explicitly set ::tag type for labels selector in influx mode (#1310)
The `::tag` type is needed in cases when field and tag names are equal, which
results into unexpected results in InfluxQL. Setting the type explicitly helps
InfluxDB to understand which exact column we apply filter to.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1299
2021-05-20 12:03:16 +03:00
Aliaksandr Valialkin
22585531ad lib/promscrape/discovery/kubernetes: make golangci-lint happy by removing empty branches 2021-05-20 12:00:29 +03:00
Aliaksandr Valialkin
0842bb9294 app/vmselect/promql: add timezone_offset(tz) function
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1306
2021-05-20 11:53:09 +03:00
Aliaksandr Valialkin
009e136d88 lib/storage: remove possible data race when logging dropped labels 2021-05-20 02:47:22 +03:00
Aliaksandr Valialkin
f7e73fbe8b app/vmagent/remotewrite: sort labels before sending the series to per-remoteWrite.url queues 2021-05-20 02:12:36 +03:00
Aliaksandr Valialkin
eb8093ca6b lib/promscrape/discovery/kubernetes: reload objects on object parse error
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-05-18 23:25:48 +03:00
Neo He
e8a6c6927d app/{vmbackup,vmrestore},docs/vmrestore.md: typo fix: vbackup -> vmbackup (#1305) 2021-05-18 16:37:28 +03:00
Aliaksandr Valialkin
f4719889da lib/httpserver: typo fix in -http.shutdownDelay command-line flag description: servier -> server 2021-05-18 16:26:16 +03:00
Aliaksandr Valialkin
b30925738b docs/vmalert.md: document multitenant support
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/740
2021-05-18 16:26:14 +03:00
Aliaksandr Valialkin
0cf1fe19e6 lib/promscrape/discovery/kubernetes: simplify the reload logic for urlWatcher.objectsByKey 2021-05-18 15:40:46 +03:00
Aliaksandr Valialkin
bfb61de606 lib/promscrape/discovery/kubernetes: properly update vm_promscrape_discovery_kubernetes_scrape_works metric
Previously it wasn't descreased during config update.
2021-05-18 15:33:45 +03:00
Aliaksandr Valialkin
3166994244 lib/promscrape/discovery/kubernetes: log errors and stop service discovery when unexpected updates are received from Kubernetes API server
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-05-18 15:10:48 +03:00
Aliaksandr Valialkin
66aba00549 app/vmauth: reload -auth.config on the request to /-/reload
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1194
2021-05-18 02:23:55 +03:00
Aliaksandr Valialkin
3339ea41e7 docs/vmbackup.md: typo fix: snaphosts -> snapshots
Thanks to @jelmd - see 1ab27582a3 (r50884395)
2021-05-18 01:12:36 +03:00
Aliaksandr Valialkin
6c944b86d8 docs: dealay -> delay
Thanks to @jelmd . See 0b7e3510c8 (r50884991)
2021-05-18 01:07:52 +03:00
Aliaksandr Valialkin
ec6a284978 lib/promrelabel: add tests for conditional removal of label on another label match
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1294
2021-05-18 00:23:03 +03:00
Aliaksandr Valialkin
2eed410466 lib/promscrape/discovery/kubernetes: key ScrapeWork objects by urlWatcher instead of namespace
This makes the code less fragile if urlWatcher would depend on additional to namepsace properties.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1170
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-05-17 23:47:08 +03:00
Aliaksandr Valialkin
ede2ba5a45 docs/CHANGELOG.md: document b38edec7ee
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1293
2021-05-17 01:57:34 +03:00
Aliaksandr Valialkin
55ed77760b vendor: make vendor-update 2021-05-17 01:47:33 +03:00
Aliaksandr Valialkin
2ec6f81b10 vendor: update github.com/valyala/gozstd from v1.9.0 to v1.10.0 2021-05-17 01:47:33 +03:00
Roman Khavronenko
b96b19f040 Docs update from victoriaMetrics.github.io (#1302)
* port change from 11ca65677b

* port change from afb41dfa43

* port change from f82e3733c9

* port change from d499ab0502
2021-05-16 18:43:09 +01:00
Roman Khavronenko
b38edec7ee vmalert: use stringified label keys for duplicates map in recroding rules (#1301)
duplicates map helps to determine wheter extra labels has overriden
labels which make time series unique. It was using a sorted hashed
labels sequence as a key. But hashing algorithm could have collisions,
so it is more convenient to not use hashing at all.

Log message for recording rules duplicates was improved as well.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1293
2021-05-15 13:25:57 +03:00
Aliaksandr Valialkin
733706e6c6 lib/promscrape: reload auth tokens from files every second
Previously auth tokens were loaded at startup and couldn't be updated without vmagent restart.
Now there is no need in vmagent restart.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1297
2021-05-14 20:00:08 +03:00
Aliaksandr Valialkin
a15145e597 docs/Articles.md: add a link to https://fly.io/blog/measuring-fly/ 2021-05-14 19:47:43 +03:00
Aliaksandr Valialkin
24cbf8b66c docs/vmauth.md: sync with app/vmauth/README.md after 10a47af631 2021-05-14 18:13:10 +03:00
Aliaksandr Valialkin
10a47af631 app/{vmalert,vmauth}: explicitly set MaxIdleConnsPerHost in net/http.Client.Transport
By default MaxIdleConnsPerHost is set to 2. This limits the possibility to re-use http keep-alive connections.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1300
2021-05-14 18:12:24 +03:00
Aliaksandr Valialkin
e6ec442e96 docs/Single-server-VictoriaMetrics.md: document how to reduce memory usage when importing too long JSON lines into VictoriaMetrics
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1295
2021-05-14 17:22:08 +03:00
Denys Holius
70165a6758 Fix Cortex typo 2021-05-13 16:26:05 +03:00
Aliaksandr Valialkin
a422165dc6 app/vmagent/remotewrite: clarify the comment explaining why vmagent drops blocks if remote storage returns 400 or 409 status code 2021-05-13 16:16:16 +03:00
Aliaksandr Valialkin
fc3519fa26 lib/promscrape: limit scrape_timeout by scrape_interval like Prometheus does 2021-05-13 16:09:45 +03:00
Aliaksandr Valialkin
1f75ae6006 docs/CHANGELOG.md: document the bugfix from b4f5be8bd8 2021-05-13 11:18:39 +03:00
匠心零度
b4f5be8bd8 fix vagent imbalance problem (#1292)
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=2 -promscrape.config=/path/to/config.yml ...

Co-authored-by: lirenzuo <lirenzuo@shein.com>
2021-05-13 11:14:51 +03:00
Aliaksandr Valialkin
e6fda03e8f vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.14 to v1.0.15
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1289
2021-05-13 10:44:14 +03:00
Aliaksandr Valialkin
f6a641de62 lib/promscrape: exponentially increase retry interval on unsuccesful requests to scrape targets or to service discovery services
This should reduce CPU load at vmagent and at remote side when the remote side doesn't accept HTTP requests.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1289
2021-05-13 10:38:50 +03:00
Aliaksandr Valialkin
c0ec541559 lib/cgroup: document the ability to detect cgroup v2 memory and cpu limits. This is follow-up for b50024812e 2021-05-13 09:26:20 +03:00
Aliaksandr Valialkin
d7be2753c0 lib/storage: substitute GetTSDBStatusForDate with GetTSDBStatusWithFiltersForDate with nil tfss 2021-05-13 09:02:33 +03:00
Nikolay
b50024812e adds cgroupsv2 support (#1283)
* adds cgroupv2 limits support
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1269

* small fix

* changes Atoi to ParseUint
2021-05-13 09:02:13 +03:00
Aliaksandr Valialkin
a22a17dc66 lib/storage: merge getTSDBStatusForDate with getTSDBStatusWithFiltersForDate
These functions are non-trivial, while their code has minimal differences.
It is better from maintainability PoV to merge these functions into a single function.
2021-05-12 17:56:53 +03:00
Aliaksandr Valialkin
beddc0c0d5 docs/Single-server-VictoriaMetrics.md: typo fix: retuns->returns 2021-05-12 17:22:34 +03:00
Aliaksandr Valialkin
832651c6c2 app/vmselect: follow up after 8a0678678b
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1168
2021-05-12 17:18:30 +03:00
Nikolay
8a0678678b Adds tsdb match filters (#1282)
* init work on filters

* init propose for status filters

* fixes tsdb status
adds test

* fix bug

* removes checks from test
2021-05-12 15:18:45 +03:00
Aliaksandr Valialkin
cfd6aa28e1 lib/promscrape/discovery/kubernetes: refresh endpoints and endpointslices scrape targets every 5 seconds, since they may depend on changed service and pod objects
This should make endpoints and endpointslices scrape targets eventually consistent with the maximum delay of 5 seconds after the related service or pod object changes.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-05-12 14:10:34 +03:00
Aliaksandr Valialkin
afec68ad13 lib/httpserver: add new X-Server-Hostname header instead of overwriting already exsiting header
This makes possible tracking origins of chained requests over multiple hops.
2021-05-11 23:48:59 +03:00
Aliaksandr Valialkin
f2d5c4e2d0 lib/httpserver: return X-Server-Hostname http header in all the responses for better debuggability 2021-05-11 22:03:48 +03:00
Aliaksandr Valialkin
33f7bacb01 lib/storage: properly apply time range when matching an empty filter
It must match all the time series on the given time range.
Previously it was matched to all the time series without the restriction on the given time range.
2021-05-11 01:12:05 +03:00
Aliaksandr Valialkin
3fb3ce2a6d Revert ".github/dependabot.yml: remove automated dependency version checks"
This reverts commit 5b986c95dd.

This check verifies only dependencies needed for github-actions. This is OK.
2021-05-10 12:05:09 +03:00
Aliaksandr Valialkin
8b65920a8b Add make check-licenses rule for the ability to manually check licenses in vendored dependencies
This is a follow-up for c687536956
2021-05-10 11:54:43 +03:00
Aliaksandr Valialkin
5b986c95dd .github/dependabot.yml: remove automated dependency version checks
Dependency updates must be under manual control, since the resulting code diffs must be reviewed manually for the sake of security.
It is done with `make vendor-update` now.
2021-05-10 11:41:23 +03:00
Artem Navoiev
c687536956 Add vendor license checker, update codecov action, add dependbot for … (#1280)
* Add vendor license checker, update codecov action, add dependbot for github actions

* update gitingore, temprorary turn on check

* fix action name

* change action rules to trigger only when vendor changes

* remove obsolete line from main action
2021-05-10 11:38:56 +03:00
Aliaksandr Valialkin
229d9d6dd7 docs/CHANGELOG.md: document -datasource.roundDigits added at 5c448126dc 2021-05-10 11:18:26 +03:00
Roman Khavronenko
5c448126dc vmalert: add support for round_digits param in datasource package (#1278)
Starting from v1.56.0 VM supports `round_digits` which allows to limit
the number of digits after the decimal point in response value. The feature
can be used to reduce entropy of produced by recording rules values
and significantly improve the compression. See more details in link below.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/525
2021-05-10 11:11:45 +03:00
Aliaksandr Valialkin
3b0966c00c docs/CHANGELOG.md: document vmalert fix for state restoration on startup 2021-05-10 11:10:04 +03:00
Roman Khavronenko
4247168a2d vmalert: fix error when rule didn't start if restore failed (#1279)
Previously, `startGroup` could exit on restore errors despite the
`remoteRead.ignoreRestoreErrors` flag value. Now vmalert checks the
flag value before deciding whether to return error or just log it.
2021-05-10 11:06:31 +03:00
Aliaksandr Valialkin
6ff19096be docs/vmagent.md: add stream parsing mode chapter 2021-05-08 23:14:07 +03:00
Aliaksandr Valialkin
cbd0569ce2 docs/CHANGELOG.md: mention the comment, which gives an example of multi-level vminsert setup 2021-05-08 22:57:04 +03:00
Aliaksandr Valialkin
120927ab65 vendor: make vendor-update 2021-05-08 20:21:57 +03:00
Aliaksandr Valialkin
75c2c813fc lib/storage: remove dead code after the commit 3ccf7ea20c 2021-05-08 20:14:11 +03:00
Aliaksandr Valialkin
f8d50e9641 deployment/dm: update Go builder from v1.16.3 to v1.16.4
See https://github.com/golang/go/issues?q=milestone%3AGo1.16.4+label%3ACherryPickApproved for details
2021-05-08 20:04:05 +03:00
Aliaksandr Valialkin
7aea5f58c4 lib/ingestserver: properly close incoming connections during graceful shutdown 2021-05-08 19:52:58 +03:00
Aliaksandr Valialkin
12d733dd5d app/vminsert: add support for data ingestion via other vminsert nodes 2021-05-08 19:52:57 +03:00
Aliaksandr Valialkin
237e9f9fd7 app/vmalert: add missing comment for ErrStateRestore 2021-05-08 15:59:35 +03:00
Aliaksandr Valialkin
d6439490f5 docs/Single-server-VictoriaMetrics.md: add links to vmauth and vmgateway as auth proxy examples 2021-05-07 10:46:05 +03:00
Aliaksandr Valialkin
79ec35cef9 app/vmbackup: make sure that -snapshotName isnt set if -snapshot.createURL is set 2021-05-07 08:44:25 +03:00
Aliaksandr Valialkin
d9e3872b1c docs/CHANGELOG.md: document 904bbffc7f
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-05-05 20:33:05 +03:00
Aliaksandr Valialkin
4bea1afc6d docs/CHANGELOG.md: document 9cdd4696fe
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1252
2021-05-05 20:28:58 +03:00
Aliaksandr Valialkin
904bbffc7f lib/promscrape/discovery/kubernetes: start watchers for pods and services before starting watchers for endpoints
This should eliminate possible race when an update on endpoints depends on pods and/or services, which are missing in the cache yet.
This could result in missing targets based on endpoints or endpointslices.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-05-05 12:23:50 +03:00
Roman Khavronenko
9cdd4696fe vmalert: add flag to control behaviour on startup for state restore errors (#1265)
Alerting rules now can return specific error type ErrStateRestore to indicate
whether restore state procedure failed. Such errors were returned and logged
before as well. But now user can specify whether to just log these errors
(remoteRead.ignoreRestoreErrors=true) or to stop the process
(remoteRead.ignoreRestoreErrors=false). The latter is important when VM isn't
ready yet to serve queries from vmalert and it needs to wait.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1252
2021-05-05 10:07:19 +03:00
Denis Fondras
cdd1b06473 Update to OpenBSD 6.9 and VictoriaMetrics 1.59 (#1263)
* Update to OpenBSD 6.9 and VictoriaMetrics 1.58

While at it, also build vmagent

* Bump to v1.59 and remove zstd dependency
2021-05-03 14:14:13 +03:00
Aliaksandr Valialkin
76027093ca lib/storage: use WARNING instead of INFO level for logging dropped labels 2021-05-03 13:56:43 +03:00
Aliaksandr Valialkin
ce9e163e94 lib/httpserver: stop the process on panics in request handlers
Panics may leave the process in inconsistent state. That's why it is better to stop the process after the panic
instead of recovering from the panic. Unfortunately, the standard net/http.Server recovers panics in request handlers.
See https://github.com/golang/go/issues/16542 . That's lib/httpserver must stop the process on itself after the panic.
2021-05-03 11:59:40 +03:00
Aliaksandr Valialkin
988c3b386f docs/CHANGELOG.md: document the bugfix for proper removal of stale parts (477369b62f) 2021-05-03 11:38:15 +03:00
Nikolay
477369b62f adds stalePartsRemover (#1261)
for new created partitions
2021-05-03 11:34:00 +03:00
Aliaksandr Valialkin
9a930720b3 lib/promrelabel: add tests for removing the specified {label="value"} pair 2021-05-03 11:26:53 +03:00
Aliaksandr Valialkin
0969b446b3 deployment/docker: update base docker image from alpine:3.13.2 to alpine:3.13.5 2021-05-01 10:50:09 +03:00
Aliaksandr Valialkin
fb097ff774 docs/CHANGELOG.md: cut v1.59.0 2021-05-01 09:39:48 +03:00
Aliaksandr Valialkin
4e2ea844ca vendor: make vendor-update 2021-05-01 09:39:26 +03:00
Aliaksandr Valialkin
58d1e6eeea lib/storage: log dropped labels if the number of labels in a metric exceeds -maxLabelsPerTimeseries command-line flag value
This should improve debuggability for this case.
2021-05-01 09:27:55 +03:00
Aliaksandr Valialkin
508f500477 docs/vmalert.md: update docs after afca7b430c 2021-04-30 11:49:01 +03:00
Aliaksandr Valialkin
86bdb5ea95 docs/Cluster-VictoriaMetrics.md: document api/v1/series/count endpoint 2021-04-30 11:46:14 +03:00
Roman Khavronenko
0f988e5a31 vmalert: fix the typo in ApplyParams func (#1259) 2021-04-30 10:31:45 +03:00
Roman Khavronenko
afca7b430c vmalert: use rule's evaluationInterval as step param by default (#1258)
User still can override param by specifying `datasource.queryStep` flag.
2021-04-30 10:01:05 +03:00
Aliaksandr Valialkin
4394dc6cbb docs/CHANGELOG.md: document the change from f3a048288e
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1232
2021-04-30 09:54:41 +03:00
Roman Khavronenko
f3a048288e Vmalert: adjust time param for datasource queries according to evaluationInterval (#1257)
* Simplify arguments list for fn `queryDataSource` to improve readbility

* vmalert: adjust `time` param according to rule evaluation interval

With this change, vmalert will start to use rule's evaluation interval
for truncating the `time` param. This is mostly needed to produce consistent
time series with timestamps unaffected by vmalert start time. Now, timestamp
becomes predictable.
Additionally, adjustment is similar to what Grafana does for plotting range graphs.
Hence, recording rule series and recording rule expression plotted in grafana
suppose to become similar in most of cases.
2021-04-30 09:46:03 +03:00
Aliaksandr Valialkin
6fa5981e68 app/vmagent: list user-visible endpoints at http://vmagent:8429/
While at it, use common WriteAPIHelp function for the listing in vmagent, vmalert and victoria-metrics
2021-04-30 09:36:43 +03:00
Aliaksandr Valialkin
2ab1266593 lib/promscrape/discovery/kubernetes: remove a mutex at urlWatcher - use groupWatcher mutex for accessing all the urlWatcher children
This simplifies the code a bit and reduces the probability of improper mutex handling and deadlocks.
2021-04-29 10:14:26 +03:00
Aliaksandr Valialkin
25b8d71df5 vendor: update github.com/klauspost/compress from v1.12.1 to v1.12.2 2021-04-29 10:12:50 +03:00
Nikolay
4e5a88114a vmagent kubernetes_sd tests (#1253)
* first part of tests for kubernetes sd

* makes linter happy

* added more test cases

* adds pub/sub for tests
2021-04-29 10:10:24 +03:00
Nikolay
15609ee447 changes vmalert Querier with per rule querier (#1249)
* changes vmalert Querier with per rule querier
it allows to changes some parametrs based on rule setting
for instance - alert type, tenant for cluster version or event endpoint url.
2021-04-28 21:41:15 +01:00
Aliaksandr Valialkin
87179c6839 lib/{storage,mergeset}: fix unaligned 64-bit atomic operation panic for 32-bit architectures
The panic has been introduced in 56b6b893ce
2021-04-27 16:41:32 +03:00
Aliaksandr Valialkin
56b6b893ce lib/mergeset: split rows ingestion among multiple shards
This improves rows ingestion on systems with many CPU cores by reducing lock contention.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1244

Thanks to @waldoweng for the original idea and draft implementation at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1243
2021-04-27 15:36:34 +03:00
Aliaksandr Valialkin
2ec7d8b384 lib/promscrape/discovery/kubernetes: fix a deadlock introduced in eddba29664
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240

Thanks to @f41gh7 for providing the initial idea for deadlock fix at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1248
2021-04-27 14:57:51 +03:00
Aliaksandr Valialkin
f89c1f7f49 lib/storage: typo fix in info message when deleting the part outside the configured retention
Previously the message was displaying incorrect retention time
2021-04-27 13:32:46 +03:00
Aliaksandr Valialkin
60947fb2d5 lib/persistentqueue: eliminate possible data race when obtaining vm_persistentqueue_bytes_pending metric value 2021-04-27 00:25:52 +03:00
Roman Khavronenko
87018650dd vmalert: keep the returned timestamp when persisting recording rule (#1245)
Previously, vmalert used `lastExecTime` timestamp when writing recording rules
to the remote storage. This may be incorrect, if vmalert uses `datasource.lookback` flag,
which means rule's expression will be executed at some moment in the past.
To avoid such situations, vmalert now will use returned timestamp instead of `lastExecTime`.
2021-04-26 01:03:22 +03:00
Roman Khavronenko
d6f44977a7 docs: update per tenant stats page (#1246) 2021-04-25 09:34:28 +01:00
Aliaksandr Valialkin
00deb69e28 docs: ordering fix 2021-04-24 02:28:53 +03:00
Aliaksandr Valialkin
bf2439f962 vendor: make vendor-update 2021-04-24 01:34:07 +03:00
Aliaksandr Valialkin
20b77abc17 docs: update docs order 2021-04-24 01:27:13 +03:00
Aliaksandr Valialkin
e9898e1772 docs/Single-server-VictoriaMetrics.md: mention that the native export format can change between releases
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1203
2021-04-24 01:22:54 +03:00
Aliaksandr Valialkin
52b4b1605e app/vmagent/remotewrite: increase the maximum possible number of inmemory blocks for systems with high amounts of RAM
This should reduce the probability of using much slower file-based persistent queue
when vmagent processes metrics at high rate (millions of metrics per second).

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1235
2021-04-23 22:03:51 +03:00
Aliaksandr Valialkin
fb37b853e9 app/vmagent/remotewrite: count maxLabelsPerBlock as 10x of maxRowsPerBlock
This should increase block sizes and subsequently increase the maximum possible bandwidth per each connection to remote storage.
This, in turn, should reduce the probability of storing the data in local buffers.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1235
2021-04-23 21:55:47 +03:00
Aliaksandr Valialkin
908e35affd lib/promscrape: apply scrape_timeout on receiving the first response byte for stream_parse: true scrape targets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1017#issuecomment-767235047
2021-04-23 21:53:35 +03:00
Aliaksandr Valialkin
eddba29664 lib/promscrape/discovery/kubernetes: refresh role: endpoints targets on service object removal as Prometheus does
This is a follow-up for ae37cfd528

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-04-23 20:26:57 +03:00
Aliaksandr Valialkin
ae37cfd528 lib/promscrape/discovery/kubernetes: refresh endpoints and endpointslices targets on service object update like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
2021-04-23 20:11:40 +03:00
Aliaksandr Valialkin
f7d7236b44 Makefile: remove trailing whitespace 2021-04-23 11:05:57 +03:00
Aliaksandr Valialkin
478c56f281 docs/Cluster-VictoriaMetrics.md: sync with cluster branch 2021-04-22 14:49:16 +03:00
Aliaksandr Valialkin
758de04440 docs: add missing images for vmbackupmanager.md 2021-04-22 14:49:14 +03:00
Aliaksandr Valialkin
4c4c383f30 docs/vmbackupmanager.md: sync with app/vmbackupmanager/README.md 2021-04-22 14:44:07 +03:00
Aliaksandr Valialkin
bbebdf9ba1 lib/{storage,mergeset}: remove empty directories on startup. Such directories can be left after unclean shutdown on NFS storage
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1142
2021-04-22 13:02:44 +03:00
Aliaksandr Valialkin
1ab27582a3 docs/vmbackup.md: sync with app/vmbackup/README.md 2021-04-22 11:18:51 +03:00
Aliaksandr Valialkin
1fbc04facd app/vmbackup: typo fix: snaphsot -> snapshot
Follow-up for 9de0fa3649
2021-04-22 11:16:56 +03:00
Denys Holius
9de0fa3649 fixed typo; added example of usage vmbackup in docker-compose (#1237) 2021-04-22 11:14:13 +03:00
Aliaksandr Valialkin
7c4e460513 app/vmauth: parse url_prefix only once during config load 2021-04-21 10:55:29 +03:00
Aliaksandr Valialkin
6bc52fe41a all: rename https://victoriametrics.github.io to https://docs.victoriametrics.com 2021-04-20 20:16:17 +03:00
Aliaksandr Valialkin
5720b283fa all: consistency renaming Victoria Metrics -> VictoriaMetrics
VMInsert -> vminsert
VMSelect -> vmselect
VMStorage -> vmstorage
2021-04-20 11:45:48 +03:00
Denys Holius
9f9c99c8c0 removed DS_Store files from VM_logo.zip (#1233) 2021-04-20 11:41:07 +03:00
Aliaksandr Valialkin
187e3ec909 app/vmauth: follow-up for 6a81a89b3d 2021-04-20 10:58:29 +03:00
Nikolay
6a81a89b3d adds query params support for vmauth urlPrefix (#1226)
* adds query params support for vmauth urlPrefix

* Update app/vmauth/example_config.yml

* Update app/vmauth/example_config.yml

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2021-04-20 10:51:03 +03:00
Roman Khavronenko
b955fe0038 dashboard: use unit short for Labels limit exceeded panel (#1227) 2021-04-19 13:33:21 +03:00
Aliaksandr Valialkin
cc94afeacc docs/Cluster-VictoriaMetrics.md: sync with cluster README.md 2021-04-19 13:31:55 +03:00
Roman Khavronenko
f80156d9df dashboard: fix avg GC duration expression (#1228)
Previous expression was not correct.
2021-04-19 13:28:41 +03:00
Aliaksandr Valialkin
90bba22c25 .github/workflows: remove CODECOV_TOKEN 2021-04-19 11:11:14 +03:00
Aliaksandr Valialkin
b3d88610fd app/vmctl: update README.md according to bfecd0fd55 2021-04-16 12:09:45 +03:00
Denys Holius
650c143f6d removed MACOSX dirrectory from VM_logo.zip (#1224) 2021-04-16 07:30:15 +01:00
Denys Holius
bfecd0fd55 Added some explanation to vmctl docs (#1217)
Added explanation that vmctl do not make snapshot from Prometheus when run migration data
2021-04-15 18:40:48 +01:00
dereksfoster99
cf726448f2 Update PerTenantStatistic.md 2021-04-14 20:40:53 +03:00
Aliaksandr Valialkin
8d244c5f7f vendor: update github.com/klauspost/compress from v1.11.13 to v1.12.1 2021-04-14 14:20:56 +03:00
Aliaksandr Valialkin
574b80f274 docs/Cluster-VictoriaMetrics.md: mention that sometimes -replicationFactor shouldnt be set at vmselect nodes
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1207
2021-04-14 13:07:59 +03:00
ArtemVoitsekhovskyi
403a7b4a1f Grammar-correction (#1210) 2021-04-14 12:45:22 +03:00
ArtemVoitsekhovskyi
294c8b747b Grammar_correction (#1211) 2021-04-14 12:44:26 +03:00
Aliaksandr Valialkin
9b1999a5ff lib/uint64set: remove memory allocation in getOrCreateSmallPool()
This partially reverts fb82c4b9fa

It has been appeared that the additional memory allocation may result in higher GC pauses.
It is better to spend CPU time on copying bigger bucket16 structs instead of increasing query latencies due to higher GC pauses
2021-04-14 12:30:19 +03:00
f41gh7
e5db518c0f updates per-tenant stats docs
changes docs order
changes per-tenant-stats pic
2021-04-14 12:14:51 +03:00
Artem Navoiev
e9ee2122df [draft] per tenant statistic (#121)
* [draft] per tenant statistic

* updates metric name
update graph
adds link and example config

* quick fix

* adds grafana dashboard
adds example alert

Co-authored-by: f41gh7 <nik@victoriametrics.com>
2021-04-14 11:23:07 +03:00
Aliaksandr Valialkin
251747e253 lib/storage: code clarification: remove caching the found metricName in searchMetricName 2021-04-13 10:22:21 +03:00
Aliaksandr Valialkin
663a91bb82 docs: update -help output after the commit 77be3e3a82 2021-04-12 12:34:59 +03:00
Artem Navoiev
77be3e3a82 improve docs for cli flags (#1202)
* improve docs for cli flags

* improve docs for cli flags.2
2021-04-12 12:28:04 +03:00
Aliaksandr Valialkin
0b7e3510c8 docs: make docs-sync 2021-04-10 19:54:18 +03:00
Artem Navoiev
3ed113b390 update vmgateway.md (#1201) 2021-04-10 19:52:33 +03:00
Lapo Luchini
3d20fd20d0 Allow specifying build date from a variable (#1200)
Just like the existing infrastructure for `BUILDINFO_TAG`, this can ease the production of [reproducible builds](https://wiki.freebsd.org/ReproducibleBuilds).
(e.g. in FreeBSD the date the port was committed is used at build time, not the actual build time, so that an identical port produced at different times produces an identical executable)
2021-04-10 15:46:53 +03:00
Roman Khavronenko
7644efae01 Docs update (#1199)
* docs: drop table of contents for `vmctl`

We already have it autogenerated on .github.io, so no need to keep it.

* docs: mention OpenTSDB migration feature for vmctl

* docs: sync docs for `vmalert`
2021-04-10 15:46:08 +03:00
John Seekins
a9d76b06a7 Improve documentation on OpenTSDB migration tool and fix a bug with hard offsets (#1198)
* add more documentation on OpenTSDB migration explaining what chunking means
* more clarification of OpenTSDB aggregations
* break out what a retention string becomes
* add more docs around retention strings
* add example of running program and fix mistake in how hard offsets are handled
* fix formatting
2021-04-10 13:24:18 +01:00
John Seekins
cd9786c2a7 OpenTSDB migration to VictoriaMetrics (#1089) 2021-04-08 20:58:06 +01:00
Roman Khavronenko
162681e60d add new alerts (#1195)
* alerts: backport `DiskRunsOutOfSpace` alert and some other tweaks from cluster branch

* alerts: add `ServiceDown` alert to detect "dead" services
2021-04-08 18:24:25 +03:00
Roman Khavronenko
4ed8de62ac vmalert: document template functions and mention them in README (#1197) 2021-04-08 18:19:08 +03:00
Aliaksandr Valialkin
06e962f141 docs/Cluster-VictoriaMetrics.md: recommend setting up official alerts for VictoriaMetrics components 2021-04-08 12:15:24 +03:00
Aliaksandr Valialkin
fac1e3810a docs/Single-server-VictoriaMetrics.md: recommend using the official alerts for VictoriaMetrics in Monitoring section 2021-04-08 12:12:45 +03:00
Aliaksandr Valialkin
edd1590ac7 dashboards/victoriametrics.json: typo fix: chur rate -> churn rate 2021-04-08 09:35:50 +03:00
Aliaksandr Valialkin
3f0bcbe067 lib/promscrape: create a single swosFunc per scrape_config 2021-04-08 09:31:48 +03:00
Aliaksandr Valialkin
3eadee6cb7 vendor: make vendor-update 2021-04-08 00:58:22 +03:00
Aliaksandr Valialkin
f1a22b097a docs/CHANGELOG.md: cut v1.58.0 2021-04-08 00:48:16 +03:00
Aliaksandr Valialkin
544821b719 app/vmselect/promql: fix tests after d3fa0ccabd 2021-04-08 00:18:01 +03:00
Aliaksandr Valialkin
d3fa0ccabd app/vmselect/promql: properly detect aggregate topk* and bottomk* aggregate functions in order to disable duplicate sorting
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1189
2021-04-08 00:09:40 +03:00
Aliaksandr Valialkin
cb12a8f0a8 app/vmselect: return data:null instead of data:[] from /api/v1/query_exemplars, since Grafana throws an error otherwise
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1186
2021-04-07 23:34:06 +03:00
Aliaksandr Valialkin
5a0938d807 lib/promscrape: do not spend CPU time on constructing scrapeWork key if clustering is disabled 2021-04-07 21:54:22 +03:00
Aliaksandr Valialkin
1177dca3da app/vmselect: do not sort series returned from topk* and bottomk* functions, since these series are already sorted in user-expected order
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1189
2021-04-07 14:16:08 +03:00
Aliaksandr Valialkin
75f309fbbf docs/Cluster-VictoriaMetrics.md: follow-up after c6a8ebb11f 2021-04-07 13:47:46 +03:00
Roman Khavronenko
c6a8ebb11f docs: update docs ordering and formatting (#1192)
The major change is adding `sort` directive to docs. For those docs which are copied
from internal packages `sort` is added via makefile command. For the rest it is added
manually since they're updated manually as well.

The rest of changes is connected with markdown formatting. For example, changing headers
in some files (`##` => `#`) makes navigation on .github.io to look better. This especially
useful for `changelog` docs.

Table of contents for `vmctl` is dropped, since we already have it autogenerated on .github.io.

No link changes expected. The corresponding PR to `cluster` branch will be made in follow-up PR.
2021-04-07 13:39:16 +03:00
Aliaksandr Valialkin
df32d2836c lib/storage: properly handle big time ranges passed to /api/v1/labels and /api/v1/label/<labelName>/values
It should be faster querying all the labels and/or all the values instead of querying per-day labels/values on time ranges exceeding maxDaysForPerDaySearch
2021-04-07 13:33:46 +03:00
Aliaksandr Valialkin
59f9960992 lib/promscrape/discovery: remove superflouos check in registerPendingAPIWatchers
The check `_, ok := uw.aws[aw]; !ok` isn't needed, since aw cannot exist in uw.aws
because of the check inside subscribeAPIWatcher
2021-04-07 13:07:39 +03:00
Aliaksandr Valialkin
3ec6639bbb lib/promscrape/discovery/kubernetes: register pending apiWatchers in uw.aws 2021-04-06 11:12:13 +03:00
Aliaksandr Valialkin
7d23598b33 app/vmselect: return dumb response on /api/v1/query_exemplars request
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1186
2021-04-05 23:25:08 +03:00
Aliaksandr Valialkin
78d35d4f46 Makefile: prepare arm64 and amd64 release archives for cluster version on make release command 2021-04-05 23:03:19 +03:00
Aliaksandr Valialkin
5f593b0ed3 lib/promscrape/discovery/kubernetes: remove superflouos mustStart and mustStop functions 2021-04-05 22:44:12 +03:00
Aliaksandr Valialkin
4a0d06d1db deployment/docker/docker-compose.yml: update Grafana from v7.5.1 to v7.5.2 2021-04-05 22:30:58 +03:00
Lu Jiajing
b59164cf33 fix access to nil *url.URL (#1180)
* fix access to nil *url.URL

Signed-off-by: Megrez Lu <lujiajing1126@gmail.com>

* Update lib/promscrape/discovery/kubernetes/api_watcher.go

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2021-04-05 22:25:31 +03:00
Aliaksandr Valialkin
b46194472f lib/promscrape/discovery/kubernetes: reduce CPU time spent on registering big number of Kubernetes objects shared among big number of scrape jobs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1182
2021-04-05 22:04:30 +03:00
Aliaksandr Valialkin
a51d0ec6ec lib/promscrape/discovery/kubernetes: load objects missing in local cache from api seriver in getObjectByRole()
This should fix possible race for `role: endpoints` and `role: endpointslices` service discovery,
when the referred `pod` and `service` objects aren't propagated to urlWatcher cache yet.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1182#issuecomment-813353359 for details.
2021-04-05 20:31:17 +03:00
Aliaksandr Valialkin
95dbebf512 lib/persistentqueue: delete corrupted persistent queue instead of throwing a fatal error
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1030
2021-04-05 19:26:11 +03:00
Aliaksandr Valialkin
f010d773d6 lib/promscrape/discovery/kubernetes: synchronously load Kubernetes objects on first access
Remove async registration of apiWatchers, since it breaks discovering `role: endpoints` and `role: endpointslices` targets,
which depend on pod and service objects.

There is no need in reloading `endpoints` and `endpointslices` targets if the referenced `pod` or `service` objects change,
since in this case the corresponding `endpoints` and `endpointslices` objects should also change because they contain
ResourceVersion of the referenced `pod` or `service` objects, which is modified on object update.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1182
2021-04-05 14:20:12 +03:00
Aliaksandr Valialkin
2c25b0322b lib/proxy: typo fix after a5c5b54c22 2021-04-05 13:45:24 +03:00
Aliaksandr Valialkin
a5c5b54c22 lib/proxy: add support for socks5 over tls proxy 2021-04-05 13:00:05 +03:00
Aliaksandr Valialkin
6742839fd6 lib/promscrape: pass X-Prometheus-Scrape-Timeout-Seconds header to scrape targets as Prometheus does 2021-04-05 12:15:24 +03:00
Aliaksandr Valialkin
9ff3ecb991 docs/CHANGELOG.md: explain why -sortLabels is set to false by default 2021-04-04 01:59:25 +03:00
Aliaksandr Valialkin
9a97941e2a docs/vmagent.md: mention that vmagent supports scraping via socks5 proxy 2021-04-04 01:45:52 +03:00
Aliaksandr Valialkin
fc2240fb22 docs/CHANGELOG.md: document the ability to use socks5 proxy
Follow-up for a4c6a3b3e1

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1177
2021-04-04 01:42:00 +03:00
Nikolay
a4c6a3b3e1 adds socks5 support for fasthttp client (#1178)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1177

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2021-04-04 01:29:54 +03:00
Aliaksandr Valialkin
500e625e8c lib/promscrape: properly send full url in GET request via simple HTTP proxy
This is a follow-up for a0ae0f86666a75ec57b45eab2429da7ab4a7b250

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1179
2021-04-04 01:20:06 +03:00
Aliaksandr Valialkin
5153410ced lib/promscrape: support for simple HTTP proxies without CONNECT method support such as https://github.com/prometheus-community/PushProx
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1179
2021-04-04 00:40:40 +03:00
Aliaksandr Valialkin
4c56b1a6dd lib/promscrape: add tests for authorization config, which has been added in df148f48b7 2021-04-03 22:13:22 +03:00
Aliaksandr Valialkin
7a0b964e8d app/vmselect/promql: do not delete dst_label if src_label is empty in label_copy(q, src_label, dst_label) and label_move(q, src_label, dst_label) 2021-04-03 22:05:06 +03:00
Aliaksandr Valialkin
6f3080f9fb docs/CHANGELOG.md: document the change from 3055ab0115 (add ability to pass "label=value" to the third argument to topk_* and bottomk_* functions 2021-04-03 21:42:08 +03:00
Aliaksandr Valialkin
e1d1708fa2 docs/Single-server-VictoriaMetrics.md: add missing link in heading to Graphite Render API usage chapter 2021-04-03 21:34:13 +03:00
Aliaksandr Valialkin
43f9842b6f lib/proxy: log response body on non-200 response code
This should improve debuggability for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1179
2021-04-03 03:03:55 +03:00
Aliaksandr Valialkin
2256b79a89 docs/vmgateway.md: update docs 2021-04-03 00:29:19 +03:00
Aliaksandr Valialkin
b8d9d6c326 docs/CHANGELOG.md: yet another typo fix 2021-04-03 00:25:39 +03:00
Aliaksandr Valialkin
22949911e9 docs/CHANGELOG.md: typo fix 2021-04-03 00:23:50 +03:00
Aliaksandr Valialkin
3055ab0115 app/vmselect/promql: add ability to set label value additionally to label name for the remaining sum of time series returned from topk_* and bottomk_* functions in the form: topk_min(N, m, "label=value") 2021-04-02 23:55:54 +03:00
Aliaksandr Valialkin
25e19c75c7 docs/{vmauth,vmgateway}.md: small fixes 2021-04-02 23:15:00 +03:00
Aliaksandr Valialkin
3c1b39c978 app/vmgateway: publish docs 2021-04-02 23:08:41 +03:00
Aliaksandr Valialkin
0db901617d app: do not process non-GET requests on at / handler 2021-04-02 22:54:06 +03:00
Aliaksandr Valialkin
569e58dcdf vendor: make vendor-update 2021-04-02 22:20:17 +03:00
Aliaksandr Valialkin
b1d0028e79 app/vmauth: add support for authorization via Authorization: Bearer <token> 2021-04-02 22:14:53 +03:00
Aliaksandr Valialkin
b88feb631e docs/vmagent.md: mention about proxy_authorization section 2021-04-02 21:24:11 +03:00
Aliaksandr Valialkin
df148f48b7 lib/promscrape: add support for authorization config in -promscrape.config as Prometheus 2.26 does
See https://github.com/prometheus/prometheus/pull/8512
2021-04-02 21:17:45 +03:00
Aliaksandr Valialkin
7f9c68cdcb lib/promscrape: add follow_redirect option to scrape_configs section like Prometheus does
See https://github.com/prometheus/prometheus/pull/8546
2021-04-02 19:56:40 +03:00
Aliaksandr Valialkin
d1dcbfd0f9 deployment/docker: upgrade Go builder from v1.16.2 to v1.16.3
See https://github.com/golang/go/issues?q=milestone%3AGo1.16.3+label%3ACherryPickApproved
2021-04-02 19:22:32 +03:00
Aliaksandr Valialkin
c79e4a2f90 app/vmselect/promql: remove the limit on the number of time series that can be sorted, since it may confuse users
Always sort time series returned from `/api/v1/query` and `/api/v1/query_range` unless `sort_*` function is used at top level of the query.
2021-04-02 15:02:08 +03:00
Aliaksandr Valialkin
5b08e6fb16 lib/promscrape/discovery/kubernetes: properly track objects with the same names in multiple namespaces
This is a follow-up for 12e4785fe8

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1170
2021-04-02 14:45:32 +03:00
Aliaksandr Valialkin
12e4785fe8 lib/promscrape/discovery/kubernetes: properly discover targets in multiple namespaces
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1170
2021-04-02 14:28:30 +03:00
Aliaksandr Valialkin
3967bd705a docs/CHANGELOG.md: mention about AWS IAM roles for tasks support for EC2 service discovery
Follow-up for fdb8995642
2021-04-02 13:10:03 +03:00
Nikolay
fdb8995642 Adds aws ECS credentials support (#1175) 2021-04-02 11:56:40 +03:00
Aliaksandr Valialkin
9d237408c6 Makefile: add missing -prod suffix to binary names in *_checksums.txt file
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1171
2021-04-02 11:55:31 +03:00
Aliaksandr Valialkin
759c938870 Makefile: properly generate checksums for *.tar.gz files
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1171
2021-04-02 11:50:04 +03:00
Aliaksandr Valialkin
dc9eafcd02 app/{vminsert,vmagent}: add -sortLabels command-line option for sorting time series labels before ingesting them in the storage
This option can be useful when samples for the same time series are ingested with distinct order of labels.
For example, metric{k1="v1",k2="v2"} and metric{k2="v2",k1="v1"}.
2021-03-31 23:27:58 +03:00
Aliaksandr Valialkin
e1f699bb6c lib/storage: reduce memory usage when ingesting samples for the same time series with distinct order of labels 2021-03-31 21:24:46 +03:00
Lapo Luchini
db963205cc Add vmutils-pure target (#1163) 2021-03-31 17:42:08 +03:00
Aliaksandr Valialkin
48275d8c12 app/vmagent/remotewrite: reduce memory usage when -remoteWrite.queues is set to a big value
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1167
2021-03-31 16:16:33 +03:00
Aliaksandr Valialkin
f888d194da docs/Articles.md: add a link to https://blog.kintone.io/entry/2021/03/31/175256 2021-03-31 15:14:35 +03:00
Aliaksandr Valialkin
33622c409c app/vmagent/remotewrite: reduce memory usage when samples with big number of labels are sent to remote storage 2021-03-31 00:44:54 +03:00
Aliaksandr Valialkin
3be0e6b087 docs/Single-server-VictoriaMetrics.md: update victoria-metrics -help output after e7fdea5953 2021-03-30 21:44:54 +03:00
Aliaksandr Valialkin
e7fdea5953 app/vmselect: add -search.maxStatusRequestDuration command-line flag for limiting the duration of requests to /api/v1/status/* and /api/v1/series/count 2021-03-30 21:41:35 +03:00
Denys Holius
2af39a96c5 deployment: Grafana version updated to 7.5.1 (#1161) 2021-03-30 20:43:54 +03:00
Aliaksandr Valialkin
2d3082bb55 docs/CHANGELOG.md: cut v1.57.1 2021-03-30 15:40:06 +03:00
Aliaksandr Valialkin
0fe8f11090 docs/CHANGELOG.md: mention about returned back type label for vm_tenant_inserted_rows_total metric
See 9b4e608199
2021-03-30 15:16:57 +03:00
Aliaksandr Valialkin
d58d5562f1 app/vmselect: remove -search.storageTimeout command-line flag, since it has the same meaning as -search.maxQueryDuration
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2021-03-30 15:04:54 +03:00
Aliaksandr Valialkin
7962cf1af8 app/vmselect: prevent from possible incomplete query results after timed out query
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2021-03-30 13:35:45 +03:00
Aliaksandr Valialkin
65c60cf413 Makefile: build arm binaries on make release-victoria-metrics
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1147
2021-03-29 23:44:39 +03:00
Aliaksandr Valialkin
75991277fa Makefile: build vmutils for arm on make release-vmutils
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1147
2021-03-29 23:15:29 +03:00
Aliaksandr Valialkin
1db1a29ffa all: increase minimum supported Go version for building VictoriaMetrics components from v1.14 to v1.15
This is needed after the commit c0ac740f93, which uses URL.Redacted() method,
which has been added in v1.15.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1147
2021-03-29 23:04:53 +03:00
Aliaksandr Valialkin
947b37ba8e docs/CHANGELOG.md: cut v1.57.0 2021-03-29 19:14:49 +03:00
Aliaksandr Valialkin
e6fd1f7875 docs/CHANGELOG.md: typo fixes 2021-03-29 15:46:35 +03:00
Aliaksandr Valialkin
5c7c0b2bda docs/CHANGELOG.md: mention about logging of metrics with too old timestamps in a single-node VictoriaMetrics
This is a follow up for aa81039b42
2021-03-29 15:42:46 +03:00
Aliaksandr Valialkin
5f4a9f782f docs/CHANGELOG.md: mention Graphite Render API fixes 2021-03-29 14:27:42 +03:00
Aliaksandr Valialkin
602a3d99ae docs/CHANGELOG.md: mention optimized query performance on systems with many CPU cores 2021-03-29 13:55:10 +03:00
Roman Khavronenko
cfdb6762e6 deployment: add new alert TooHighChurnRate24h (#1154)
Alert `TooHighChurnRate24h` suppose to cover cases when churn rate
is low but results in multiple times higher number than total
number of active series.
2021-03-29 12:38:03 +03:00
Roman Khavronenko
b1e49bab52 Dashboards update (#1153)
* dashboard: update single node dashboard

* add number of new series created over last 24h;
* bump version requirements.

* dashboard: update vmagent dashboard

* add panel for open file descriptors;
* add panel for disk I/O;
* add panel for `vmagent_remotewrite_packets_dropped_total` metric;
* bump version requirements.
2021-03-29 12:37:17 +03:00
Aliaksandr Valialkin
b75c2ce659 lib/uint64set: improve Set.Has() performance scalability on multi-CPU system
Do not update bucket32.hint on Set.Has() call, since it leads to memory ping-pong between CPU cores multi-CPU system
2021-03-29 12:33:47 +03:00
Aliaksandr Valialkin
2601cc0fb0 lib/storage: do not update b.nextIdx if no samples are removed because of retention 2021-03-29 12:00:21 +03:00
hagen1778
0c403bfd29 deployment: fix typo in vmalert docker-compose definition 2021-03-29 11:06:32 +03:00
Aliaksandr Valialkin
78188decf9 docs: document that vmagent drops data blocks when remote storage replies with 400 and 409 http status codes
This is a follow up for 1b7dc1e5a5.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149
2021-03-26 14:44:06 +02:00
Aliaksandr Valialkin
c54cb3e63c app/vmagent/remotewrite: remove superflouos code after 1b7dc1e5a5 2021-03-26 13:59:46 +02:00
Aliaksandr Valialkin
8fc8ef1aba vendor: update github.com/klauspost/compress from v1.11.12 to v1.11.13 2021-03-26 13:57:01 +02:00
Nikolay
1b7dc1e5a5 Adds blocks drop (#1151)
* adds blocks drop at 400 BadRequest status code
recieved from remote storage,
not expected that remote storage will be able to handle it on retry

* removes error logging for dropped blocks,
its expected error
2021-03-26 14:17:59 +03:00
Aliaksandr Valialkin
f39c84b21f lib/promscrape/discovery/kubernetes: typo fix in error message 2021-03-26 12:46:14 +02:00
Aliaksandr Valialkin
9761ffd161 lib/promscrape/discovery/kubernetes: properly handle too old resource version error message from Kubernetes watch API 2021-03-26 12:28:10 +02:00
Aliaksandr Valialkin
aa81039b42 app/vmselect: log the metric which trigger rollup result cache reset
This should help finding the source of stale metrics
2021-03-25 21:31:39 +02:00
Aliaksandr Valialkin
50f790b5d7 docs/Cluster-VictoriaMetrics.md: mention that vmselect doesnt serve partial responses from export API
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1148
2021-03-25 21:04:13 +02:00
Aliaksandr Valialkin
136fc6217c vendor: make vendor-update 2021-03-25 17:56:10 +02:00
Aliaksandr Valialkin
5ec9e49103 docs/vmagent.md: add an example for -remoteWrite.label 2021-03-25 17:54:55 +02:00
Aliaksandr Valialkin
88f6286df7 docs/Cluster-VictoriaMetrics.md: sync with upstream 2021-03-25 17:18:18 +02:00
Aliaksandr Valialkin
f6529f932a docs: add a link to the repository from build instruction for all the VictoriaMetrics components 2021-03-25 17:14:42 +02:00
Aliaksandr Valialkin
2065d11300 docs/vmagent.md: cosmetic fixes 2021-03-25 17:11:19 +02:00
Aliaksandr Valialkin
35fb9bdee1 docs/vmagent.md: cosmetic fixes 2021-03-25 16:54:03 +02:00
Aliaksandr Valialkin
a647144616 docs/vmagent.md: typo fix: tupically -> typically 2021-03-25 16:48:45 +02:00
Aliaksandr Valialkin
c3c3e51f17 docs/vmalert.md: remove misleading -evaluationInterval=3s from example config args
3s evaluation interval is too small for practical setups. It can result in increased load on datasource.
So it is better to remove it from example config args, which are usually copy-pasted by novice users.
2021-03-25 15:29:06 +02:00
Aliaksandr Valialkin
0b2a66db30 app/vmselect/promql: do not merge time series during requests to /api/v1/query
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1141
2021-03-25 13:56:07 +02:00
Aliaksandr Valialkin
6e855d4b82 lib/storage: tune loopsCountPerMetricNameMatch according to production workload 2021-03-25 13:27:47 +02:00
Aliaksandr Valialkin
d4aadba9fa app/vmagent: add -promscrape.consul.waitTime command-line flag for configuring Consul service discovery wait time
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1144
2021-03-23 19:33:25 +02:00
Aliaksandr Valialkin
edc0a94a3c docs/CHANGELOG.md: mention the feature from 44a6cc5eca 2021-03-23 19:00:18 +02:00
Aliaksandr Valialkin
3a3d2165f9 lib/storage: do not reload metricName for the same metricID in Search.NextMetricBlock
This should speed up Search.NextMetricBlock a bit
2021-03-23 17:56:49 +02:00
Aliaksandr Valialkin
9c566f7db9 app/vmagent: mention -remoteWrite.maxDiskUsagePerURL in the descriptio of -remoteWrite.tmpDataPath flag 2021-03-23 16:38:48 +02:00
Nikolay
29f9ef9b7f changes consul_service label value (#1143)
according to prometheus discovery.
 It should mitigate issue with case sensetive services
https://github.com/hashicorp/consul/issues/5707
2021-03-23 15:35:01 +02:00
Aliaksandr Valialkin
331a6a2015 app/vmselect/graphite: accept and enforce extra_label in all the Graphite APIs 2021-03-23 15:29:16 +02:00
Aliaksandr Valialkin
b521d1d4f2 app/vmselect: move getEnforcedTagFiltersFromRequest to searchtuils, since it will be used in Graphite functions soon 2021-03-23 14:16:29 +02:00
Aliaksandr Valialkin
3cfb3a3683 lib/storage: respect the deadline passed to Storage.SearchMetricNames 2021-03-22 23:03:17 +02:00
Aliaksandr Valialkin
8e2afdf568 lib/storage: improve Search.NextMetricBlock performance by using MetricID->MetricName cache 2021-03-22 22:49:18 +02:00
Aliaksandr Valialkin
e17eb35147 docs/Single-server-VictoriaMetrics.md: sync with README.md 2021-03-22 17:51:43 +02:00
Aliaksandr Valialkin
65a61ff118 docs/Articles.md: add https://blog.cybozu.io/entry/2021/03/18/115743 2021-03-22 17:50:52 +02:00
Aliaksandr Valialkin
71b72304ae app/vmselect: improve description for -search.maxPointsPerTimeseries command-line flag 2021-03-22 16:45:34 +02:00
Aliaksandr Valialkin
44a6cc5eca app/{vminsert,vmagent}: use Influx field as metric name if measurement is empty and -influxSkipSingleField command-line is set
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1139
2021-03-22 13:53:53 +02:00
Aliaksandr Valialkin
910092ca4d lib/storage: tune loopsCountPerMetricNameMatch 2021-03-22 12:53:17 +02:00
Aliaksandr Valialkin
cef010d5f7 app/vmselect/promql: increment key prefix for faster reset for rollup result cache 2021-03-22 11:59:07 +02:00
Aliaksandr Valialkin
648d11b8e0 vendor: update github.com/VictoriaMetrics/metrics from v1.17.0 to v1.17.1 2021-03-18 18:53:07 +02:00
Aliaksandr Valialkin
fb83e97170 app/victoria-metrics: use flag.Parse instead of envflag.Parse for avoiding possible side effects of envflag 2021-03-18 18:20:22 +02:00
Aliaksandr Valialkin
b0c956a178 app/vmselect/graphite: follow-up after 529d7be26b 2021-03-18 16:30:20 +02:00
Nikolay
529d7be26b changes metricsFind api (#1137)
it should be able mitigate crash if label value contains *,[ or { symbols
2021-03-18 16:12:02 +02:00
Aliaksandr Valialkin
726f6ad804 lib/storage: small code simplification after 6cee5338b2 2021-03-18 15:21:13 +02:00
Aliaksandr Valialkin
6cee5338b2 lib/storage: prevent from infinite loop if {__graphite__="..."} filter matches a metric name with *, [ or { chars
The idea has been borrowed from https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1137
2021-03-18 14:53:47 +02:00
Aliaksandr Valialkin
e061a4fa19 docs/Single-server-VictoriaMetrics.md: remove outdated message about experimental mode for -pure builds 2021-03-18 13:49:17 +02:00
Aliaksandr Valialkin
4fb049bcba lib/fs: reduce the frequency of failed to remove directory ... due to NFS lock log warnings
Log `failed to remove directory ... due to NFS lock` warning only if the directory cannot be removed in one second.
2021-03-18 13:24:46 +02:00
Aliaksandr Valialkin
17d4a6e900 vendor: update github.com/VictoriaMetrics/metrics from v1.16.0 to v1.17.0 2021-03-17 23:23:04 +02:00
Aliaksandr Valialkin
904dababcc vendor: update github.com/VictoriaMetrics/metrics from v1.15.3 to v1.16.0
This adds the following new metrics for each VictoriaMetrics app:

* process_resident_memory_anonymous_bytes - the RSS share for memory allocated by the process itself.
  This share cannot be freed by the OS, so it must be taken into account by OOM killer.

* process_resident_memory_pagecache_bytes - the RSS share for page cache memory (aka memory-mapped files).
  This share can be freed by the OS at any time, so it must be ignored by OOM killer.
2021-03-17 17:59:40 +02:00
Aliaksandr Valialkin
45dabfac1b lib/storage: faster move heavy filters to the end of list 2021-03-17 15:12:13 +02:00
Aliaksandr Valialkin
b1713e3fcd app/vmselect/promql: typo fix after 9666834045 2021-03-17 15:12:11 +02:00
Aliaksandr Valialkin
9666834045 app/vmselect/promql: merge adjancent buckets with the smallest summary number of hits in buckets_limit() function
This should improve accuracy for the returned buckets
2021-03-17 14:31:41 +02:00
Aliaksandr Valialkin
20ac89c4e0 docs/CHANGELOG.md: cut v1.56.0 2021-03-17 02:04:55 +02:00
Aliaksandr Valialkin
bd0c6a095e docs/CHANGELOG.md: do not mention reduction in query duration
There was a signficant refactoring in the code responsible for time series search,
so it can result in both speed ups and slow downs depending on used queries.
2021-03-17 01:56:58 +02:00
Aliaksandr Valialkin
7bc728bf53 app/vmselect: add vm_index_search_duration_seconds histogram for monitoring the performance of index search 2021-03-17 01:17:41 +02:00
Aliaksandr Valialkin
828669e4e1 all: make golint happy 2021-03-17 00:49:28 +02:00
Aliaksandr Valialkin
ccfb0ae2d3 lib/storage: limit loops count in order to reduce max CPU usage during filter search 2021-03-17 00:49:26 +02:00
Aliaksandr Valialkin
576a80b3d9 lib/storage: do not modify filterLoopsCount stats with loopsCount stats
Such a modification can result in incorrect filter sorting later
2021-03-17 00:49:26 +02:00
Aliaksandr Valialkin
f104f3eb2a all: make golangci-lint happy after the commit 6378205415 2021-03-17 00:24:40 +02:00
Aliaksandr Valialkin
6378205415 lib/netutil: enable IPv6 UDP listening if -enableTCP6 command-line flag is passed to VictoriaMetrics
This is a follow-up for 18cfc4be7b

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1131
2021-03-17 00:16:17 +02:00
Nikolay
18cfc4be7b Adds udp6 support for ingest servers (#1134)
with flag -enableUDP6  https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1131
2021-03-17 00:03:06 +02:00
Aliaksandr Valialkin
0ce557951f app/vmselect/netstorage: reduce mutex contention when unpacking data on a system with high number of CPU cores 2021-03-16 21:51:31 +02:00
Aliaksandr Valialkin
35bb44d317 lib/uint64set: optimize bucket16.addMulti a bit 2021-03-16 21:09:07 +02:00
Aliaksandr Valialkin
aba955fa16 Makefile: prepare vmutils-windows-*.zip archive on make release-vmutils command
The archive contains the following executables for Windows:

* vmagent
* vmalert
* vmauth
* vmctl

Other components - vmbackup, vmrestore, victoria-metrics - aren't supported for Windows yet
2021-03-16 20:52:41 +02:00
Aliaksandr Valialkin
fd86a7dc1d lib/storage: time series search optimization according to production workload profiling
Do not pass filter metric ids to getMetricIDsForTagFilter, since it has been appeared that this slows down
the function by multiple times when it finds big number of metricIDs (tens of millions).
2021-03-16 20:01:43 +02:00
Aliaksandr Valialkin
264d3432ac vendor: make vendor-update 2021-03-16 19:08:36 +02:00
Aliaksandr Valialkin
e36fbfae5b lib/storage: further tuning for time series search 2021-03-16 18:46:22 +02:00
Aliaksandr Valialkin
f0a4157f89 app/vmselect/promql: do not crash if histogram_over_time() function name contains uppercase letters such as Histogram_over_time() 2021-03-16 12:24:21 +02:00
Aliaksandr Valialkin
645cf6746c vendor: update github.com/VictoriaMetrics/metrics from v1.15.2 to v1.15.3 2021-03-16 12:15:50 +02:00
Aliaksandr Valialkin
031d256810 docs/Articles.md: added a link to https://www.youtube.com/watch?v=QgLMztnj7-8 2021-03-15 23:01:49 +02:00
Aliaksandr Valialkin
dd7e82c34f app/vmstorage: add -logNewSeries command-line flag for determining the source of series churn rate 2021-03-15 22:38:50 +02:00
Aliaksandr Valialkin
37323c57c9 lib/influxutils: return response compatible with InfluxDB 1.8.4
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1124
2021-03-15 22:19:59 +02:00
f41gh7
acf2a82d3c typo fix 2021-03-15 23:03:52 +03:00
Aliaksandr Valialkin
85a95bf60c all: various fixes in command-line flag descriptions 2021-03-15 21:59:25 +02:00
Aliaksandr Valialkin
7c37e9aea9 app/{vminsert,vmagent}: a follow-up for b1aa8c3d8f
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1124
2021-03-15 21:37:55 +02:00
Nikolay
b1aa8c3d8f adds fake response for telegraph queries (#1130)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1124
2021-03-15 21:10:47 +02:00
Aliaksandr Valialkin
9c77e34ef9 lib/storage: further tuning for time series selector code 2021-03-15 20:31:34 +02:00
Aliaksandr Valialkin
923cdb0552 app/vmselect/promql: reduce overhead on scrape interval estimation
It should be enough to use the first 20 datapoints instead of 100 datapoints for scrape interval estimation.
2021-03-15 20:31:33 +02:00
Aliaksandr Valialkin
82aab87446 app/vmselect/promql: fix tests after 2dae0a2c47 2021-03-15 20:18:59 +02:00
Aliaksandr Valialkin
fb82c4b9fa lib/uint64set: reduce the size of bucket16 by storing smallPool by pointer.
This reduces CPU time spent on bucket16 copying inside bucket13.addBucketAtPos.
2021-03-15 17:23:31 +02:00
Aliaksandr Valialkin
eb103e1527 lib/uint64set: optimize Set.AddMulti for large sorted sets 2021-03-15 17:10:40 +02:00
Aliaksandr Valialkin
43504ebd14 lib/uint64set: optimize bucket16.add and bucket16.addMulti a bit 2021-03-15 16:58:12 +02:00
Aliaksandr Valialkin
fb935e6e2c docs/CHANGELOG.md: typo fix: FATURE -> FEATURE 2021-03-15 14:58:19 +02:00
Aliaksandr Valialkin
3ccf7ea20c lib/storage: tune per-day index search 2021-03-15 13:31:55 +02:00
Aliaksandr Valialkin
2dae0a2c47 app/vmselect: add round_digits query arg to /api/v1/query and /api/v1/query_range handlers for limiting the number of decimal digits after the point 2021-03-15 12:36:33 +02:00
Roman Khavronenko
b457739f87 Single dashboard (#1126)
* dashboard: update single node dashboard

* add panel `Open FDs` for file descriptors metrics;
* add panel `Disk writes/reads` to show the real read/write
load on storage layer;
* add `process_resident_memory_bytes` metric to memory usage panel;
* add stats panel to show available CPUs, memory and disk space;
* rm flags panel since it didn't prove its usefulness.

* alerts: add alert for reaching FDs limit
2021-03-15 12:04:24 +02:00
Aliaksandr Valialkin
6d91842c83 docs/Cluster-VictoriaMetrics.md: sync with cluster README.md 2021-03-15 11:51:40 +02:00
Aliaksandr Valialkin
c14dafce43 lib/promscrape: an attempt to reduce memory usage when vmagent scrapes targets with varying number of metrics
Do not cache too big byte buffers and too big writeRequestCtx objects,
since it is cheaper to re-create them instead of wasting RAM for their caching.

This reverts 7f6f350ee1
2021-03-15 11:45:39 +02:00
Aliaksandr Valialkin
7f6f350ee1 lib/promscrape: return back the logic for flushing big buffers to storage from the commit 3fd8653b40
This should reduce memory usage when vmagent scrapes targets with big number of metrics and `-promscrape.streamParse` isn't enabled
2021-03-14 22:26:00 +02:00
Aliaksandr Valialkin
b88806ecbf lib/promscrape/discovery/kubernetes: do not start object watcher until initial objects are loaded 2021-03-14 21:55:00 +02:00
Aliaksandr Valialkin
83edbb7cab lib/promscrape: retry service discovery in a few seconds if it starts returning 0 targets
This should reduce recovery time from temporary issues during service discovery
2021-03-14 21:53:23 +02:00
Aliaksandr Valialkin
bf15d6a6a2 lib/promscrape: remove duplicate target word in error message 2021-03-14 21:52:02 +02:00
Aliaksandr Valialkin
d409898515 lib/promscrape/discovery/kubernetes: further optimize kubernetes service discovery for the case with many scrape jobs
Do not re-calculate labels per each scrape job - reuse them instead for scrape jobs with identical Kubernetes role

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1113
2021-03-14 21:14:53 +02:00
Aliaksandr Valialkin
7a16e8e3a2 lib/promscrape/discovery: fixes after 133b288681
- Removed a deadlock in addAPIWatcher
- Do not create unused ScrapeWork objects
- Do not spend CPU resources on creating objectByKey map in addAPIWatcher

This work is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1125
2021-03-13 15:18:51 +02:00
Aliaksandr Valialkin
2096c6e464 app/vmselect/prometheus: typo fix after 7c002023d7 2021-03-12 12:19:36 +02:00
Aliaksandr Valialkin
7c002023d7 app/vmselect/prometheus: do not include datapoints with timestamps matching t-d when returning results from /api/v1/query?query=m[d]&time=t as Prometheus does 2021-03-12 12:16:50 +02:00
Aliaksandr Valialkin
43552fa8d3 docs/CaseStudies.md: fix incorrect number of active time series for Zhihu 2021-03-12 11:45:38 +02:00
Aliaksandr Valialkin
def014eb75 lib/promscrape/discovery/kubernetes: remove debug lines left after the commit 133b288681 2021-03-12 11:22:33 +02:00
Aliaksandr Valialkin
ca4d5ce037 lib/proxy: there is no need in cloning tlsCfg, which has been created two lines above 2021-03-12 10:47:02 +02:00
Aliaksandr Valialkin
895d5d1355 lib/proxy: set proxy address in tls.Config.ServerName instead of the target address
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1116
2021-03-12 10:41:33 +02:00
Aliaksandr Valialkin
a6a71ef861 lib/promscrape: add ability to configure proxy options via proxy_tls_config, proxy_basic_auth, proxy_bearer_token and proxy_bearer_token_file
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1116
2021-03-12 03:36:19 +02:00
Aliaksandr Valialkin
fa448806a5 deployment/docker: update Go builder from 1.16.1 to 1.16.2
See https://github.com/golang/go/issues?q=milestone%3AGo1.16.2+label%3ACherryPickApproved
2021-03-12 01:53:48 +02:00
Aliaksandr Valialkin
f3d712724c docs/Articles.md: add https://www.sensedia.com/post/monitoring-with-prometheus-alertmanager 2021-03-12 01:22:51 +02:00
Aliaksandr Valialkin
f669531506 lib/storage: further tune filters sorting logic 2021-03-12 00:53:04 +02:00
Aliaksandr Valialkin
133b288681 lib/promscrape/discovery/kubernetes: use a single watcher per apiURL
Previously multiple scrape jobs could create multiple watchers for the same apiURL. Now only a single watcher is used.
This should reduce load on Kubernetes API server when many scrape job configs use Kubernetes service discovery.
2021-03-11 16:43:04 +02:00
Aliaksandr Valialkin
dc0cb54d41 deployment/docker: update Go builder from 1.16.0 to 1.16.1
See https://github.com/golang/go/issues?q=milestone%3AGo1.16.1+label%3ACherryPickApproved
2021-03-11 16:43:03 +02:00
Aliaksandr Valialkin
c0ac740f93 lib/proxy: do not show inline basic auth passwords when logging errors related to proxy_url 2021-03-11 13:43:36 +02:00
Aliaksandr Valialkin
bebcb8130c lib/promscrape/discovery/kubernetes: localize Bookmark parsing code
This is a follow-up for e772d1c920
2021-03-11 13:08:08 +02:00
Aliaksandr Valialkin
971e3d83f7 docs/ExttendedPromQL.md: remove outdated doc 2021-03-11 12:41:20 +02:00
Brensted
238fe7d4e8 Update BestPractices.md (#1123)
update lists, hyperlinks fixed.
2021-03-11 11:39:14 +02:00
Aliaksandr Valialkin
e772d1c920 lib/promscrape/discovery/kubernetes: reduce load on Kubernetes API server by using watch bookmarks
This allows continuing object watch from the last bookbark instead of reloading all the objects
on watch errors or timeouts.

See https://kubernetes.io/docs/reference/using-api/api-concepts/#watch-bookmarks
2021-03-10 15:06:35 +02:00
Aliaksandr Valialkin
bc5c4add89 lib/httpserver: export vm_available_memory_bytes and vm_available_cpu_cores metrics
These metrics are useful for tracking the available memory and CPU cores for VictoriaMetrics apps.
2021-03-10 12:02:42 +02:00
Brensted
e02265cfa7 Add files via upload 2021-03-10 10:16:59 +02:00
Brensted
314f28fb38 Add files via upload 2021-03-10 10:14:34 +02:00
Brensted
45ede6ba98 Add files via upload 2021-03-10 10:08:38 +02:00
Brensted
09dc49e942 Add files via upload 2021-03-10 10:06:49 +02:00
Brensted
96dbe9bcbd Add files via upload 2021-03-10 10:05:18 +02:00
Ihor Borodin
d212f35ae8 Fixing examples of external.alert.source in documentation (#1120)
* Fixing examples of external.alert.source in documentation
2021-03-09 20:49:50 +00:00
Aliaksandr Valialkin
48fb0d1c4b vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.13 to v1.0.14 2021-03-09 21:34:07 +02:00
Aliaksandr Valialkin
2728b25783 docs/CHANGELOG.md: mention about the bugfix from 787242d7b0 2021-03-09 20:56:06 +02:00
Aliaksandr Valialkin
787242d7b0 lib/proxy: pass proxy hostname in Host header of the CONNECT request
This should resolve the following issue when connecting to tls proxy:

  cannot validate certificate for ... because it doesn't contain any IP SANs
2021-03-09 20:39:40 +02:00
Aliaksandr Valialkin
36fd007247 lib/proxy: set missing ServerName in TLS config for proxy_url.
While at it, allow setting Proxy-Authorization for `proxy_url` via `basic_auth` and `bearer_token` configs.
2021-03-09 18:58:18 +02:00
Nikolay
ad34f42467 Changes tlsConfig init for proxy connections (#1121)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1116
2021-03-09 18:51:00 +02:00
Aliaksandr Valialkin
3fd8653b40 lib/promscrape: apply sample_limit after metric relabeling is applied as Prometheus does
See the description for `sample_limit` option from Prometheus docs:

Per-scrape limit on number of scraped samples that will be accepted.
If more than this number of samples are present after metric relabeling
the entire scrape will be treated as failed. 0 means no limit.

https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
2021-03-09 15:47:18 +02:00
Aliaksandr Valialkin
4b18a4f026 lib/promscrape/discovery/kubernetes: remove too verbose logs about starting and stopping the watchers
Log the number of objects loaded per each watch url

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1113
2021-03-09 15:05:14 +02:00
Aliaksandr Valialkin
e44532b760 docs/CHANGELOG.md: mention about improved query performance at 18fe0ff14b 2021-03-09 13:12:25 +02:00
Aliaksandr Valialkin
fe8b12fbad app/vmselect/promql: follow up for 433fff0006 2021-03-09 12:48:44 +02:00
Nikolay
433fff0006 duplicate timeseries fix for prometheus_buckets function (#1119)
* try fix for prometheus_buckets

* merge possible end of the bucket collision
2021-03-09 12:26:23 +02:00
Aliaksandr Valialkin
534944b671 vendor: make vendor-update 2021-03-09 12:01:55 +02:00
Aliaksandr Valialkin
f6878eac36 vendor: update github.com/VictoriaMetrics/fasthttp from 1.0.12 to 1.0.13
This should fix a bug in vmagent with high CPU usage during failed scrapes with small `scrape_timeout`.
2021-03-09 11:45:02 +02:00
John Belmonte
364fdf4a56 spelling fix: adjacent (#1115) 2021-03-09 09:18:19 +02:00
Aliaksandr Valialkin
14a399dd06 lib/promscrape: add scrape_offset option to scrape_config
This option can be used for specifying the particular offset per each scrape interval for target scraping
2021-03-08 12:03:33 +02:00
Aliaksandr Valialkin
345980f78f lib/storage: go fmt 2021-03-08 12:03:31 +02:00
Aliaksandr Valialkin
18fe0ff14b lib/storage: tune loopsCount estimations in getMetricIDsForTagFilterSlow
The adjusted estmations give up to 2x lower median response times on 200qps /api/v1/query_range workload
2021-03-07 21:12:35 +02:00
Aliaksandr Valialkin
ab4f090c63 lib/promscrape/discovery/kubernetes: reduce memory usage further when big number of scrape jobs are configured for the same kubernetes_sd_config role
Serialize reloading per-role objects, so they don't occupy too much memory when objects for many scrape jobs are simultaneously refreshed.
Do not reload per-role objects if they were already refreshed by concurrent goroutines. This should reduce load on Kubernetes API server
when big number of scrape jobs are configured for the same Kubernetes role.

This is a follow-up for 17b87725ed

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1113
2021-03-07 19:51:03 +02:00
Aliaksandr Valialkin
1187ee5e16 lib/decimal: prevent exponent overflow when processing values close to zero
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1114
2021-03-05 18:52:47 +02:00
Aliaksandr Valialkin
47ac2051bb app/vmauth: allow using regexps in url_map paths
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1112
2021-03-05 18:21:36 +02:00
Aliaksandr Valialkin
17b87725ed lib/promscrape/discovery/kubernetes: reduce memory usage when Kubernetes service discovery is configured on a big number of scrape jobs
Previously vmagent was creating a separate Kubernetes object cache per each scrape job.
This could result in increased memory usage when monitoring a Kubernetes cluster with big number of objects (pods / nodes / services, etc.)
as seen at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1113

Now it uses a shared map of scrape objects across multiple scrape jobs.
2021-03-05 17:29:55 +02:00
Aliaksandr Valialkin
c9a25e931b lib/promscrape/discovery/kubernetes: move apiWatcher code to a separate file 2021-03-05 12:36:05 +02:00
Aliaksandr Valialkin
536d59914a deployment/docker: update base Docker image from alpine:3.13.1 to alpine:3.13.2
See https://www.alpinelinux.org/posts/Alpine-3.13.2-released.html
2021-03-05 10:35:10 +02:00
Aliaksandr Valialkin
68b8c48c86 docs/Articles.md: add https://dalefro.medium.com/prometheus-victoria-metrics-on-aws-ecs-62448e266090 2021-03-05 09:06:44 +02:00
Aliaksandr Valialkin
444fca89f8 lib/promscrape: make cluster membership calculations consistent across 32-bit and 64-bit architectures 2021-03-05 09:06:17 +02:00
Aliaksandr Valialkin
a14053ffa0 app/vmselect/promql: add histogram_avg(), histogram_stddev() and histogram_stdvar() functions to MetricsQL 2021-03-04 14:12:07 +02:00
Aliaksandr Valialkin
423cd981fb lib/promscrape: add -promscrape.cluster.replicationFactor command-line flag for replicating scrape targets among vmagent instances in the cluster 2021-03-04 10:20:15 +02:00
Aliaksandr Valialkin
d962cdbc13 docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics needs free resources for handling workload spikes 2021-03-04 09:57:53 +02:00
Aliaksandr Valialkin
9a48c1b53d lib/promscrape/discovery/kubernetes: fix tests after e154f4a644 2021-03-03 22:41:30 +02:00
Aliaksandr Valialkin
201b685b13 all: bump minimum supported Go version from 1.13 to 1.14 2021-03-03 15:57:13 +02:00
Aliaksandr Valialkin
90d6e94e5b vendor: update github.com/VictoriaMetrics/fastcache from v1.5.7 to v1.5.8 2021-03-03 15:52:51 +02:00
Aliaksandr Valialkin
f391e5a3a0 docs/vmagent.md: remove outdated suggestion for determining labels that lead to duplicate targets
The original labels for duplicate targets is already printed in the error message starting from 71ea4935de
2021-03-03 12:26:11 +02:00
Aliaksandr Valialkin
3a68b94487 docs/CHANGELOG.md: cut v1.55.1 release 2021-03-03 11:49:10 +02:00
Aliaksandr Valialkin
467cb9e3d1 docs/vmagent.md: make docs-sync after the commit 621bf03745 2021-03-03 10:52:40 +02:00
Roman Khavronenko
621bf03745 Vmagent docs upd (#1104)
* vmagent: port changes from https://github.com/VictoriaMetrics/VictoriaMetrics.github.io/pull/1

Thanks to @dereksfoster99 for this patch!

* vmagent: reword to make the meaning clear
2021-03-03 10:51:51 +02:00
Aliaksandr Valialkin
4c3ef78c05 docs/CHANGELOG.md: mention recent bugfixes from commits 7906316741 and e154f4a644 2021-03-03 10:50:31 +02:00
Aliaksandr Valialkin
e09a245b2b app/vmalert/README.md: sync with docs/vmalert.md 2021-03-03 10:43:59 +02:00
Nikolay
e154f4a644 Fix ingress discovery api (#1110) 2021-03-03 10:43:39 +02:00
Aliaksandr Valialkin
7906316741 lib/promscrape/discovery/kubernetes: properly check for nil pointer inside interface
See https://mangatmodi.medium.com/go-check-nil-interface-the-right-way-d142776edef1

This fixes a panic when the ScrapeWork is filtered out in swcFunc.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1108
2021-03-03 10:33:17 +02:00
Alexandros Orfanos
f4f8f21875 vmalert: docs update - status endpoint needs group ID, not group name (#1106) 2021-03-03 07:46:10 +00:00
970 changed files with 71702 additions and 28102 deletions

View File

@@ -4,14 +4,14 @@ about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
It would be great [upgrading](https://victoriametrics.github.io/#how-to-upgrade) to [the latest avaialble release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
It would be a great [upgrading](https://docs.victoriametrics.com/#how-to-upgrade)
to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
and verifying whether the bug is reproducible there.
It is also recommended reading [troubleshooting docs](https://victoriametrics.github.io/#troubleshooting).
It is also recommended reading [troubleshooting docs](https://docs.victoriametrics.com/#troubleshooting).
**To Reproduce**
Steps to reproduce the behavior.
@@ -19,9 +19,22 @@ Steps to reproduce the behavior.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Logs**
Check if any warnings or errors were logged by VictoriaMetrics components
or components in communication with VictoriaMetrics (e.g. Prometheus, Grafana).
**Screenshots**
If applicable, add screenshots to help explain your problem.
For VictoriaMetrics health-state issues please provide full-length screenshots
of Grafana dashboards if possible:
* [Grafana dashboard for single-node VictoriaMetrics](https://grafana.com/dashboards/10229)
* [Grafana dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11176)
See how to setup monitoring here:
* [monitoring for single-node VictoriaMetrics](https://docs.victoriametrics.com/#monitoring)
* [montioring for VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#monitoring)
**Version**
The line returned when passing `--version` command line flag to binary. For example:
```
@@ -30,15 +43,5 @@ victoria-metrics-20190730-121249-heads-single-node-0-g671d9e55
```
**Used command-line flags**
Command-line flags are listed as `flag{name="httpListenAddr", value=":443"} 1` lines at `/metrics` page.
See the following docs for details:
Please provide applied command-line flags used for running VictoriaMetrics and its components.
* [monitoring for single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#monitoring)
* [montioring for VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#monitoring)
**Additional context**
Add any other context about the problem here such as error logs from VictoriaMetrics and Prometheus,
`/metrics` output, screenshots from the official Grafana dashboards for VictoriaMetrics:
* [Grafana dashboard for single-node VictoriaMetrics](https://grafana.com/dashboards/10229)
* [Grafana dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11176)

6
.github/dependabot.yml vendored Normal file
View File

@@ -0,0 +1,6 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"

23
.github/workflows/check-licenses.yml vendored Normal file
View File

@@ -0,0 +1,23 @@
name: license-check
on:
push:
paths:
- 'vendor'
pull_request:
paths:
- 'vendor'
jobs:
build:
name: Build
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@main
with:
go-version: 1.16
id: go
- name: Code checkout
uses: actions/checkout@master
- name: Check License
run: |
make check-licenses

View File

@@ -1,30 +0,0 @@
name: github-pages
on:
push:
paths:
- 'docs/*'
- 'README.md'
branches:
- master
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@master
- name: publish
shell: bash
env:
TOKEN: ${{secrets.CI_TOKEN}}
run: |
git clone https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.github.io.git gpages
cp docs/* gpages
cp README.md gpages
cd gpages
git config --local user.email "info@victoriametrics.com"
git config --local user.name "Vika"
git add .
git commit -m "update github pages"
remote_repo="https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.github.io.git"
git push "${remote_repo}"
cd ..
rm -rf gpages

View File

@@ -60,8 +60,7 @@ jobs:
GOOS=darwin go build -mod=vendor ./app/vmctl
CGO_ENABLED=0 GOOS=windows go build -mod=vendor ./app/vmagent
- name: Publish coverage
uses: codecov/codecov-action@v1.0.6
uses: codecov/codecov-action@v1.5.2
with:
token: ${{secrets.CODECOV_TOKEN}}
file: ./coverage.txt

View File

@@ -16,7 +16,7 @@ jobs:
TOKEN: ${{secrets.CI_TOKEN}}
run: |
git clone https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.wiki.git wiki
cp docs/* wiki
cp -r docs/* wiki
cd wiki
git config --local user.email "info@victoriametrics.com"
git config --local user.name "Vika"

1
.gitignore vendored
View File

@@ -15,3 +15,4 @@
/package/temp-rpm-*
/package/*.deb
/package/*.rpm
.DS_store

5
.wwhrd.yml Normal file
View File

@@ -0,0 +1,5 @@
allowlist:
- Apache-2.0
- MIT
- BSD-3-Clause
- BSD-2-Clause

View File

@@ -7,7 +7,7 @@ contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
appearance, race, religion or sexual identity and orientation.
## Our Standards
@@ -24,9 +24,9 @@ Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Trolling, insulting/derogatory comments and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
* Publishing others' private information, such as physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
@@ -38,26 +38,26 @@ behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
reject comments, commits, code, wiki edits, issues and other contributions
that are not aligned to this Code of Conduct or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
threatening, offensive or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
address, posting via an official social media account or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
Instances of abusive, harassing or otherwise unacceptable behavior may be
reported by contacting the project team at info@victoriametrics.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
is deemed necessary and appropriate for the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

102
Makefile
View File

@@ -1,5 +1,6 @@
PKG_PREFIX := github.com/VictoriaMetrics/VictoriaMetrics
DATEINFO_TAG ?= $(shell date -u +'%Y%m%d-%H%M%S')
BUILDINFO_TAG ?= $(shell echo $$(git describe --long --all | tr '/' '-')$$( \
git diff-index --quiet HEAD -- || echo '-dirty-'$$(git diff-index -u HEAD | openssl sha1 | cut -c 10-17)))
@@ -8,7 +9,7 @@ ifeq ($(PKG_TAG),)
PKG_TAG := $(BUILDINFO_TAG)
endif
GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(shell date -u +'%Y%m%d-%H%M%S')-$(BUILDINFO_TAG)'
GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(DATEINFO_TAG)-$(BUILDINFO_TAG)'
.PHONY: $(MAKECMDGOALS)
@@ -53,6 +54,14 @@ vmutils: \
vmrestore \
vmctl
vmutils-pure: \
vmagent-pure \
vmalert-pure \
vmauth-pure \
vmbackup-pure \
vmrestore-pure \
vmctl-pure
vmutils-arm64: \
vmagent-arm64 \
vmalert-arm64 \
@@ -61,6 +70,20 @@ vmutils-arm64: \
vmrestore-arm64 \
vmctl-arm64
vmutils-arm: \
vmagent-arm \
vmalert-arm \
vmauth-arm \
vmbackup-arm \
vmrestore-arm \
vmctl-arm
vmutils-windows-amd64: \
vmagent-windows-amd64 \
vmalert-windows-amd64 \
vmauth-windows-amd64 \
vmctl-windows-amd64
release-snap:
snapcraft
snapcraft upload "victoriametrics_$(PKG_TAG)_multi.snap" --release beta,edge,candidate
@@ -71,11 +94,15 @@ release: \
release-victoria-metrics: \
release-victoria-metrics-amd64 \
release-victoria-metrics-arm \
release-victoria-metrics-arm64
release-victoria-metrics-amd64:
GOARCH=amd64 $(MAKE) release-victoria-metrics-generic
release-victoria-metrics-arm:
GOARCH=arm $(MAKE) release-victoria-metrics-generic
release-victoria-metrics-arm64:
GOARCH=arm64 $(MAKE) release-victoria-metrics-generic
@@ -85,11 +112,13 @@ release-victoria-metrics-generic: victoria-metrics-$(GOARCH)-prod
victoria-metrics-$(GOARCH)-prod \
&& sha256sum victoria-metrics-$(GOARCH)-$(PKG_TAG).tar.gz \
victoria-metrics-$(GOARCH)-prod \
| sed s/-$(GOARCH)// > victoria-metrics-$(GOARCH)-$(PKG_TAG)_checksums.txt
| sed s/-$(GOARCH)-prod/-prod/ > victoria-metrics-$(GOARCH)-$(PKG_TAG)_checksums.txt
release-vmutils: \
release-vmutils-amd64 \
release-vmutils-arm64
release-vmutils-arm64 \
release-vmutils-arm \
release-vmutils-windows-amd64
release-vmutils-amd64:
GOARCH=amd64 $(MAKE) release-vmutils-generic
@@ -97,6 +126,12 @@ release-vmutils-amd64:
release-vmutils-arm64:
GOARCH=arm64 $(MAKE) release-vmutils-generic
release-vmutils-arm:
GOARCH=arm $(MAKE) release-vmutils-generic
release-vmutils-windows-amd64:
GOARCH=amd64 $(MAKE) release-vmutils-windows-generic
release-vmutils-generic: \
vmagent-$(GOARCH)-prod \
vmalert-$(GOARCH)-prod \
@@ -119,7 +154,25 @@ release-vmutils-generic: \
vmbackup-$(GOARCH)-prod \
vmrestore-$(GOARCH)-prod \
vmctl-$(GOARCH)-prod \
| sed s/-$(GOARCH)// > vmutils-$(GOARCH)-$(PKG_TAG)_checksums.txt
| sed s/-$(GOARCH)-prod/-prod/ > vmutils-$(GOARCH)-$(PKG_TAG)_checksums.txt
release-vmutils-windows-generic: \
vmagent-windows-$(GOARCH)-prod \
vmalert-windows-$(GOARCH)-prod \
vmauth-windows-$(GOARCH)-prod \
vmctl-windows-$(GOARCH)-prod
cd bin && \
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
vmauth-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
vmauth-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \
> vmutils-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
pprof-cpu:
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
@@ -137,7 +190,7 @@ lint: install-golint
golint app/...
install-golint:
which golint || go install golang.org/x/lint/golint
which golint || GO111MODULE=off go get golang.org/x/lint/golint
errcheck: install-errcheck
errcheck -exclude=errcheck_excludes.txt ./lib/...
@@ -152,7 +205,7 @@ errcheck: install-errcheck
errcheck -exclude=errcheck_excludes.txt ./app/vmctl/...
install-errcheck:
which errcheck || go install github.com/kisielk/errcheck
which errcheck || GO111MODULE=off go get github.com/kisielk/errcheck
check-all: fmt vet lint errcheck golangci-lint
@@ -194,24 +247,43 @@ app-local-pure:
app-local-with-goarch:
GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-$(GOARCH)$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
app-local-windows-with-goarch:
CGO_ENABLED=0 GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-windows-$(GOARCH)$(RACE).exe $(PKG_PREFIX)/app/$(APP_NAME)
quicktemplate-gen: install-qtc
qtc
install-qtc:
which qtc || go install github.com/valyala/quicktemplate/qtc
which qtc || GO111MODULE=off go get github.com/valyala/quicktemplate/qtc
golangci-lint: install-golangci-lint
golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m
install-golangci-lint:
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.29.0
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.40.1
install-wwhrd:
which wwhrd || GO111MODULE=off go get github.com/frapposelli/wwhrd
check-licenses: install-wwhrd
wwhrd check -f .wwhrd.yml
copy-docs:
echo "---\nsort: ${ORDER}\n---\n" > ${DST}
cat ${SRC} >> ${DST}
# Copies docs for all components and adds the order tag.
# Cluster docs are supposed to be ordered as 9th.
# For The rest of docs is ordered manually.t
docs-sync:
cp app/vmagent/README.md docs/vmagent.md
cp app/vmalert/README.md docs/vmalert.md
cp app/vmauth/README.md docs/vmauth.md
cp app/vmbackup/README.md docs/vmbackup.md
cp app/vmrestore/README.md docs/vmrestore.md
cp app/vmctl/README.md docs/vmctl.md
cp README.md docs/Single-server-VictoriaMetrics.md
cp README.md docs/README.md
SRC=README.md DST=docs/Single-server-VictoriaMetrics.md ORDER=1 $(MAKE) copy-docs
SRC=app/vmagent/README.md DST=docs/vmagent.md ORDER=3 $(MAKE) copy-docs
SRC=app/vmalert/README.md DST=docs/vmalert.md ORDER=4 $(MAKE) copy-docs
SRC=app/vmauth/README.md DST=docs/vmauth.md ORDER=5 $(MAKE) copy-docs
SRC=app/vmbackup/README.md DST=docs/vmbackup.md ORDER=6 $(MAKE) copy-docs
SRC=app/vmrestore/README.md DST=docs/vmrestore.md ORDER=7 $(MAKE) copy-docs
SRC=app/vmctl/README.md DST=docs/vmctl.md ORDER=8 $(MAKE) copy-docs
SRC=app/vmgateway/README.md DST=docs/vmgateway.md ORDER=9 $(MAKE) copy-docs
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md ORDER=10 $(MAKE) copy-docs

552
README.md
View File

@@ -1,14 +1,14 @@
# VictoriaMetrics
[![Latest Release](https://img.shields.io/github/release/VictoriaMetrics/VictoriaMetrics.svg?style=flat-square)](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
[![Docker Pulls](https://img.shields.io/docker/pulls/victoriametrics/victoria-metrics.svg?maxAge=604800)](https://hub.docker.com/r/victoriametrics/victoria-metrics)
[![Slack](https://img.shields.io/badge/join%20slack-%23victoriametrics-brightgreen.svg)](http://slack.victoriametrics.com/)
[![Slack](https://img.shields.io/badge/join%20slack-%23victoriametrics-brightgreen.svg)](https://slack.victoriametrics.com/)
[![GitHub license](https://img.shields.io/github/license/VictoriaMetrics/VictoriaMetrics.svg)](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/LICENSE)
[![Go Report](https://goreportcard.com/badge/github.com/VictoriaMetrics/VictoriaMetrics)](https://goreportcard.com/report/github.com/VictoriaMetrics/VictoriaMetrics)
[![Build Status](https://github.com/VictoriaMetrics/VictoriaMetrics/workflows/main/badge.svg)](https://github.com/VictoriaMetrics/VictoriaMetrics/actions)
[![codecov](https://codecov.io/gh/VictoriaMetrics/VictoriaMetrics/branch/master/graph/badge.svg)](https://codecov.io/gh/VictoriaMetrics/VictoriaMetrics)
![Victoria Metrics logo](logo.png "Victoria Metrics")
## VictoriaMetrics
<img src="logo.png" width="300" alt="VictoriaMetrics logo">
VictoriaMetrics is a fast, cost-effective and scalable monitoring solution and time series database.
@@ -28,30 +28,35 @@ See [features available for enterprise customers](https://victoriametrics.com/en
## Case studies and talks
Alphabetically sorted links to case studies:
Case studies:
* [adidas](https://victoriametrics.github.io/CaseStudies.html#adidas)
* [Adsterra](https://victoriametrics.github.io/CaseStudies.html#adsterra)
* [ARNES](https://victoriametrics.github.io/CaseStudies.html#arnes)
* [Brandwatch](https://victoriametrics.github.io/CaseStudies.html#brandwatch)
* [CERN](https://victoriametrics.github.io/CaseStudies.html#cern)
* [COLOPL](https://victoriametrics.github.io/CaseStudies.html#colopl)
* [Dreamteam](https://victoriametrics.github.io/CaseStudies.html#dreamteam)
* [Idealo.de](https://victoriametrics.github.io/CaseStudies.html#idealode)
* [MHI Vestas Offshore Wind](https://victoriametrics.github.io/CaseStudies.html#mhi-vestas-offshore-wind)
* [Synthesio](https://victoriametrics.github.io/CaseStudies.html#synthesio)
* [Wedos.com](https://victoriametrics.github.io/CaseStudies.html#wedoscom)
* [Wix.com](https://victoriametrics.github.io/CaseStudies.html#wixcom)
* [Zerodha](https://victoriametrics.github.io/CaseStudies.html#zerodha)
* [zhihu](https://victoriametrics.github.io/CaseStudies.html#zhihu)
* [adidas](https://docs.victoriametrics.com/CaseStudies.html#adidas)
* [Adsterra](https://docs.victoriametrics.com/CaseStudies.html#adsterra)
* [ARNES](https://docs.victoriametrics.com/CaseStudies.html#arnes)
* [Brandwatch](https://docs.victoriametrics.com/CaseStudies.html#brandwatch)
* [CERN](https://docs.victoriametrics.com/CaseStudies.html#cern)
* [COLOPL](https://docs.victoriametrics.com/CaseStudies.html#colopl)
* [Dreamteam](https://docs.victoriametrics.com/CaseStudies.html#dreamteam)
* [German Research Center for Artificial Intelligence](https://docs.victoriametrics.com/CaseStudies.html#german-research-center-for-artificial-intelligence)
* [Groove X](https://docs.victoriametrics.com/CaseStudies.html#groove-x)
* [Idealo.de](https://docs.victoriametrics.com/CaseStudies.html#idealode)
* [MHI Vestas Offshore Wind](https://docs.victoriametrics.com/CaseStudies.html#mhi-vestas-offshore-wind)
* [Sensedia](https://docs.victoriametrics.com/CaseStudies.html#sensedia)
* [Synthesio](https://docs.victoriametrics.com/CaseStudies.html#synthesio)
* [Wedos.com](https://docs.victoriametrics.com/CaseStudies.html#wedoscom)
* [Wix.com](https://docs.victoriametrics.com/CaseStudies.html#wixcom)
* [Zerodha](https://docs.victoriametrics.com/CaseStudies.html#zerodha)
* [zhihu](https://docs.victoriametrics.com/CaseStudies.html#zhihu)
See also [articles and slides about VictoriaMetrics from our users](https://docs.victoriametrics.com/Articles.html#third-party-articles-and-slides-about-victoriametrics)
## Prominent features
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://victoriametrics.github.io/vmagent.html).
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://docs.victoriametrics.com/vmagent.html).
See [these docs](#prometheus-setup) for details.
* VictoriaMetrics supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
* VictoriaMetrics implements [MetricsQL](https://victoriametrics.github.io/MetricsQL.html) query language backwards compatible with PromQL.
* VictoriaMetrics implements [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) query language backwards compatible with PromQL.
* VictoriaMetrics provides global query view. Multiple Prometheus instances or any other data sources may ingest data into VictoriaMetrics.
Later this data may be queried via a single query.
* High performance and good scalability for both [inserts](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b)
@@ -76,7 +81,7 @@ Alphabetically sorted links to case studies:
* All the configuration is done via explicit command-line flags with reasonable defaults.
* All the data is stored in a single directory pointed by `-storageDataPath` command-line flag.
* Easy and fast backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282)
to S3 or GCS with [vmbackup](https://victoriametrics.github.io/vmbackup.html) / [vmrestore](https://victoriametrics.github.io/vmrestore.html).
to S3 or GCS with [vmbackup](https://docs.victoriametrics.com/vmbackup.html) / [vmrestore](https://docs.victoriametrics.com/vmrestore.html).
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
* Storage is protected from corruption on unclean shutdown (i.e. OOM, hardware reset or `kill -9`) thanks to [the storage architecture](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
* Supports metrics' scraping, ingestion and [backfilling](#backfilling) via the following protocols:
@@ -93,9 +98,10 @@ Alphabetically sorted links to case studies:
* [Prometheus exposition format](#how-to-import-data-in-prometheus-exposition-format).
* [Arbitrary CSV data](#how-to-import-csv-data).
* Supports metrics' relabeling. See [these docs](#relabeling) for details.
* Ideally works with big amounts of time series data from Kubernetes, IoT sensors, connected cars, industrial telemetry, financial data and various Enterprise workloads.
* Can deal with high cardinality and high churn rate issues using [series limiter](#cardinality-limiter).
* Ideally works with big amounts of time series data from APM, Kubernetes, IoT sensors, connected cars, industrial telemetry, financial data and various Enterprise workloads.
* Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
* See also technical [Articles about VictoriaMetrics](https://victoriametrics.github.io/Articles.html).
* See also technical [Articles about VictoriaMetrics](https://docs.victoriametrics.com/Articles.html).
## Operation
@@ -117,6 +123,7 @@ Alphabetically sorted links to case studies:
* [Prometheus querying API usage](#prometheus-querying-api-usage)
* [Prometheus querying API enhancements](#prometheus-querying-api-enhancements)
* [Graphite API usage](#graphite-api-usage)
* [Graphite Render API usage](#graphite-render-api-usage)
* [Graphite Metrics API usage](#graphite-metrics-api-usage)
* [Graphite Tags API usage](#graphite-tags-api-usage)
* [How to build from sources](#how-to-build-from-sources)
@@ -153,6 +160,8 @@ Alphabetically sorted links to case studies:
* [Security](#security)
* [Tuning](#tuning)
* [Monitoring](#monitoring)
* [TSDB stats](#tsdb-stats)
* [Cardinality limiter](#cardinality-limiter)
* [Troubleshooting](#troubleshooting)
* [Data migration](#data-migration)
* [Backfilling](#backfilling)
@@ -165,11 +174,12 @@ Alphabetically sorted links to case studies:
* [Contacts](#contacts)
* [Community and contributions](#community-and-contributions)
* [Reporting bugs](#reporting-bugs)
* [Victoria Metrics Logo](#victoria-metrics-logo)
* [VictoriaMetrics Logo](#victoria-metrics-logo)
* [Logo Usage Guidelines](#logo-usage-guidelines)
* [Font used](#font-used)
* [Color Palette](#color-palette)
* [We kindly ask](#we-kindly-ask)
* [List of command-line flags](#list-of-command-line-flags)
## How to start VictoriaMetrics
@@ -182,7 +192,7 @@ The following command-line flags are used the most:
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory. Default path is `victoria-metrics-data` in the current working directory.
* `-retentionPeriod` - retention for stored data. Older data is automatically deleted. Default retention is 1 month. See [these docs](#retention) for more details.
Other flags have good enough default values, so set them only if you really need this. Pass `-help` to see all the available flags with description and default values.
Other flags have good enough default values, so set them only if you really need this. Pass `-help` to see [all the available flags with description and default values](#list-of-command-line-flags).
See how to [ingest data to VictoriaMetrics](#how-to-import-time-series-data), how to [query VictoriaMetrics](#grafana-setup)
and how to [handle alerts](#alerting).
@@ -272,8 +282,8 @@ Read more about tuning remote write for Prometheus [here](https://prometheus.io/
It is recommended upgrading Prometheus to [v2.12.0](https://github.com/prometheus/prometheus/releases) or newer, since previous versions may have issues with `remote_write`.
Take a look also at [vmagent](https://victoriametrics.github.io/vmagent.html)
and [vmalert](https://victoriametrics.github.io/vmalert.html),
Take a look also at [vmagent](https://docs.victoriametrics.com/vmagent.html)
and [vmalert](https://docs.victoriametrics.com/vmalert.html),
which can be used as faster and less resource-hungry alternative to Prometheus.
@@ -288,7 +298,7 @@ http://<victoriametrics-addr>:8428
Substitute `<victoriametrics-addr>` with the hostname or IP address of VictoriaMetrics.
Then build graphs with the created datasource using [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/)
or [MetricsQL](https://victoriametrics.github.io/MetricsQL.html). VictoriaMetrics supports [Prometheus querying API](#prometheus-querying-api-usage),
or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html). VictoriaMetrics supports [Prometheus querying API](#prometheus-querying-api-usage),
which is used by Grafana.
@@ -336,8 +346,11 @@ Currently the following [scrape_config](https://prometheus.io/docs/prometheus/la
* [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config)
* [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config)
* [openstack_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config)
* [docker_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#docker_sd_config)
* [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config)
* [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config)
* [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config)
* [http_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config)
Other `*_sd_config` types will be supported in the future.
@@ -346,7 +359,7 @@ The file pointed by `-promscrape.config` may contain `%{ENV_VAR}` placeholders,
VictoriaMetrics also supports [importing data in Prometheus exposition format](#how-to-import-data-in-prometheus-exposition-format).
See also [vmagent](https://victoriametrics.github.io/vmagent.html), which can be used as drop-in replacement for Prometheus.
See also [vmagent](https://docs.victoriametrics.com/vmagent.html), which can be used as drop-in replacement for Prometheus.
## How to send data from InfluxDB-compatible agents such as [Telegraf](https://www.influxdata.com/time-series-platform/telegraf/)
@@ -413,6 +426,10 @@ while VictoriaMetrics stores them with *milliseconds* precision.
Extra labels may be added to all the written time series by passing `extra_label=name=value` query args.
For example, `/write?extra_label=foo=bar` would add `{foo="bar"}` label to all the ingested metrics.
Some plugins for Telegraf such as [fluentd](https://github.com/fangli/fluent-plugin-influxdb), [Juniper/open-nti](https://github.com/Juniper/open-nti)
or [Juniper/jitmon](https://github.com/Juniper/jtimon) send `SHOW DATABASES` query to `/query` and expect a particular database name in the response.
Comma-separated list of expected databases can be passed to VictoriaMetrics via `-influx.databaseNames` command-line flag.
## How to send data from Graphite-compatible agents such as [StatsD](https://github.com/etsy/statsd)
Enable Graphite receiver in VictoriaMetrics by setting `-graphiteListenAddr` command line flag. For instance,
@@ -450,11 +467,7 @@ The `/api/v1/export` endpoint should return the following response:
Data sent to VictoriaMetrics via `Graphite plaintext protocol` may be read via the following APIs:
* [Graphite API](#graphite-api-usage)
* [Prometheus querying API](#prometheus-querying-api-usage). Graphite metric names may special chars such as `-`, which may clash
with [MetricsQL operations](https://victoriametrics.github.io/MetricsQL.html). Such metrics can be queries via `{__name__="foo-bar.baz"}`.
VictoriaMetrics supports `__graphite__` pseudo-label for selecting time series with Graphite-compatible filters in [MetricsQL](https://victoriametrics.github.io/MetricsQL.html).
For example, `{__graphite__="foo.*.bar"}` is equivalent to `{__name__=~"foo[.][^.]*[.]bar"}`, but it works faster
and it is easier to use when migrating from Graphite to VictoriaMetrics.
* [Prometheus querying API](#prometheus-querying-api-usage). VictoriaMetrics supports `__graphite__` pseudo-label for selecting time series with Graphite-compatible filters in [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html). For example, `{__graphite__="foo.*.bar"}` is equivalent to `{__name__=~"foo[.][^.]*[.]bar"}`, but it works faster and it is easier to use when migrating from Graphite to VictoriaMetrics.
* [go-graphite/carbonapi](https://github.com/go-graphite/carbonapi/blob/main/cmd/carbonapi/carbonapi.example.victoriametrics.yaml)
## How to send data from OpenTSDB-compatible agents
@@ -543,9 +556,7 @@ VictoriaMetrics supports the following handlers from [Prometheus querying API](h
* [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers)
* [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names)
* [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values)
* [/api/v1/status/tsdb](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats). VictoriaMetrics accepts optional `topN=N` and `date=YYYY-MM-DD`
query args for this handler, where `N` is the number of top entries to return in the response and `YYYY-MM-DD` is the date for collecting the stats.
By default top 10 entries are returned and the stats is collected for the current day.
* [/api/v1/status/tsdb](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats). See [these docs](#tsdb-stats) for details.
* [/api/v1/targets](https://prometheus.io/docs/prometheus/latest/querying/api/#targets) - see [these docs](#how-to-scrape-prometheus-exporters-such-as-node-exporter) for more details.
These handlers can be queried from Prometheus-compatible clients such as Grafana or curl.
@@ -557,21 +568,18 @@ All the Prometheus querying API handlers can be prepended with `/prometheus` pre
VictoriaMetrics accepts optional `extra_label=<label_name>=<label_value>` query arg, which can be used for enforcing additional label filters for queries. For example,
`/api/v1/query_range?extra_label=user_id=123&query=<query>` would automatically add `{user_id="123"}` label filter to the given `<query>`. This functionality can be used
for limiting the scope of time series visible to the given tenant. It is expected that the `extra_label` query arg is automatically set by auth proxy sitting
in front of VictoriaMetrics. [Contact us](mailto:sales@victoriametrics.com) if you need assistance with such a proxy.
in front of VictoriaMetrics. See [vmauth](https://docs.victoriametrics.com/vmauth.html) and [vmgateway](https://docs.victoriametrics.com/vmgateway.html) as examples of such proxies.
VictoriaMetrics accepts relative times in `time`, `start` and `end` query args additionally to unix timestamps and [RFC3339](https://www.ietf.org/rfc/rfc3339.txt).
For example, the following query would return data for the last 30 minutes: `/api/v1/query_range?start=-30m&query=...`.
VictoriaMetrics accepts `round_digits` query arg for `/api/v1/query` and `/api/v1/query_range` handlers. It can be used for rounding response values to the given number of digits after the decimal point. For example, `/api/v1/query?query=avg_over_time(temperature[1h])&round_digits=2` would round response values to up to two digits after the decimal point.
By default, VictoriaMetrics returns time series for the last 5 minutes from `/api/v1/series`, while the Prometheus API defaults to all time. Use `start` and `end` to select a different time range.
VictoriaMetrics accepts additional args for `/api/v1/labels` and `/api/v1/label/.../values` handlers.
See [this feature request](https://github.com/prometheus/prometheus/issues/6178) for details:
* Any number [time series selectors](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) via `match[]` query arg.
* Optional `start` and `end` query args for limiting the time range for the selected labels or label values.
Additionally VictoriaMetrics provides the following handlers:
* `/vmui` - Basic Web UI
* `/api/v1/series/count` - returns the total number of time series in the database. Some notes:
* the handler scans all the inverted index, so it can be slow if the database contains tens of millions of time series;
* the handler may count [deleted time series](#how-to-delete-time-series) additionally to normal time series due to internal implementation restrictions;
@@ -597,7 +605,11 @@ VictoriaMetrics supports the following Graphite APIs, which are needed for [Grap
All the Graphite handlers can be pre-pended with `/graphite` prefix. For example, both `/graphite/metrics/find` and `/metrics/find` should work.
VictoriaMetrics supports `__graphite__` pseudo-label for filtering time series with Graphite-compatible filters in [MetricsQL](https://victoriametrics.github.io/MetricsQL.html).
VictoriaMetrics accepts optional `extra_label=<label_name>=<label_value>` query arg for all the Graphite APIs. This arg can be used for limiting the scope of time series
visible to the given tenant. It is expected that the `extra_label` query arg is automatically set by auth proxy sitting in front of VictoriaMetrics.
[Contact us](mailto:sales@victoriametrics.com) if you need assistance with such a proxy.
VictoriaMetrics supports `__graphite__` pseudo-label for filtering time series with Graphite-compatible filters in [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html).
For example, `{__graphite__="foo.*.bar"}` is equivalent to `{__name__=~"foo[.][^.]*[.]bar"}`, but it works faster
and it is easier to use when migrating from Graphite to VictoriaMetrics.
@@ -646,14 +658,14 @@ to your needs or when testing bugfixes.
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make victoria-metrics` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make victoria-metrics` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `victoria-metrics` binary and puts it into the `bin` folder.
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make victoria-metrics-prod` from the root folder of the repository.
2. Run `make victoria-metrics-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `victoria-metrics-prod` binary and puts it into the `bin` folder.
### ARM build
@@ -662,24 +674,22 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make victoria-metrics-arm` or `make victoria-metrics-arm64` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make victoria-metrics-arm` or `make victoria-metrics-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `victoria-metrics-arm` or `victoria-metrics-arm64` binary respectively and puts it into the `bin` folder.
### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make victoria-metrics-arm-prod` or `make victoria-metrics-arm64-prod` from the root folder of the repository.
2. Run `make victoria-metrics-arm-prod` or `make victoria-metrics-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `victoria-metrics-arm-prod` or `victoria-metrics-arm64-prod` binary respectively and puts it into the `bin` folder.
### Pure Go build (CGO_ENABLED=0)
`Pure Go` mode builds only Go code without [cgo](https://golang.org/cmd/cgo/) dependencies.
This is an experimental mode, which may result in a lower compression ratio and slower decompression performance.
Use it with caution!
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make victoria-metrics-pure` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make victoria-metrics-pure` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `victoria-metrics-pure` binary and puts it into the `bin` folder.
### Building docker images
@@ -699,7 +709,7 @@ ROOT_IMAGE=scratch make package-victoria-metrics
## Start with docker-compose
[Docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose.yml)
helps to spin up VictoriaMetrics, [vmagent](https://victoriametrics.github.io/vmagent.html) and Grafana with one command.
helps to spin up VictoriaMetrics, [vmagent](https://docs.victoriametrics.com/vmagent.html) and Grafana with one command.
More details may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#folder-contains-basic-images-and-tools-for-building-and-running-victoria-metrics-in-docker).
@@ -722,7 +732,7 @@ The page will return the following JSON response:
Snapshots are created under `<-storageDataPath>/snapshots` directory, where `<-storageDataPath>`
is the command-line flag value. Snapshots can be archived to backup storage at any time
with [vmbackup](https://victoriametrics.github.io/vmbackup.html).
with [vmbackup](https://docs.victoriametrics.com/vmbackup.html).
The `http://<victoriametrics-addr>:8428/snapshot/list` page contains the list of available snapshots.
@@ -734,7 +744,7 @@ Navigate to `http://<victoriametrics-addr>:8428/snapshot/delete_all` in order to
Steps for restoring from a snapshot:
1. Stop VictoriaMetrics with `kill -INT`.
2. Restore snapshot contents from backup with [vmrestore](https://victoriametrics.github.io/vmrestore.html)
2. Restore snapshot contents from backup with [vmrestore](https://docs.victoriametrics.com/vmrestore.html)
to the directory pointed by `-storageDataPath`.
3. Start VictoriaMetrics.
@@ -814,6 +824,8 @@ Optional `start` and `end` args may be added to the request in order to limit th
unix timestamp in seconds or [RFC3339](https://www.ietf.org/rfc/rfc3339.txt) values.
The exported data can be imported to VictoriaMetrics via [/api/v1/import/native](#how-to-import-data-in-native-format).
The native export format may change in incompatible way between VictoriaMetrics releases, so the data exported from the release X
can fail to be imported into VictoriaMetrics release Y.
### How to export data in JSON line format
@@ -946,6 +958,8 @@ For example, `/api/v1/import?extra_label=foo=bar` would add `"foo":"bar"` label
Note that it could be required to flush response cache after importing historical data. See [these docs](#backfilling) for detail.
VictoriaMetrics parses input JSON lines one-by-one. It loads the whole JSON line in memory, then parses it and then saves the parsed samples into persistent storage. This means that VictoriaMetrics can occupy big amounts of RAM when importing too long JSON lines. The solution is to split too long JSON lines into smaller lines. It is OK if samples for a single time series are split among multiple JSON lines.
### How to import CSV data
@@ -1061,7 +1075,7 @@ VictoriaMetrics provides the following extra actions for relabeling rules:
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
See also [relabeling in vmagent](https://victoriametrics.github.io/vmagent.html#relabeling).
See also [relabeling in vmagent](https://docs.victoriametrics.com/vmagent.html#relabeling).
## Federation
@@ -1075,51 +1089,34 @@ on the interval `[now - max_lookback ... now]` is scraped for each time series.
For instance, `/federate?match[]=up&max_lookback=1h` would return last points on the `[now - 1h ... now]` interval. This may be useful for time series federation
with scrape intervals exceeding `5m`.
## Capacity planning
A rough estimation of the required resources for ingestion path:
VictoriaMetrics uses lower amounts of CPU, RAM and storage space on production workloads compared to competing solutions (Prometheus, Thanos, Cortex, TimescaleDB, InfluxDB, QuestDB, M3DB) according to [our case studies](https://docs.victoriametrics.com/CaseStudies.html).
* RAM size: less than 1KB per active time series. So, ~1GB of RAM is required for 1M active time series.
Time series is considered active if new data points have been added to it recently or if it has been recently queried.
The number of active time series may be obtained from `vm_cache_entries{type="storage/hour_metric_ids"}` metric
exported on the `/metrics` page.
VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with `-memory.allowedPercent` or `-memory.allowedBytes` flags.
VictoriaMetrics capacity scales linearly with the available resources. The needed amounts of CPU and RAM highly depends on the workload - the number of active time series, series churn rate, query types, query qps, etc. It is recommended setting up a test VictoriaMetrics for your production workload and iteratively scaling CPU and RAM resources until it becomes stable according to [troubleshooting docs](#troubleshooting). A single-node VictoriaMetrics works perfectly with the following production workload according to [our case studies](https://docs.victoriametrics.com/CaseStudies.html):
* CPU cores: a CPU core per 300K inserted data points per second. So, ~4 CPU cores are required for processing
the insert stream of 1M data points per second. The ingestion rate may be lower for high cardinality data or for time series with high number of labels.
See [this article](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) for details.
If you see lower numbers per CPU core, then it is likely active time series info doesn't fit caches,
so you need more RAM for lowering CPU usage.
* Ingestion rate: 1.5+ million samples per second
* Active time series: 50+ million
* Total time series: 5+ billion
* Time series churn rate: 150+ million of new series per day
* Total number of samples: 10+ trillion
* Queries: 200+ qps
* Query latency (99th percentile): 1 second
* Storage space: less than a byte per data point on average. So, ~260GB is required for storing a month-long insert stream
of 100K data points per second.
The actual storage size heavily depends on data randomness (entropy). Higher randomness means higher storage size requirements.
Read [this article](https://medium.com/faun/victoriametrics-achieving-better-compression-for-time-series-data-than-gorilla-317bc1f95932)
for details.
The needed storage space for the given retention (the retention is set via `-retentionPeriod` command-line flag) can be extrapolated from disk space usage in a test run. For example, if `-storageDataPath` directory size becomes 10GB after a day-long test run on a production workload, then it will need at least `10GB*100=1TB` of disk space for `-retentionPeriod=100d` (100-days retention period).
* Network usage: outbound traffic is negligible. Ingress traffic is ~100 bytes per ingested data point via
[Prometheus remote_write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write).
The actual ingress bandwidth usage depends on the average number of labels per ingested metric and the average size
of label values. The higher number of per-metric labels and longer label values mean the higher ingress bandwidth.
It is recommended leaving the following amounts of spare resources:
The required resources for query path:
* RAM size: depends on the number of time series to scan in each query and the `step`
argument passed to [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries).
The higher number of scanned time series and lower `step` argument results in the higher RAM usage.
* CPU cores: a CPU core per 30 millions of scanned data points per second.
This means that heavy queries that touch big number of time series (over 10K) and/or big number data points (over 100M)
usually require more CPU resources than tiny queries that touch a few time series with small number of data points.
* Network usage: depends on the frequency and the type of incoming requests. Typical Grafana dashboards usually
require negligible network bandwidth.
* 50% of free RAM for reducing the probability of OOM (out of memory) crashes and slowdowns during temporary spikes in workload.
* 50% of spare CPU for reducing the probability of slowdowns during temporary spikes in workload.
* At least 30% of free storage space at the directory pointed by `-storageDataPath` command-line flag.
## High availability
* Install multiple VictoriaMetrics instances in distinct datacenters (availability zones).
* Pass addresses of these instances to [vmagent](https://victoriametrics.github.io/vmagent.html) via `-remoteWrite.url` command-line flag:
* Pass addresses of these instances to [vmagent](https://docs.victoriametrics.com/vmagent.html) via `-remoteWrite.url` command-line flag:
```bash
/path/to/vmagent -remoteWrite.url=http://<victoriametrics-addr-1>:8428/api/v1/write -remoteWrite.url=http://<victoriametrics-addr-2>:8428/api/v1/write
@@ -1144,7 +1141,7 @@ remote_write:
kill -HUP `pidof prometheus`
```
It is recommended to use [vmagent](https://victoriametrics.github.io/vmagent.html) instead of Prometheus for highly loaded setups.
It is recommended to use [vmagent](https://docs.victoriametrics.com/vmagent.html) instead of Prometheus for highly loaded setups.
* Now Prometheus should write data into all the configured `remote_write` urls in parallel.
* Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
@@ -1163,10 +1160,10 @@ VictoriaMetrics de-duplicates data points if `-dedup.minScrapeInterval` command-
is set to positive duration. For example, `-dedup.minScrapeInterval=60s` would de-duplicate data points
on the same time series if they fall within the same discrete 60s bucket. The earliest data point will be kept. In the case of equal timestamps, an arbitrary data point will be kept.
The recommended value for `-dedup.minScrapeInterval` must equal to `scrape_interval` config from Prometheus configs.
The recommended value for `-dedup.minScrapeInterval` must equal to `scrape_interval` config from Prometheus configs. It is recommended to have a single `scrape_interval` across all the scrape targets. See [this article](https://www.robustperception.io/keep-it-simple-scrape_interval-id) for details.
The de-duplication reduces disk space usage if multiple identically configured Prometheus instances in HA pair
write data to the same VictoriaMetrics instance. Note that these Prometheus instances must have identical
The de-duplication reduces disk space usage if multiple identically configured [vmagent](https://docs.victoriametrics.com/vmagent.html) or Prometheus instances in HA pair
write data to the same VictoriaMetrics instance. These vmagent or Prometheus instances must have identical
`external_labels` section in their configs, so they write data to the same time series.
@@ -1193,9 +1190,9 @@ Just start multiple VictoriaMetrics instances with distinct values for the follo
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
Then set up [vmauth](https://victoriametrics.github.io/vmauth.html) in front of VictoriaMetrics instances,
Then set up [vmauth](https://docs.victoriametrics.com/vmauth.html) in front of VictoriaMetrics instances,
so it could route requests from particular user to VictoriaMetrics with the desired retention.
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html).
The same scheme could be implemented for multiple tenants in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html).
## Downsampling
@@ -1233,7 +1230,7 @@ horizontally scalable long-term remote storage for really large Prometheus deplo
## Alerting
It is recommended using [vmalert](https://victoriametrics.github.io/vmalert.html) for alerting.
It is recommended using [vmalert](https://docs.victoriametrics.com/vmalert.html) for alerting.
Additionally, alerting can be set up with the following tools:
@@ -1258,7 +1255,7 @@ Consider setting the following command-line flags:
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`.
Prefer authorizing all the incoming requests from untrusted networks with [vmauth](https://victoriametrics.github.io/vmauth.html)
Prefer authorizing all the incoming requests from untrusted networks with [vmauth](https://docs.victoriametrics.com/vmauth.html)
or similar auth proxy.
@@ -1281,7 +1278,7 @@ mkfs.ext4 ... -O 64bit,huge_file,extent -T huge
## Monitoring
VictoriaMetrics exports internal metrics in Prometheus format at `/metrics` page.
These metrics may be collected by [vmagent](https://victoriametrics.github.io/vmagent.html)
These metrics may be collected by [vmagent](https://docs.victoriametrics.com/vmagent.html)
or Prometheus by adding the corresponding scrape config to it.
Alternatively they can be self-scraped by setting `-selfScrapeInterval` command-line flag to duration greater than 0.
For example, `-selfScrapeInterval=10s` would enable self-scraping of `/metrics` page with 10 seconds interval.
@@ -1289,6 +1286,8 @@ For example, `-selfScrapeInterval=10s` would enable self-scraping of `/metrics`
There are officials Grafana dashboards for [single-node VictoriaMetrics](https://grafana.com/dashboards/10229) and [clustered VictoriaMetrics](https://grafana.com/grafana/dashboards/11176).
There is also an [alternative dashboard for clustered VictoriaMetrics](https://grafana.com/grafana/dashboards/11831).
It is recommended setting up alerts in [vmalert](https://docs.victoriametrics.com/vmalert.html) or in Prometheus from [this config](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml).
The most interesting metrics are:
* `vm_cache_entries{type="storage/hour_metric_ids"}` - the number of time series with new data points during the last hour
@@ -1309,15 +1308,53 @@ VictoriaMetrics also exposes currently running queries with their execution time
See the example of alerting rules for VM components [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml).
## TSDB stats
VictoriaMetrics returns TSDB stats at `/api/v1/status/tsdb` page in the way similar to Prometheus - see [these Prometheus docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats). VictoriaMetrics accepts the following optional query args at `/api/v1/status/tsdb` page:
* `topN=N` where `N` is the number of top entries to return in the response. By default top 10 entries are returned.
* `date=YYYY-MM-DD` where `YYYY-MM-DD` is the date for collecting the stats. By default the stats is collected for the current day.
* `match[]=SELECTOR` where `SELECTOR` is an arbitrary [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) for series to take into account during stats calculation. By default all the series are taken into account.
* `extra_label=LABEL=VALUE`. See [these docs](#prometheus-querying-api-enhancements) for more details.
## Cardinality limiter
By default VictoriaMetrics doesn't limit the number of stored time series. The limit can be enforced by setting the following command-line flags:
* `-storage.maxHourlySeries` - limits the number of time series that can be added during the last hour. Useful for limiting the number of active time series.
* `-storage.maxDailySeries` - limits the number of time series that can be added during the last day. Useful for limiting daily churn rate.
Both limits can be set simultaneously. If any of these limits is reached, then incoming samples for new time series are dropped. A sample of dropped series is put in the log with `WARNING` level.
The exceeded limits can be [monitored](#monitoring) with the following metrics:
* `vm_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series.
* `vm_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series.
These limits are approximate, so VictoriaMetrics can underflow/overflow the limit by a small percentage (usually less than 1%).
## Troubleshooting
* It is recommended to use default command-line flag values (i.e. don't set them explicitly) until the need
of tweaking these flag values arises.
* It is recommended inspecting logs during troubleshooting, since they may contain useful information.
* It is recommended upgrading to the latest available release from [this page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
since the encountered issue could be already fixed there.
* It is recommended inspecting logs during troubleshooting, since they may contain useful information.
* It is recommended to have at least 50% of spare resources for CPU, disk IO and RAM, so VictoriaMetrics could handle short spikes in the workload without performance issues.
* VictoriaMetrics requires free disk space for [merging data files to bigger ones](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
It may slow down when there is no enough free space left. So make sure `-storageDataPath` directory
has at least 20% of free space. The remaining amount of free space
can be [monitored](#monitoring) via `vm_free_disk_space_bytes` metric. The total size of data
stored on the disk can be monitored via sum of `vm_data_size_bytes` metrics.
See also `vm_merge_need_free_disk_space` metrics, which are set to values higher than 0
if background merge cannot be initiated due to free disk space shortage. The value shows the number of per-month partitions,
which would start background merge if they had more free disk space.
* VictoriaMetrics buffers incoming data in memory for up to a few seconds before flushing it to persistent storage.
This may lead to the following "issues":
@@ -1328,22 +1365,16 @@ See the example of alerting rules for VM components [here](https://github.com/Vi
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
then it is likely you have too many active time series for the current amount of RAM.
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics such as `vm_slow_row_inserts_total` and `vm_slow_metric_name_loads_total`, which could be used
as an indicator of low amounts of RAM. It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
ingestion and query performance in this case.
* If the order of labels for the same metrics can change over time (e.g. if `metric{k1="v1",k2="v2"}` may become `metric{k2="v2",k1="v1"}`),
then it is recommended running VictoriaMetrics with `-sortLabels` command-line flag in order to reduce memory usage and CPU usage.
* VictoriaMetrics prioritizes data ingestion over data querying. So if it has no enough resources for data ingestion,
then data querying may slow down significantly.
* VictoriaMetrics requires free disk space for [merging data files to bigger ones](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
It may slow down when there is no enough free space left. So make sure `-storageDataPath` directory
has at least 20% of free space comparing to disk size. The remaining amount of free space
can be [monitored](#monitoring) via `vm_free_disk_space_bytes` metric. The total size of data
stored on the disk can be monitored via sum of `vm_data_size_bytes` metrics.
See also `vm_merge_need_free_disk_space` metrics, which are set to values higher than 0
if background merge cannot be initiated due to free disk space shortage. The value shows the number of per-month partitions,
which would start background merge if they had more free disk space.
* If VictoriaMetrics doesn't work because of certain parts are corrupted due to disk errors,
then just remove directories with broken parts. It is safe removing subdirectories under `<-storageDataPath>/data/{big,small}/YYYY_MM` directories
when VictoriaMetrics isn't running. This recovers VictoriaMetrics at the cost of data loss stored in the deleted broken parts.
@@ -1360,10 +1391,9 @@ See the example of alerting rules for VM components [here](https://github.com/Vi
It may be needed in order to suppress default gap filling algorithm used by VictoriaMetrics - by default it assumes
each time series is continuous instead of discrete, so it fills gaps between real samples with regular intervals.
* Metrics and labels leading to high cardinality or high churn rate can be determined at `/api/v1/status/tsdb` page.
See [these docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats) for details.
VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` args on this page. By default `date` equals to the current date,
while `topN` equals to 10.
* Metrics and labels leading to high cardinality or high churn rate can be determined at `/api/v1/status/tsdb` page. See [these docs](#tsdb-stats) for details.
* New time series can be logged if `-logNewSeries` command-line flag is passed to VictoriaMetrics.
* VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.
This prevents from ingesting metrics with too many labels. It is recommended [monitoring](#monitoring) `vm_metrics_with_dropped_labels_total`
@@ -1375,15 +1405,20 @@ See the example of alerting rules for VM components [here](https://github.com/Vi
* VictoriaMetrics ignores `NaN` values during data ingestion.
## Cache removal
VictoriaMetrics uses various internal caches. These caches are stored to `<-storageDataPath>/cache` directory during graceful shutdown (e.g. when VictoriaMetrics is stopped by sending `SIGINT` signal). The caches are read on the next VictoriaMetrics startup. Sometimes it is needed to remove such caches on the next startup. This can be performed by placing `reset_cache_on_startup` file inside the `<-storageDataPath>/cache` directory before the restart of VictoriaMetrics. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1447) for details.
## Data migration
Use [vmctl](https://victoriametrics.github.io/vmctl.html) for data migration. It supports the following data migration types:
Use [vmctl](https://docs.victoriametrics.com/vmctl.html) for data migration. It supports the following data migration types:
* From Prometheus to VictoriaMetrics
* From InfluxDB to VictoriaMetrics
* From VictoriaMetrics to VictoriaMetrics
See [vmctl docs](https://victoriametrics.github.io/vmctl.html) for more details.
See [vmctl docs](https://docs.victoriametrics.com/vmctl.html) for more details.
## Backfilling
@@ -1414,7 +1449,7 @@ should be used only for one-off updates. It shouldn't be used for frequent updat
## Replication
Single-node VictoriaMetrics doesn't support application-level replication. Use cluster version instead.
See [these docs](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#replication-and-data-safety) for details.
See [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#replication-and-data-safety) for details.
Storage-level replication may be offloaded to durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs).
@@ -1423,8 +1458,8 @@ See also [high availability docs](#high-availability) and [backup docs](#backups
## Backups
VictoriaMetrics supports backups via [vmbackup](https://victoriametrics.github.io/vmbackup.html)
and [vmrestore](https://victoriametrics.github.io/vmrestore.html) tools.
VictoriaMetrics supports backups via [vmbackup](https://docs.victoriametrics.com/vmbackup.html)
and [vmrestore](https://docs.victoriametrics.com/vmrestore.html) tools.
We also provide `vmbackupmanager` tool for paid enterprise subscribers - see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for details.
@@ -1460,7 +1495,7 @@ The collected profiles may be analyzed with [go tool pprof](https://github.com/g
* [Ansible role for installing single-node VictoriaMetrics](https://github.com/dreamteam-gg/ansible-victoriametrics-role).
* [Ansible role for installing cluster VictoriaMetrics](https://github.com/Slapper/ansible-victoriametrics-cluster-role).
* [Snap package for VictoriaMetrics](https://snapcraft.io/victoriametrics).
* [vmalert-cli](https://github.com/aorfanos/vmalert-cli) - a CLI application for managing [vmalert](https://victoriametrics.github.io/vmalert.html).
* [vmalert-cli](https://github.com/aorfanos/vmalert-cli) - a CLI application for managing [vmalert](https://docs.victoriametrics.com/vmalert.html).
## Third-party contributions
@@ -1480,7 +1515,7 @@ Contact us with any questions regarding VictoriaMetrics at [info@victoriametrics
Feel free asking any questions regarding VictoriaMetrics:
* [slack](http://slack.victoriametrics.com/)
* [slack](https://slack.victoriametrics.com/)
* [reddit](https://www.reddit.com/r/VictoriaMetrics/)
* [telegram-en](https://t.me/VictoriaMetrics_en)
* [telegram-ru](https://t.me/VictoriaMetrics_ru1)
@@ -1508,7 +1543,7 @@ Adhering `KISS` principle simplifies the resulting code and architecture, so it
Report bugs and propose new features [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues).
## Victoria Metrics Logo
## VictoriaMetrics Logo
[Zip](VM_logo.zip) contains three folders with different image orientations (main color and inverted version).
@@ -1536,3 +1571,268 @@ Files included in each folder:
* There should be sufficient clear space around the logo.
* Do not change spacing, alignment, or relative locations of the design elements.
* Do not change the proportions of any of the design elements or the design itself. You may resize as needed but must retain all proportions.
## List of command-line flags
Pass `-help` to VictoriaMetrics in order to see the list of supported command-line flags with their description:
```
-bigMergeConcurrency int
The maximum number of CPU cores to use for big merges. Default value is used if set to 0
-csvTrimTimestamp duration
Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
-dedup.minScrapeInterval duration
Leave only the first sample in every time series per each discrete interval equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/#deduplication for details
-deleteAuthKey string
authKey for metrics' deletion via /api/v1/admin/tsdb/delete_series and /tags/delSeries
-denyQueriesOutsideRetention
Whether to deny queries outside of the configured -retentionPeriod. When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. This may be useful when multiple data sources with distinct retentions are hidden behind query-tee
-dryRun
Whether to check only -promscrape.config and then exit. Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-finalMergeDelay duration
The delay before starting final merge for per-month partition after no new data is ingested into it. Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. Zero value disables final merge
-forceFlushAuthKey string
authKey, which must be passed in query string to /internal/force_flush pages
-forceMergeAuthKey string
authKey, which must be passed in query string to /internal/force_merge pages
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-graphiteListenAddr string
TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty
-graphiteTrimTimestamp duration
Trim timestamps for Graphite data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
TCP address to listen for http connections (default ":8428")
-import.maxLineLen size
The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with 'max_rows_per_line' query arg passed to /api/v1/export
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 104857600)
-influx.databaseNames array
Comma-separated list of database names to return from /query and /influx/query API. This can be needed for accepting data from Telegraf plugins such as https://github.com/fangli/fluent-plugin-influxdb
Supports an array of values separated by comma or specified via multiple flags.
-influx.maxLineSize size
The maximum size in bytes for a single Influx line during parsing
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 262144)
-influxListenAddr string
TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. This flag isn't needed when ingesting data over HTTP - just send it to http://<victoriametrics>:8428/write
-influxMeasurementFieldSeparator string
Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol (default "_")
-influxSkipMeasurement
Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'
-influxSkipSingleField
Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if Influx line contains only a single field
-influxTrimTimestamp duration
Trim timestamps for Influx line protocol data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
-insert.maxQueueDuration duration
The maximum duration for waiting in the queue for insert requests due to -maxConcurrentInserts (default 1m0s)
-logNewSeries
Whether to log new series. This option is for debug purposes only. It can lead to performance issues when big number of new series are ingested into VictoriaMetrics
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-loggerTimezone string
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-maxConcurrentInserts int
The maximum number of concurrent inserts. Default value should work for most cases, since it minimizes the overhead for concurrent inserts. This option is tigthly coupled with -insert.maxQueueDuration (default 16)
-maxInsertRequestSize size
The maximum size in bytes of a single Prometheus remote_write API request
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
-maxLabelsPerTimeseries int
The maximum number of labels accepted per time series. Superfluous labels are dropped (default 30)
-memory.allowedBytes size
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-opentsdbHTTPListenAddr string
TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty
-opentsdbListenAddr string
TCP and UDP address to listen for OpentTSDB metrics. Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. Usually :4242 must be set. Doesn't work if empty
-opentsdbTrimTimestamp duration
Trim timestamps for OpenTSDB 'telnet put' data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
-opentsdbhttp.maxInsertRequestSize size
The maximum size of OpenTSDB HTTP put request
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
-opentsdbhttpTrimTimestamp duration
Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
-pprofAuthKey string
Auth key for /debug/pprof. It overrides httpAuth settings
-precisionBits int
The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss (default 64)
-promscrape.cluster.memberNum int
The number of number in the cluster of scrapers. It must be an unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster
-promscrape.cluster.membersCount int
The number of members in a cluster of scrapers. Each member must have an unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets
-promscrape.cluster.replicationFactor int
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 2, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
-promscrape.config string
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
-promscrape.config.dryRun
Checks -promscrape.config file for errors and unsupported fields and then exits. Returns non-zero exit code on parsing errors and emits these errors to stderr. See also -promscrape.config.strictParse command-line flag. Pass -loggerLevel=ERROR if you don't need to see info messages in the output.
-promscrape.config.strictParse
Whether to allow only supported fields in -promscrape.config . By default unsupported fields are silently skipped
-promscrape.configCheckInterval duration
Interval for checking for changes in '-promscrape.config' file. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
-promscrape.consul.waitTime duration
Wait time used by Consul service discovery. Default value is used if not set
-promscrape.consulSDCheckInterval duration
Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s)
-promscrape.digitaloceanSDCheckInterval duration
Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s)
-promscrape.disableCompression
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
-promscrape.disableKeepAlive
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
-promscrape.discovery.concurrency int
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
-promscrape.discovery.concurrentWaitTime duration
The maximum duration for waiting to perform API requests if more than -promscrape.discovery.concurrency requests are simultaneously performed (default 1m0s)
-promscrape.dnsSDCheckInterval duration
Interval for checking for changes in dns. This works only if dns_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config for details (default 30s)
-promscrape.dockerSDCheckInterval duration
Interval for checking for changes in docker. This works only if docker_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#docker_sd_config for details (default 30s)
-promscrape.dockerswarmSDCheckInterval duration
Interval for checking for changes in dockerswarm. This works only if dockerswarm_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config for details (default 30s)
-promscrape.dropOriginalLabels
Whether to drop original labels for scrape targets at /targets and /api/v1/targets pages. This may be needed for reducing memory usage when original labels for big number of scrape targets occupy big amounts of memory. Note that this reduces debuggability for improper per-target relabeling configs
-promscrape.ec2SDCheckInterval duration
Interval for checking for changes in ec2. This works only if ec2_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config for details (default 1m0s)
-promscrape.eurekaSDCheckInterval duration
Interval for checking for changes in eureka. This works only if eureka_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config for details (default 30s)
-promscrape.fileSDCheckInterval duration
Interval for checking for changes in 'file_sd_config'. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config for details (default 30s)
-promscrape.gceSDCheckInterval duration
Interval for checking for changes in gce. This works only if gce_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config for details (default 1m0s)
-promscrape.httpSDCheckInterval duration
Interval for checking for changes in http endpoint service discovery. This works only if http_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config for details (default 1m0s)
-promscrape.kubernetes.apiServerTimeout duration
How frequently to reload the full state from Kuberntes API server (default 30m0s)
-promscrape.kubernetesSDCheckInterval duration
Interval for checking for changes in Kubernetes API server. This works only if kubernetes_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config for details (default 30s)
-promscrape.maxDroppedTargets int
The maximum number of droppedTargets to show at /api/v1/targets page. Increase this value if your setup drops more scrape targets during relabeling and you need investigating labels for all the dropped targets. Note that the increased number of tracked dropped targets may result in increased memory usage (default 1000)
-promscrape.maxScrapeSize size
The maximum size of scrape response in bytes to process from Prometheus targets. Bigger responses are rejected
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16777216)
-promscrape.openstackSDCheckInterval duration
Interval for checking for changes in openstack API server. This works only if openstack_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config for details (default 30s)
-promscrape.streamParse
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is posible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
-promscrape.suppressDuplicateScrapeTargetErrors
Whether to suppress 'duplicate scrape target' errors; see https://docs.victoriametrics.com/vmagent.html#troubleshooting for details
-promscrape.suppressScrapeErrors
Whether to suppress scrape errors logging. The last error for each target is always available at '/targets' page even if scrape errors logging is suppressed
-relabelConfig string
Optional path to a file with relabeling rules, which are applied to all the ingested metrics. See https://docs.victoriametrics.com/#relabeling for details
-relabelDebug
Whether to log metrics before and after relabeling with -relabelConfig. If the -relabelDebug is enabled, then the metrics aren't sent to storage. This is useful for debugging the relabeling configs
-retentionPeriod value
Data with timestamps outside the retentionPeriod is automatically deleted
The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 1)
-search.cacheTimestampOffset duration
The maximum duration since the current time for response data, which is always queried from the original raw data, without using the response cache. Increase this value if you see gaps in responses due to time synchronization issues between VictoriaMetrics and data sources (default 5m0s)
-search.disableCache
Whether to disable response caching. This may be useful during data backfilling
-search.latencyOffset duration
The time when data points become visible in query results after the collection. Too small value can result in incomplete last points for query results (default 30s)
-search.logSlowQueryDuration duration
Log queries with execution time exceeding this value. Zero disables slow query logging (default 5s)
-search.maxConcurrentRequests int
The maximum number of concurrent search requests. It shouldn't be high, since a single request can saturate all the CPU cores. See also -search.maxQueueDuration (default 8)
-search.maxExportDuration duration
The maximum duration for /api/v1/export call (default 720h0m0s)
-search.maxLookback duration
Synonym to -search.lookback-delta from Prometheus. The value is dynamically detected from interval between time series datapoints if not set. It can be overridden on per-query basis via max_lookback arg. See also '-search.maxStalenessInterval' flag, which has the same meaining due to historical reasons
-search.maxPointsPerTimeseries int
The maximum points per a single timeseries returned from /api/v1/query_range. This option doesn't limit the number of scanned raw samples in the database. The main purpose of this option is to limit the number of per-series points returned to graphing UI such as Grafana. There is no sense in setting this limit to values bigger than the horizontal resolution of the graph (default 30000)
-search.maxQueryDuration duration
The maximum duration for query execution (default 30s)
-search.maxQueryLen size
The maximum search query length in bytes
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
-search.maxQueueDuration duration
The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
-search.maxStalenessInterval duration
The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.maxLookback' flag, which has the same meaning due to historical reasons
-search.maxStatusRequestDuration duration
The maximum duration for /api/v1/status/* requests (default 5m0s)
-search.maxStepForPointsAdjustment duration
The maximum step when /api/v1/query_range handler adjusts points with timestamps closer than -search.latencyOffset to the current time. The adjustment is needed because such points may contain incomplete data (default 1m0s)
-search.maxTagKeys int
The maximum number of tag keys returned from /api/v1/labels (default 100000)
-search.maxTagValueSuffixesPerSearch int
The maximum number of tag value suffixes returned from /metrics/find (default 100000)
-search.maxTagValues int
The maximum number of tag values returned from /api/v1/label/<label_name>/values (default 100000)
-search.maxUniqueTimeseries int
The maximum number of unique time series each search can scan (default 300000)
-search.minStalenessInterval duration
The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval'
-search.queryStats.lastQueriesCount int
Query stats for /api/v1/status/top_queries is tracked on this number of last queries. Zero value disables query stats tracking (default 20000)
-search.queryStats.minQueryDuration duration
The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats
-search.resetCacheAuthKey string
Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call
-search.treatDotsAsIsInRegexps
Whether to treat dots as is in regexp label filters used in queries. For example, foo{bar=~"a.b.c"} will be automatically converted to foo{bar=~"a\\.b\\.c"}, i.e. all the dots in regexp filters will be automatically escaped in order to match only dot char instead of matching any char. Dots in ".+", ".*" and ".{n}" regexps aren't escaped. This option is DEPRECATED in favor of {__graphite__="a.*.c"} syntax for selecting metrics matching the given Graphite metrics filter
-selfScrapeInstance string
Value for 'instance' label, which is added to self-scraped metrics (default "self")
-selfScrapeInterval duration
Interval for self-scraping own metrics at /metrics page
-selfScrapeJob string
Value for 'job' label, which is added to self-scraped metrics (default "victoria-metrics")
-smallMergeConcurrency int
The maximum number of CPU cores to use for small merges. Default value is used if set to 0
-snapshotAuthKey string
authKey, which must be passed in query string to /snapshot* pages
-sortLabels
Whether to sort labels for incoming samples before writing them to storage. This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit
-storage.maxDailySeries int
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries
-storage.maxHourlySeries int
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries
-storageDataPath string
Path to storage data (default "victoria-metrics-data")
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
-version
Show VictoriaMetrics version
```

Binary file not shown.

View File

@@ -3,10 +3,8 @@ package main
import (
"flag"
"fmt"
"io"
"net/http"
"os"
"path"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
@@ -26,9 +24,8 @@ import (
var (
httpListenAddr = flag.String("httpListenAddr", ":8428", "TCP address to listen for http connections")
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Remove superflouos samples from time series if they are located closer to each other than this duration. "+
"This may be useful for reducing overhead when multiple identically configured Prometheus instances write data to the same VictoriaMetrics. "+
"Deduplication is disabled if the -dedup.minScrapeInterval is 0")
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Leave only the first sample in every time series per each discrete interval "+
"equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/#deduplication for details")
dryRun = flag.Bool("dryRun", false, "Whether to check only -promscrape.config and then exit. "+
"Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse")
)
@@ -86,14 +83,20 @@ func main() {
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
if r.URL.Path == "/" {
fmt.Fprintf(w, "<h2>Single-node VictoriaMetrics.</h2></br>")
fmt.Fprintf(w, "See docs at <a href='https://victoriametrics.github.io/'>https://victoriametrics.github.io/</a></br>")
fmt.Fprintf(w, "Useful endpoints: </br>")
writeAPIHelp(w, [][]string{
if r.Method != "GET" {
return false
}
fmt.Fprintf(w, "<h2>Single-node VictoriaMetrics</h2></br>")
fmt.Fprintf(w, "See docs at <a href='https://docs.victoriametrics.com/'>https://docs.victoriametrics.com/</a></br>")
fmt.Fprintf(w, "Useful endpoints:</br>")
httpserver.WriteAPIHelp(w, [][2]string{
{"/vmui", "Web UI"},
{"/targets", "discovered targets list"},
{"/api/v1/targets", "advanced information about discovered targets in JSON format"},
{"/metrics", "available service metrics"},
{"/api/v1/status/tsdb", "tsdb status page"},
{"/api/v1/status/top_queries", "top queries"},
{"/api/v1/status/active_queries", "active queries"},
})
return true
}
@@ -109,20 +112,11 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
return false
}
func writeAPIHelp(w io.Writer, pathList [][]string) {
pathPrefix := httpserver.GetPathPrefix()
for _, p := range pathList {
p, doc := p[0], p[1]
p = path.Join(pathPrefix, p)
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
}
}
func usage() {
const s = `
victoria-metrics is a time series database and monitoring solution.
See the docs at https://victoriametrics.github.io/
See the docs at https://docs.victoriametrics.com/
`
flagutil.Usage(s)
}

View File

@@ -22,7 +22,6 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@@ -149,7 +148,7 @@ func setUp() {
}
func processFlags() {
envflag.Parse()
flag.Parse()
for _, fv := range []struct {
flag string
value string

View File

@@ -78,3 +78,9 @@ vmagent-local-with-goarch:
vmagent-pure:
APP_NAME=vmagent $(MAKE) app-local-pure
vmagent-windows-amd64:
GOARCH=amd64 APP_NAME=vmagent $(MAKE) app-local-windows-with-goarch
vmagent-windows-amd64-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-windows-amd64

View File

@@ -1,8 +1,8 @@
## vmagent
# vmagent
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
`vmagent` is a tiny but mighty agent which helps you collect metrics from various sources
and store them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other Prometheus-compatible storage systems that support the `remote_write` protocol.
<img alt="vmagent" src="vmagent.png">
@@ -10,40 +10,42 @@ or any other Prometheus-compatible storage system that supports the `remote_writ
## Motivation
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
Also, we found that users infrastructure are snowflakes - no two are alike, and we decided to add more flexibility
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
and RAM friendly to scrape metrics from Prometheus-compatible exporters into VictoriaMetrics.
Also, we found that our user's infrastructure are like snowflakes in that no two are alike. Therefore we decided to add more flexibility
to `vmagent` such as the ability to push metrics instead of pulling them. We did our best and will continue to improve vmagent.
## Features
* Can be used as drop-in replacement for Prometheus for scraping targets such as [node_exporter](https://github.com/prometheus/node_exporter).
* Can be used as a drop-in replacement for Prometheus for scraping targets such as [node_exporter](https://github.com/prometheus/node_exporter).
See [Quick Start](#quick-start) for details.
* Can add, remove and modify labels (aka tags) via Prometheus relabeling. Can filter data before sending it to remote storage. See [these docs](#relabeling) for details.
* Accepts data via all the ingestion protocols supported by VictoriaMetrics:
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-opentsdb-compatible-agents).
* Accepts data via all ingestion protocols supported by VictoriaMetrics:
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-opentsdb-compatible-agents).
* Prometheus remote write protocol via `http://<vmagent>:8429/api/v1/write`.
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-json-line-format).
* Native data import protocol via `http://<vmagent>:8429/api/v1/import/native`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-native-format).
* Data in Prometheus exposition format. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-prometheus-exposition-format) for details.
* Arbitrary CSV data via `http://<vmagent>:8429/api/v1/import/csv`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-csv-data).
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-json-line-format).
* Native data import protocol via `http://<vmagent>:8429/api/v1/import/native`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-native-format).
* Data in Prometheus exposition format. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-data-in-prometheus-exposition-format) for details.
* Arbitrary CSV data via `http://<vmagent>:8429/api/v1/import/csv`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-import-csv-data).
* Can replicate collected metrics simultaneously to multiple remote storage systems.
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
* Scrape targets can be spread among multiple `vmagent` instances when big number of targets must be scraped. See [these docs](#scraping-big-number-of-targets) for details.
* Works smoothly in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as the connection
to the remote storage is repaired. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared with Prometheus.
* Scrape targets can be spread among multiple `vmagent` instances when big number of targets must be scraped. See [these docs](#scraping-big-number-of-targets).
* Can efficiently scrape targets that expose millions of time series such as [/federate endpoint in Prometheus](https://prometheus.io/docs/prometheus/latest/federation/). See [these docs](#stream-parsing-mode).
* Can deal with high cardinality and high churn rate issues by limiting the number of unique time series sent to remote storage systems. See [these docs](#cardinality-limiter).
## Quick Start
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
Please download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
and configure the following flags to the `vmagent` binary in order to start scraping Prometheus targets:
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary number of remote storage systems.
* `-promscrape.config` with the path to Prometheus config file (usually located at `/etc/prometheus/prometheus.yml`)
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics, the `-remoteWrite.url` argument can be specified multiple times to replicate data concurrently to an arbitrary number of remote storage systems.
Example command line:
@@ -51,17 +53,17 @@ Example command line:
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
If you only need to collect Influx data, then the following is sufficient:
If you only need to collect Influx data, then the following command is sufficient:
```
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags).
Pass `-help` to `vmagent` in order to see the full list of supported command-line flags with their descriptions.
Pass `-help` to `vmagent` in order to see [the full list of supported command-line flags with their descriptions](#advanced-usage).
## Configuration update
@@ -85,140 +87,150 @@ There is also `-promscrape.configCheckInterval` command-line option, which can b
### IoT and Edge monitoring
`vmagent` can run and collect metrics in IoT and industrial networks with unreliable or scheduled connections to the remote storage.
`vmagent` can run and collect metrics in IoT and industrial networks with unreliable or scheduled connections to their remote storage.
It buffers the collected data in local files until the connection to remote storage becomes available and then sends the buffered
data to the remote storage. It re-tries sending the data to remote storage on any errors.
data to the remote storage. It re-tries sending the data to remote storage until any errors are resolved.
The maximum buffer size can be limited with `-remoteWrite.maxDiskUsagePerURL`.
`vmagent` works on various architectures from IoT world - 32-bit arm, 64-bit arm, ppc64, 386, amd64.
`vmagent` works on various architectures from the IoT world - 32-bit arm, 64-bit arm, ppc64, 386, amd64.
See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/Makefile) for details.
### Drop-in replacement for Prometheus
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
If you use Prometheus only for scraping metrics from various targets and forwarding those metrics to remote storage
then `vmagent` can replace Prometheus. Typically, `vmagent` requires lower amounts of RAM, CPU and network bandwidth compared with Prometheus.
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
### Replication and high availability
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instances.
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instance.
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again and then it sends the buffered data to the remote storage in order to prevent data gaps.
### Relabeling and filtering
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
`vmagent` can add, remove or update labels on the collected data before sending it to the remote storage. Additionally,
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
See [these docs](#relabeling) for details.
Please see [these docs](#relabeling) for details.
### Splitting data streams among multiple systems
`vmagent` supports splitting the collected data between muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
which is applied independently for each configured `-remoteWrite.url` destination. For example, it is possible to replicate or split
data among long-term remote storage, short-term remote storage and a real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
Note that each destination can receive it's own subset of the collected data due to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
### Prometheus remote_write proxy
`vmagent` may be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via `remote_write` API
at `/api/v1/write` endpoint, apply relabeling and filtering and then proxy it to another `remote_write` systems.
`vmagent` can be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via the `remote_write` API
at the`/api/v1/write` endpoint. Then apply relabeling and filtering and proxy it to another `remote_write` system .
The `vmagent` can be configured to encrypt the incoming `remote_write` requests with `-tls*` command-line flags.
Additionally, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
Also, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
### remote_write for clustered version
Despite `vmagent` can accept data in several supported protocols (OpenTSDB, Influx, Prometheus, Graphite) and scrape data from various targets, writes always peformed in Promethes remote_write protocol. Therefore for clustered version `-remoteWrite.url` command-line flag should be configured as `<schema>://<vminsert-host>:8480/insert/<customer-id>/prometheus/api/v1/write`
While `vmagent` can accept data in several supported protocols (OpenTSDB, Influx, Prometheus, Graphite) and scrape data from various targets, writes are always peformed in Promethes remote_write protocol. Therefore for the [clustered version](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html), `-remoteWrite.url` the command-line flag should be configured as `<schema>://<vminsert-host>:8480/insert/<customer-id>/prometheus/api/v1/write`
## How to collect metrics in Prometheus format
Pass the path to `prometheus.yml` to `-promscrape.config` command-line flag. `vmagent` takes into account the following
Specify the path to `prometheus.yml` file via `-promscrape.config` command-line flag. `vmagent` takes into account the following
sections from [Prometheus config file](https://prometheus.io/docs/prometheus/latest/configuration/configuration/):
* `global`
* `scrape_configs`
All the other sections are ignored, including [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section.
Use `-remoteWrite.*` command-line flags instead for configuring remote write settings.
All other sections are ignored, including the [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section.
Use `-remoteWrite.*` command-line flag instead for configuring remote write settings.
The following scrape types in [scrape_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) section are supported:
* `static_configs` - for scraping statically defined targets. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config) for details.
* `file_sd_configs` - for scraping targets defined in external files aka file-based service discover.
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details.
* `static_configs` - is for scraping statically defined targets. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config) for details.
* `file_sd_configs` - is for scraping targets defined in external files (aka file-based service discover).
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details
* `kubernetes_sd_configs` - for scraping targets in Kubernetes (k8s).
See [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config) for details.
* `ec2_sd_configs` - for scraping targets in Amazon EC2.
* `ec2_sd_configs` - is for scraping targets in Amazon EC2.
See [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) for details.
`vmagent` doesn't support `profile` config param and aws credentials file yet.
* `gce_sd_configs` - for scraping targets in Google Compute Engine (GCE).
`vmagent` doesn't support the `profile` config param yet.
* `gce_sd_configs` - is for scraping targets in Google Compute Engine (GCE).
See [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) for details.
`vmagent` provides the following additional functionality for `gce_sd_config`:
* if `project` arg is missing, then `vmagent` uses the project for the instance where it runs;
* if `zone` arg is missing, then `vmagent` uses the zone for the instance where it runs;
* if `zone` arg equals to `"*"`, then `vmagent` discovers all the zones for the given project;
* `zone` may contain arbitrary number of zones, i.e. `zone: [us-east1-a, us-east1-b]`.
* `consul_sd_configs` - for scraping targets registered in Consul.
* if `project` arg is missing then `vmagent` uses the project for the instance where it runs;
* if `zone` arg is missing then `vmagent` uses the zone for the instance where it runs;
* if `zone` arg is equal to `"*"`, then `vmagent` discovers all the zones for the given project;
* `zone` may contain an arbitrary number of zones, i.e. `zone: [us-east1-a, us-east1-b]`.
* `consul_sd_configs` - is for scraping the targets registered in Consul.
See [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config) for details.
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
* `dns_sd_configs` - is for scraping targets discovered from DNS records (SRV, A and AAAA).
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
* `openstack_sd_configs` - for scraping OpenStack targets.
* `openstack_sd_configs` - is for scraping OpenStack targets.
See [openstack_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config) for details.
[OpenStack identity API v3](https://docs.openstack.org/api-ref/identity/v3/) is supported only.
* `dockerswarm_sd_configs` - for scraping Docker Swarm targets.
* `docker_sd_configs` - is for scraping Docker targets.
See [docker_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#docker_sd_config) for details.
* `dockerswarm_sd_configs` - is for scraping Docker Swarm targets.
See [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config) for details.
* `eureka_sd_configs` - for scraping targets registered in [Netflix Eureka](https://github.com/Netflix/eureka).
* `eureka_sd_configs` - is for scraping targets registered in [Netflix Eureka](https://github.com/Netflix/eureka).
See [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config) for details.
* `digitalocean_sd_configs` is for scraping targerts registered in [DigitalOcean](https://www.digitalocean.com/)
See [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config) for details.
* `http_sd_configs` is for scraping targerts registered in http service discovery.
See [http_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config) for details.
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
Please file feature requests to [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
`vmagent` also support the following additional options in `scrape_config` section:
`vmagent` also support the following additional options in `scrape_configs` section:
* `disable_compression: true` - for disabling response compression on a per-job basis. By default `vmagent` requests compressed responses from scrape targets
in order to save network bandwidth.
* `disable_keepalive: true` - for disabling [HTTP keep-alive connections](https://en.wikipedia.org/wiki/HTTP_persistent_connection) on a per-job basis.
By default `vmagent` uses keep-alive connections to scrape targets in order to reduce overhead on connection re-establishing.
* `disable_compression: true` - to disable response compression on a per-job basis. By default `vmagent` requests compressed responses from scrape targets
to save network bandwidth.
* `disable_keepalive: true` - to disable [HTTP keep-alive connections](https://en.wikipedia.org/wiki/HTTP_persistent_connection) on a per-job basis.
By default, `vmagent` uses keep-alive connections to scrape targets to reduce overhead on connection re-establishing.
* `stream_parse: true` - for scraping targets in a streaming manner. This may be useful for targets exporting big number of metrics. See [these docs](#stream-parsing-mode).
Note that `vmagent` doesn't support `refresh_interval` option these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
Note that `vmagent` doesn't support `refresh_interval` option for these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
entries to 60s. Run `vmagent -help` in order to see default values for the `-promscrape.*CheckInterval` flags.
The file pointed by `-promscrape.config` may contain `%{ENV_VAR}` placeholders, which are substituted by the corresponding `ENV_VAR` environment variable values.
The file pointed by `-promscrape.config` may contain `%{ENV_VAR}` placeholders which are substituted by the corresponding `ENV_VAR` environment variable values.
## Adding labels to metrics
Labels can be added to metrics via the following mechanisms:
Labels can be added to metrics by the following mechanisms:
* Via `global -> external_labels` section in `-promscrape.config` file. These labels are added only to metrics scraped from targets configured in `-promscrape.config` file.
* Via `-remoteWrite.label` command-line flag. These labels are added to all the collected metrics before sending them to `-remoteWrite.url`.
* The `global -> external_labels` section in `-promscrape.config` file. These labels are added only to metrics scraped from targets configured in the `-promscrape.config` file. They aren't added to metrics collected via other [data ingestion protocols](https://docs.victoriametrics.com/#how-to-import-time-series-data).
* The `-remoteWrite.label` command-line flag. These labels are added to all the collected metrics before sending them to `-remoteWrite.url`. For example, the following command will start `vmagent`, which will add `{datacenter="foobar"}` label to all the metrics pushed to all the configured remote storage systems (all the `-remoteWrite.url` flag values):
```
/path/to/vmagent -remoteWrite.label=datacenter=foobar ...
```
## Relabeling
`vmagent` supports [Prometheus relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config).
Additionally it provides the following extra actions:
and also provides the following actions:
* `replace_all`: replaces all the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
* `labelmap_all`: replaces all the occurences of `regex` in all the label names with the `replacement`.
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
* `replace_all`: replaces all of the occurences of `regex` in the values of `source_labels` with the `replacement` and stores the results in the `target_label`.
* `labelmap_all`: replaces all of the occurences of `regex` in all the label names with the `replacement`.
* `keep_if_equal`: keeps the entry if all the label values from `source_labels` are equal.
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
The relabeling can be defined in the following places:
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels.
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to all the scraped metrics in the given `scrape_config`.
* At `-remoteWrite.relabelConfig` file. This relabeling is aplied to all the collected metrics before sending them to remote storage.
* At `-remoteWrite.urlRelabelConfig` files. This relabeling is applied to metrics before sending them to the corresponding `-remoteWrite.url`.
* At the `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels. This relabeling can be debugged by passing `relabel_debug: true` option to the corresponding `scrape_config` section. In this case `vmagent` logs target labels before and after the relabeling and then drops the logged target.
* At the `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to all the scraped metrics in the given `scrape_config`. This relabeling can be debugged by passing `metric_relabel_debug: true` option to the corresponding `scrape_config` section. In this case `vmagent` logs metrics before and after the relabeling and then drops the logged metrics.
* At the `-remoteWrite.relabelConfig` file. This relabeling is aplied to all the collected metrics before sending them to remote storage. This relabeling can be debugged by passing `-remoteWrite.relabelDebug` command-line option to `vmagent`. In this case `vmagent` logs metrics before and after the relabeling and then drops all the logged metrics instead of sending them to remote storage.
* At the `-remoteWrite.urlRelabelConfig` files. This relabeling is applied to metrics before sending them to the corresponding `-remoteWrite.url`. This relabeling can be debugged by passing `-remoteWrite.urlRelabelDebug` command-line options to `vmagent`. In this case `vmagent` logs metrics before and after the relabeling and then drops all the logged metrics instead of sending them to the corresponding `-remoteWrite.url`.
Read more about relabeling in the following articles:
You can read more about relabeling in the following articles:
* [How to use Relabeling in Prometheus and VictoriaMetrics](https://valyala.medium.com/how-to-use-relabeling-in-prometheus-and-victoriametrics-8b90fc22c4b2)
* [Life of a label](https://www.robustperception.io/life-of-a-label)
@@ -228,60 +240,9 @@ Read more about relabeling in the following articles:
* [relabel_configs vs metric_relabel_configs](https://www.robustperception.io/relabel_configs-vs-metric_relabel_configs)
## Scraping big number of targets
## Stream parsing mode
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
In this case scrape targets can be split among multiple `vmagent` instances (aka `vmagent` clustering).
Each `vmagent` instance in the cluster must use identical `-promscrape.config` files with distinct `-promscrape.cluster.memberNum` values.
The flag value must be in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster.
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag. For example, the following commands
spread scrape targets among a cluster of two `vmagent` instances:
```
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
```
## Monitoring
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. It is recommended setting up regular scraping of this page
either via `vmagent` itself or via Prometheus, so the exported metrics could be analyzed later.
Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/12683) for `vmagent` state overview.
If you have suggestions, improvements or found a bug - feel free to open an issue on github or add review to the dashboard.
`vmagent` also exports target statuses at the following handlers:
* `http://vmagent-host:8429/targets`. This handler returns human-readable plaintext status for every active target.
This page is convenient to query from command line with `wget`, `curl` or similar tools.
It accepts optional `show_original_labels=1` query arg, which shows the original labels per each target before applying relabeling.
This information may be useful for debugging target relabeling.
* `http://vmagent-host:8429/api/v1/targets`. This handler returns data compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
* `http://vmagent-host:8429/ready`. This handler returns http 200 status code when `vmagent` finishes initialization for all service_discovery configs.
It may be useful for performing `vmagent` rolling update without scrape loss.
## Troubleshooting
* It is recommended [setting up the official Grafana dashboard](#monitoring) in order to monitor `vmagent` state.
* It is recommended increasing the maximum number of open files in the system (`ulimit -n`) when scraping big number of targets,
since `vmagent` establishes at least a single TCP connection per each target.
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`
and `http://vmagent-host:8429/api/v1/targets`.
* The `/api/v1/targets` page could be useful for debugging relabeling process for scrape targets.
This page contains original labels for targets dropped during relabeling (see "droppedTargets" section in the page output). By default up to `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling, then increase `-promscrape.maxDroppedTargets` command-line flag value in order to see all the dropped targets. Note that tracking each dropped target requires up to 10Kb of RAM, so big values for `-promscrape.maxDroppedTargets` may result in increased memory usage if big number of scrape targets are dropped during relabeling.
* If `vmagent` scrapes big number of targets, then `-promscrape.dropOriginalLabels` command-line option may be passed to `vmagent` in order to reduce memory usage.
This option drops `"discoveredLabels"` and `"droppedTargets"` lists at `/api/v1/targets` page, which may result in reduced debuggability for improperly configured per-target relabeling.
* If `vmagent` scrapes targets with millions of metrics per each target (for instance, when scraping [federation endpoints](https://prometheus.io/docs/prometheus/latest/federation/)),
then it is recommended enabling `stream parsing mode` in order to reduce memory usage during scraping. This mode may be enabled either globally for all the scrape targets
by passing `-promscrape.streamParse` command-line flag or on a per-scrape target basis with `stream_parse: true` option. For example:
By default `vmagent` reads the full response from scrape target into memory, then parses it, applies [relabeling](#relabeling) and then pushes the resulting metrics to the configured `-remoteWrite.url`. This mode works good for the majority of cases when the scrape target exposes small number of metrics (e.g. less than 10 thousand). But this mode may take big amounts of memory when the scrape target exposes big number of metrics. In this case it is recommended enabling stream parsing mode. When this mode is enabled, then `vmagent` reads response from scrape target in chunks, then immediately processes every chunk and pushes the processed metrics to remote storage. This allows saving memory when scraping targets that expose millions of metrics. Stream parsing mode may be enabled either globally for all of the scrape targets by passing `-promscrape.streamParse` command-line flag or on a per-scrape target basis with `stream_parse: true` option. For example:
```yml
scrape_configs:
@@ -297,43 +258,176 @@ It may be useful for performing `vmagent` rolling update without scrape loss.
'match[]': ['{__name__!=""}']
```
Note that `sample_limit` option doesn't work if stream parsing is enabled, since the parsed data is pushed to remote storage as soon as it is parsed. So `sample_limit` option
has no sense during stream parsing.
Note that `sample_limit` option doesn't prevent from data push to remote storage if stream parsing is enabled because the parsed data is pushed to remote storage as soon as it is parsed.
* It is recommended to increase `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
* If you see gaps on the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, then try increasing `-remoteWrite.queues`.
Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage, so it starts dropping the buffered data
## Scraping big number of targets
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
In this case scrape targets can be split among multiple `vmagent` instances (aka `vmagent` horizontal scaling, sharding and clustering).
Each `vmagent` instance in the cluster must use identical `-promscrape.config` files with distinct `-promscrape.cluster.memberNum` values.
The flag value must be in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster.
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag. For example, the following commands
spread scrape targets among a cluster of two `vmagent` instances:
```
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
```
By default each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances,
then `-promscrape.cluster.replicationFactor` command-line flag must be set to the desired number of replicas. For example, the following commands
start a cluster of three `vmagent` instances, where each target is scraped by two `vmagent` instances:
```
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=0 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ...
/path/to/vmagent -promscrape.cluster.membersCount=3 -promscrape.cluster.replicationFactor=2 -promscrape.cluster.memberNum=2 -promscrape.config=/path/to/config.yml ...
```
If each target is scraped by multiple `vmagent` instances, then data deduplication must be enabled at remote storage pointed by `-remoteWrite.url`.
See [these docs](https://docs.victoriametrics.com/#deduplication) for details.
## Scraping targets via a proxy
`vmagent` supports scraping targets via http, https and socks5 proxies. Proxy address must be specified in `proxy_url` option. For example, the following scrape config instructs
target scraping via https proxy at `https://proxy-addr:1234`:
```yml
scrape_configs:
- job_name: foo
proxy_url: https://proxy-addr:1234
```
Proxy can be configured with the following optional settings:
* `proxy_authorization` for generic token authorization. See [Prometheus docs for details on authorization section](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config)
* `proxy_bearer_token` and `proxy_bearer_token_file` for Bearer token authorization
* `proxy_basic_auth` for Basic authorization. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config).
* `proxy_tls_config` for TLS config. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tls_config).
For example:
```yml
scrape_configs:
- job_name: foo
proxy_url: https://proxy-addr:1234
proxy_basic_auth:
username: foobar
password: secret
proxy_tls_config:
insecure_skip_verify: true
cert_file: /path/to/cert
key_file: /path/to/key
ca_file: /path/to/ca
server_name: real-server-name
```
## Cardinality limiter
By default `vmagent` doesn't limit the number of time series written to remote storage systems specified at `-remoteWrite.url`. The limit can be enforced by setting the following command-line flags:
* `-remoteWrite.maxHourlySeries` - limits the number of unique time series `vmagent` can write to remote storage systems during the last hour. Useful for limiting the number of active time series.
* `-remoteWrite.maxDailySeries` - limits the number of unique time series `vmagent` can write to remote storage systems during the last day. Useful for limiting daily churn rate.
Both limits can be set simultaneously. If any of these limits is reached, then samples for new time series are dropped instead of sending them to remote storage systems. A sample of dropped series is put in the log with `WARNING` level.
The exceeded limits can be [monitored](#monitoring) with the following metrics:
* `vmagent_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series.
* `vmagent_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series.
These limits are approximate, so `vmagent` can underflow/overflow the limit by a small percentage (usually less than 1%).
## Monitoring
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. We recommend setting up regular scraping of this page
either through `vmagent` itself or by Prometheus so that the exported metrics may be analyzed later.
Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/12683) for `vmagent` state overview.
If you have suggestions for improvements or have found a bug - please open an issue on github or add a review to the dashboard.
`vmagent` also exports the status for various targets at the following handlers:
* `http://vmagent-host:8429/targets`. This handler returns human-readable status for every active target.
This page is easy to query from the command line with `wget`, `curl` or similar tools.
It accepts optional `show_original_labels=1` query arg which shows the original labels per each target before applying the relabeling.
This information may be useful for debugging target relabeling.
* `http://vmagent-host:8429/api/v1/targets`. This handler returns data compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
* `http://vmagent-host:8429/ready`. This handler returns http 200 status code when `vmagent` finishes it's initialization for all service_discovery configs.
It may be useful to perform `vmagent` rolling update without any scrape loss.
## Troubleshooting
* We recommend you [set up the official Grafana dashboard](#monitoring) in order to monitor the state of `vmagent'.
* We recommend you increase the maximum number of open files in the system (`ulimit -n`) when scraping a big number of targets,
as `vmagent` establishes at least a single TCP connection per target.
* If `vmagent` uses too big amounts of memory, then the following options can help:
* Enabling stream parsing. See [these docs](#stream-parsing-mode).
* Reducing the number of output queues with `-remoteWrite.queues` command-line option.
* Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line option. Another option is to reduce memory limits in Docker and/or Kuberntes if `vmagent` runs under these systems.
* Reducing the number of CPU cores vmagent can use by passing `GOMAXPROCS=N` environment variable to `vmagent`, where `N` is the desired limit on CPU cores. Another option is to reduce CPU limits in Docker or Kubernetes if `vmagent` runs under these systems.
* When `vmagent` scrapes many unreliable targets, it can flood the error log with scrape errors. These errors can be suppressed
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`
and `http://vmagent-host:8429/api/v1/targets`.
* The `/api/v1/targets` page could be useful for debugging relabeling process for scrape targets.
This page contains original labels for targets dropped during relabeling (see "droppedTargets" section in the page output). By default the `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling, then increase `-promscrape.maxDroppedTargets` command-line flag value to see all the dropped targets. Note that tracking each dropped target requires up to 10Kb of RAM. Therefore big values for `-promscrape.maxDroppedTargets` may result in increased memory usage if a big number of scrape targets are dropped during relabeling.
* If `vmagent` scrapes a big number of targets then the `-promscrape.dropOriginalLabels` command-line option may be passed to `vmagent` in order to reduce memory usage.
This option drops `"discoveredLabels"` and `"droppedTargets"` lists at `/api/v1/targets` page, which may result in reduced debuggability for improperly configured per-target relabeling.
* If `vmagent` scrapes targets with millions of metrics per target (for example, when scraping [federation endpoints](https://prometheus.io/docs/prometheus/latest/federation/)),
we recommend enabling [stream parsing mode](#stream-parsing-mode) in order to reduce memory usage during scraping.
* We recommend you increase `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page grows constantly.
* If you see gaps in the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, try increasing `-remoteWrite.queues`.
Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage. Therefore it starts dropping the buffered data
if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`.
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
* `vmagent` drops data blocks if remote storage replies with `400 Bad Request` and `409 Conflict` HTTP responses. The number of dropped blocks can be monitored via `vmagent_remotewrite_packets_dropped_total` metric exported at [/metrics page](#monitoring).
* Use `-remoteWrite.queues=1` when `-remoteWrite.url` points to remote storage, which doesn't accept out-of-order samples (aka data backfilling). Such storage systems include Prometheus, Cortex and Thanos.
* `vmagent` buffers scraped data at the `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
If you don't want to send all the data from the directory to remote storage then simply stop `vmagent` and delete the directory.
* By default `vmagent` masks `-remoteWrite.url` with `secret-url` values in logs and at `/metrics` page because
the url may contain sensitive information such as auth tokens or passwords.
Pass `-remoteWrite.showURL` command-line flag when starting `vmagent` in order to see all the valid urls.
* If scrapes must be aligned in time (for instance, if they must be performed at the beginning of every hour), then set `scrape_align_interval` option
in the corresponding scrape config. For example, the following config aligns hourly scrapes to the nearest 10 minutes:
* By default `vmagent` evenly spreads scrape load in time. If a particular scrape target must be scraped at the beginning of some interval,
then `scrape_align_interval` option must be used. For example, the following config aligns hourly scrapes to the beginning of hour:
```yml
scrape_configs:
- job_name: foo
scrape_interval: 1h
scrape_align_interval: 10m
scrape_align_interval: 1h
```
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports
or they use init container. These errors can be either fixed or suppressed with `-promscrape.suppressDuplicateScrapeTargetErrors` command-line flag.
See available options below if you prefer fixing the root cause of the error:
* By default `vmagent` evenly spreads scrape load in time. If a particular scrape target must be scraped at specific offset, then `scrape_offset` option must be used.
For example, the following config instructs `vmagent` to scrape the target at 10 seconds of every minute:
The following `relabel_configs` section may help determining `__meta_*` labels resulting in duplicate targets:
```yml
- action: labelmap
regex: __meta_(.*)
scrape_configs:
- job_name: foo
scrape_interval: 1m
scrape_offset: 10s
```
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen to multiple ports
or they use an init container. These errors can either be fixed or suppressed with the `-promscrape.suppressDuplicateScrapeTargetErrors` command-line flag.
See the available options below if you prefer fixing the root cause of the error:
The following relabeling rule may be added to `relabel_configs` section in order to filter out pods with unneeded ports:
```yml
- action: keep_if_equal
@@ -350,25 +444,25 @@ It may be useful for performing `vmagent` rolling update without scrape loss.
## How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in `vmutils-*` archives there.
We recommend using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in the `vmutils-*` archives .
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmagent` from the root folder of the repository.
It builds `vmagent` binary and puts it into the `bin` folder.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make vmagent` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds the `vmagent` binary and puts it into the `bin` folder.
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmagent-prod` from the root folder of the repository.
2. Run `make vmagent-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmagent-prod` binary and puts it into the `bin` folder.
### Building docker images
Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
`<PKG_TAG>` is an auto-generated image tag, which depends on source code in [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
@@ -384,14 +478,14 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmagent-arm` or `make vmagent-arm64` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make vmagent-arm` or `make vmagent-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics)
It builds `vmagent-arm` or `vmagent-arm64` binary respectively and puts it into the `bin` folder.
### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmagent-arm-prod` or `make vmagent-arm64-prod` from the root folder of the repository.
2. Run `make vmagent-arm-prod` or `make vmagent-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmagent-arm-prod` or `vmagent-arm64-prod` binary respectively and puts it into the `bin` folder.
@@ -399,13 +493,13 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
`vmagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
* Memory profile. It can be collected with the following command:
* Memory profile can be collected with the following command:
```bash
curl -s http://<vmagent-host>:8429/debug/pprof/heap > mem.pprof
```
* CPU profile. It can be collected with the following command:
* CPU profile can be collected with the following command:
```bash
curl -s http://<vmagent-host>:8429/debug/pprof/profile > cpu.pprof
@@ -423,16 +517,16 @@ The collected profiles may be analyzed with [go tool pprof](https://github.com/g
```
./vmagent -help
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
vmagent collects metrics data via popular data ingestion protocols and routes them to VictoriaMetrics.
See the docs at https://victoriametrics.github.io/vmagent.html .
See the docs at https://docs.victoriametrics.com/vmagent.html .
-csvTrimTimestamp duration
Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
-dryRun
Whether to check only config files without running vmagent. The following files are checked: -promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
@@ -444,30 +538,33 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
-graphiteTrimTimestamp duration
Trim timestamps for Graphite data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this dealy the servier returns non-OK responses from /health page, so load balancers can route new requests to other servers
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
TCP address to listen for http connections. Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. Note that /targets and /metrics pages aren't available if -httpListenAddr='' (default ":8429")
-import.maxLineLen max_rows_per_line
The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with max_rows_per_line query arg passed to /api/v1/export
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 104857600)
-influx.maxLineSize value
-import.maxLineLen size
The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with 'max_rows_per_line' query arg passed to /api/v1/export
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 104857600)
-influx.databaseNames array
Comma-separated list of database names to return from /query and /influx/query API. This can be needed for accepting data from Telegraf plugins such as https://github.com/fangli/fluent-plugin-influxdb
Supports an array of values separated by comma or specified via multiple flags.
-influx.maxLineSize size
The maximum size in bytes for a single Influx line during parsing
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 262144)
-influxListenAddr http://<vmagent>:8429/write
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 262144)
-influxListenAddr string
TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. This flag isn't needed when ingesting data over HTTP - just send it to http://<vmagent>:8429/write
-influxMeasurementFieldSeparator string
Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol (default "_")
@@ -482,7 +579,7 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
@@ -492,17 +589,17 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
-loggerTimezone string
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-maxConcurrentInserts int
The maximum number of concurrent inserts. Default value should work for most cases, since it minimizes the overhead for concurrent inserts. This option is tigthly coupled with -insert.maxQueueDuration (default 16)
-maxInsertRequestSize value
-maxInsertRequestSize size
The maximum size in bytes of a single Prometheus remote_write API request
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
-memory.allowedBytes size
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-opentsdbHTTPListenAddr string
@@ -511,9 +608,9 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
TCP and UDP address to listen for OpentTSDB metrics. Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. Usually :4242 must be set. Doesn't work if empty
-opentsdbTrimTimestamp duration
Trim timestamps for OpenTSDB 'telnet put' data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s)
-opentsdbhttp.maxInsertRequestSize value
-opentsdbhttp.maxInsertRequestSize size
The maximum size of OpenTSDB HTTP put request
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432)
-opentsdbhttpTrimTimestamp duration
Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms)
-pprofAuthKey string
@@ -522,85 +619,120 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
The number of number in the cluster of scrapers. It must be an unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster
-promscrape.cluster.membersCount int
The number of members in a cluster of scrapers. Each member must have an unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets
-promscrape.cluster.replicationFactor int
The number of members in the cluster, which scrape the same targets. If the replication factor is greater than 2, then the deduplication must be enabled at remote storage side. See https://docs.victoriametrics.com/#deduplication (default 1)
-promscrape.config string
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. See https://victoriametrics.github.io/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
Optional path to Prometheus config file with 'scrape_configs' section containing targets to scrape. See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details
-promscrape.config.dryRun
Checks -promscrape.config file for errors and unsupported fields and then exits. Returns non-zero exit code on parsing errors and emits these errors to stderr. See also -promscrape.config.strictParse command-line flag. Pass -loggerLevel=ERROR if you don't need to see info messages in the output.
-promscrape.config.strictParse
Whether to allow only supported fields in -promscrape.config . By default unsupported fields are silently skipped
-promscrape.configCheckInterval duration
Interval for checking for changes in '-promscrape.config' file. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
-promscrape.consulSDCheckInterval consul_sd_configs
-promscrape.consul.waitTime duration
Wait time used by Consul service discovery. Default value is used if not set
-promscrape.consulSDCheckInterval duration
Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s)
-promscrape.digitaloceanSDCheckInterval duration
Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s)
-promscrape.disableCompression
Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
-promscrape.disableKeepAlive disable_keepalive: true
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set disable_keepalive: true individually per each 'scrape_config` section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
-promscrape.disableKeepAlive
Whether to disable HTTP keep-alive connections when scraping all the targets. This may be useful when targets has no support for HTTP keep-alive connection. It is possible to set 'disable_keepalive: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control. Note that disabling HTTP keep-alive may increase load on both vmagent and scrape targets
-promscrape.discovery.concurrency int
The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.) (default 100)
-promscrape.discovery.concurrentWaitTime duration
The maximum duration for waiting to perform API requests if more than -promscrape.discovery.concurrency requests are simultaneously performed (default 1m0s)
-promscrape.dnsSDCheckInterval dns_sd_configs
-promscrape.dnsSDCheckInterval duration
Interval for checking for changes in dns. This works only if dns_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config for details (default 30s)
-promscrape.dockerswarmSDCheckInterval dockerswarm_sd_configs
-promscrape.dockerswarmSDCheckInterval duration
Interval for checking for changes in dockerswarm. This works only if dockerswarm_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config for details (default 30s)
-promscrape.dropOriginalLabels
Whether to drop original labels for scrape targets at /targets and /api/v1/targets pages. This may be needed for reducing memory usage when original labels for big number of scrape targets occupy big amounts of memory. Note that this reduces debuggability for improper per-target relabeling configs
-promscrape.ec2SDCheckInterval ec2_sd_configs
-promscrape.ec2SDCheckInterval duration
Interval for checking for changes in ec2. This works only if ec2_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config for details (default 1m0s)
-promscrape.eurekaSDCheckInterval eureka_sd_configs
-promscrape.eurekaSDCheckInterval duration
Interval for checking for changes in eureka. This works only if eureka_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config for details (default 30s)
-promscrape.fileSDCheckInterval duration
Interval for checking for changes in 'file_sd_config'. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config for details (default 30s)
-promscrape.gceSDCheckInterval gce_sd_configs
-promscrape.gceSDCheckInterval duration
Interval for checking for changes in gce. This works only if gce_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config for details (default 1m0s)
-promscrape.httpSDCheckInterval duration
Interval for checking for changes in http service discovery. This works only if http_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config for details (default 1m0s)
-promscrape.kubernetes.apiServerTimeout duration
How frequently to reload the full state from Kuberntes API server (default 10m0s)
-promscrape.kubernetesSDCheckInterval kubernetes_sd_configs
How frequently to reload the full state from Kuberntes API server (default 30m0s)
-promscrape.kubernetesSDCheckInterval duration
Interval for checking for changes in Kubernetes API server. This works only if kubernetes_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config for details (default 30s)
-promscrape.maxDroppedTargets droppedTargets
The maximum number of droppedTargets shown at /api/v1/targets page. Increase this value if your setup drops more scrape targets during relabeling and you need investigating labels for all the dropped targets. Note that the increased number of tracked dropped targets may result in increased memory usage (default 1000)
-promscrape.maxScrapeSize value
-promscrape.maxDroppedTargets int
The maximum number of droppedTargets to show at /api/v1/targets page. Increase this value if your setup drops more scrape targets during relabeling and you need investigating labels for all the dropped targets. Note that the increased number of tracked dropped targets may result in increased memory usage (default 1000)
-promscrape.maxScrapeSize size
The maximum size of scrape response in bytes to process from Prometheus targets. Bigger responses are rejected
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 16777216)
-promscrape.openstackSDCheckInterval openstack_sd_configs
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16777216)
-promscrape.openstackSDCheckInterval duration
Interval for checking for changes in openstack API server. This works only if openstack_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config for details (default 30s)
-promscrape.streamParse stream_parse: true
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is posible to set stream_parse: true individually per each `scrape_config` section in `-promscrape.config` for fine grained control
-promscrape.suppressDuplicateScrapeTargetErrors duplicate scrape target
Whether to suppress duplicate scrape target errors; see https://victoriametrics.github.io/vmagent.html#troubleshooting for details
-promscrape.streamParse
Whether to enable stream parsing for metrics obtained from scrape targets. This may be useful for reducing memory usage when millions of metrics are exposed per each scrape target. It is posible to set 'stream_parse: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
-promscrape.suppressDuplicateScrapeTargetErrors
Whether to suppress 'duplicate scrape target' errors; see https://docs.victoriametrics.com/vmagent.html#troubleshooting for details
-promscrape.suppressScrapeErrors
Whether to suppress scrape errors logging. The last error for each target is always available at '/targets' page even if scrape errors logging is suppressed
-remoteWrite.basicAuth.password array
Optional basic auth password to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.basicAuth.passwordFile array
Optional path to basic auth password to use for -remoteWrite.url. The file is re-read every second. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.basicAuth.username array
Optional basic auth username to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.bearerToken array
Optional bearer auth token to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.bearerTokenFile array
Optional path to bearer token file to use for -remoteWrite.url. The token is re-read from the file every second. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.flushInterval duration
Interval for flushing the data to remote storage. This option takes effect only when less than 10K data points per second are pushed to -remoteWrite.url (default 1s)
-remoteWrite.label array
Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.maxBlockSize value
Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. Pass multiple -remoteWrite.label flags in order to add multiple labels to metrics before sending them to remote storage
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.maxBlockSize size
The maximum size in bytes of unpacked request to send to remote storage. It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 8388608)
-remoteWrite.maxDiskUsagePerURL value
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 8388608)
-remoteWrite.maxDailySeries int
The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -remoteWrite.maxHourlySeries
-remoteWrite.maxDiskUsagePerURL size
The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500000000. Disk usage is unlimited if the value is set to 0
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-remoteWrite.maxHourlySeries int
The maximum number of unique series vmagent can send to remote storage systems during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -remoteWrite.maxDailySeries
-remoteWrite.oauth2.clientID array
Optional OAuth2 clientID to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.oauth2.clientSecret array
Optional OAuth2 clientSecret to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.oauth2.clientSecretFile array
Optional OAuth2 clientSecretFile to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.oauth2.scopes array
Optional OAuth2 scopes to use for -remoteWrite.url. Scopes must be delimited by ';'. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.oauth2.tokenUrl array
Optional OAuth2 tokenURL to use for -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.proxyURL array
Optional proxy URL for writing data to -remoteWrite.url. Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.queues int
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage (default 4)
The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage (default 2 * numberOfAvailableCPUs)
-remoteWrite.rateLimit array
Optional rate limit in bytes per second for data sent to -remoteWrite.url. By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.relabelConfig string
Optional path to file with relabel_config entries. These entries are applied to all the metrics before sending them to -remoteWrite.url. See https://victoriametrics.github.io/vmagent.html#relabeling for details
Optional path to file with relabel_config entries. These entries are applied to all the metrics before sending them to -remoteWrite.url. See https://docs.victoriametrics.com/vmagent.html#relabeling for details
-remoteWrite.relabelDebug
Whether to log metrics before and after relabeling with -remoteWrite.relabelConfig. If the -remoteWrite.relabelDebug is enabled, then the metrics aren't sent to remote storage. This is useful for debugging the relabeling configs
-remoteWrite.roundDigits array
Round metric values to this number of decimal digits after the point before writing them to remote storage. Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. This option may be used for improving data compression for the stored metrics
Supports array of values separated by comma or specified via multiple flags.
@@ -614,31 +746,36 @@ See the docs at https://victoriametrics.github.io/vmagent.html .
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.tlsCAFile array
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.tlsCertFile array
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.tlsInsecureSkipVerify array
Whether to skip tls verification when connecting to -remoteWrite.url
Supports array of values separated by comma or specified via multiple flags.
-remoteWrite.tlsKeyFile array
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.tlsServerName array
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.tmpDataPath string
Path to directory where temporary data for remote write component is stored (default "vmagent-remotewrite-data")
Path to directory where temporary data for remote write component is stored. See also -remoteWrite.maxDiskUsagePerURL (default "vmagent-remotewrite-data")
-remoteWrite.url array
Remote storage URL to write data to. It must support Prometheus remote_write API. It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems
Supports array of values separated by comma or specified via multiple flags.
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.urlRelabelConfig array
Optional path to relabel config for the corresponding -remoteWrite.url
Supports an array of values separated by comma or specified via multiple flags.
-remoteWrite.urlRelabelDebug array
Whether to log metrics before and after relabeling with -remoteWrite.urlRelabelConfig. If the -remoteWrite.urlRelabelDebug is enabled, then the metrics aren't sent to the corresponding -remoteWrite.url. This is useful for debugging the relabeling configs
Supports array of values separated by comma or specified via multiple flags.
-sortLabels
Whether to sort labels for incoming samples before writing them to all the configured remote storage systems. This may be needed for reducing memory usage at remote storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}Enabled sorting for labels can slow down ingestion performance a bit
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
-version

View File

@@ -96,7 +96,8 @@ func insertRows(db string, rows []parser.Row, extraLabels []prompbmarshal.Label)
if !*skipMeasurement {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, r.Measurement...)
}
skipFieldKey := len(r.Fields) == 1 && *skipSingleField
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1139
skipFieldKey := len(r.Measurement) > 0 && len(r.Fields) == 1 && *skipSingleField
if len(ctx.metricGroupBuf) > 0 && !skipFieldKey {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, *measurementFieldSeparator...)
}

View File

@@ -23,6 +23,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/influxutils"
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
@@ -40,7 +41,7 @@ var (
"Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. "+
"Note that /targets and /metrics pages aren't available if -httpListenAddr=''")
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. "+
"This flag isn't needed when ingesting data over HTTP - just send it to `http://<vmagent>:8429/write`")
"This flag isn't needed when ingesting data over HTTP - just send it to http://<vmagent>:8429/write")
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpentTSDB metrics. "+
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
@@ -144,7 +145,18 @@ func main() {
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
if r.URL.Path == "/" {
fmt.Fprintf(w, "vmagent - see docs at https://victoriametrics.github.io/vmagent.html")
if r.Method != "GET" {
return false
}
fmt.Fprintf(w, "<h2>vmagent</h2>")
fmt.Fprintf(w, "See docs at <a href='https://docs.victoriametrics.com/vmagent.html'>https://docs.victoriametrics.com/vmagent.html</a></br>")
fmt.Fprintf(w, "Useful endpoints:</br>")
httpserver.WriteAPIHelp(w, [][2]string{
{"/targets", "discovered targets list"},
{"/api/v1/targets", "advanced information about discovered targets in JSON format"},
{"/metrics", "available service metrics"},
{"/-/reload", "reload configuration"},
})
return true
}
path := strings.Replace(r.URL.Path, "//", "/", -1)
@@ -153,7 +165,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
prometheusWriteRequests.Inc()
if err := promremotewrite.InsertHandler(r); err != nil {
prometheusWriteErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -162,7 +174,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
vmimportRequests.Inc()
if err := vmimport.InsertHandler(r); err != nil {
vmimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -171,7 +183,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
csvimportRequests.Inc()
if err := csvimport.InsertHandler(r); err != nil {
csvimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -180,7 +192,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
prometheusimportRequests.Inc()
if err := prometheusimport.InsertHandler(r); err != nil {
prometheusimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -189,7 +201,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
nativeimportRequests.Inc()
if err := native.InsertHandler(r); err != nil {
nativeimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -198,16 +210,14 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
influxWriteRequests.Inc()
if err := influx.InsertHandlerForHTTP(r); err != nil {
influxWriteErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/query":
// Emulate fake response for influx query.
// This is required for TSBS benchmark.
influxQueryRequests.Inc()
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
influxutils.WriteDatabaseNames(w)
return true
case "/targets":
promscrapeTargetsRequests.Inc()
@@ -269,7 +279,7 @@ func usage() {
const s = `
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
See the docs at https://victoriametrics.github.io/vmagent.html .
See the docs at https://docs.victoriametrics.com/vmagent.html .
`
flagutil.Usage(s)
}

View File

@@ -2,8 +2,6 @@ package remotewrite
import (
"bytes"
"crypto/tls"
"encoding/base64"
"fmt"
"io/ioutil"
"net/http"
@@ -42,17 +40,35 @@ var (
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
basicAuthPassword = flagutil.NewArray("remoteWrite.basicAuth.password", "Optional basic auth password to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
basicAuthPasswordFile = flagutil.NewArray("remoteWrite.basicAuth.passwordFile", "Optional path to basic auth password to use for -remoteWrite.url. "+
"The file is re-read every second. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
bearerToken = flagutil.NewArray("remoteWrite.bearerToken", "Optional bearer auth token to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
bearerTokenFile = flagutil.NewArray("remoteWrite.bearerTokenFile", "Optional path to bearer token file to use for -remoteWrite.url. "+
"The token is re-read from the file every second. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
oauth2ClientID = flagutil.NewArray("remoteWrite.oauth2.clientID", "Optional OAuth2 clientID to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
oauth2ClientSecret = flagutil.NewArray("remoteWrite.oauth2.clientSecret", "Optional OAuth2 clientSecret to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
oauth2ClientSecretFile = flagutil.NewArray("remoteWrite.oauth2.clientSecretFile", "Optional OAuth2 clientSecretFile to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
oauth2TokenURL = flagutil.NewArray("remoteWrite.oauth2.tokenUrl", "Optional OAuth2 tokenURL to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
oauth2Scopes = flagutil.NewArray("remoteWrite.oauth2.scopes", "Optional OAuth2 scopes to use for -remoteWrite.url. Scopes must be delimited by ';'. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
)
type client struct {
sanitizedURL string
remoteWriteURL string
authHeader string
fq *persistentqueue.FastQueue
hc *http.Client
authCfg *promauth.Config
rl rateLimiter
bytesSent *metrics.Counter
@@ -68,10 +84,11 @@ type client struct {
}
func newClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqueue.FastQueue, concurrency int) *client {
tlsCfg, err := getTLSConfig(argIdx)
authCfg, err := getAuthConfig(argIdx)
if err != nil {
logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
logger.Panicf("FATAL: cannot initialize auth config: %s", err)
}
tlsCfg := authCfg.NewTLSConfig()
tr := &http.Transport{
Dial: statDial,
TLSClientConfig: tlsCfg,
@@ -92,26 +109,10 @@ func newClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqu
}
tr.Proxy = http.ProxyURL(urlProxy)
}
authHeader := ""
username := basicAuthUsername.GetOptionalArg(argIdx)
password := basicAuthPassword.GetOptionalArg(argIdx)
if len(username) > 0 || len(password) > 0 {
// See https://en.wikipedia.org/wiki/Basic_access_authentication
token := username + ":" + password
token64 := base64.StdEncoding.EncodeToString([]byte(token))
authHeader = "Basic " + token64
}
token := bearerToken.GetOptionalArg(argIdx)
if len(token) > 0 {
if authHeader != "" {
logger.Fatalf("`-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", token)
}
authHeader = "Bearer " + token
}
c := &client{
sanitizedURL: sanitizedURL,
remoteWriteURL: remoteWriteURL,
authHeader: authHeader,
authCfg: authCfg,
fq: fq,
hc: &http.Client{
Transport: tr,
@@ -149,23 +150,48 @@ func (c *client) MustStop() {
logger.Infof("stopped client for -remoteWrite.url=%q", c.sanitizedURL)
}
func getTLSConfig(argIdx int) (*tls.Config, error) {
c := &promauth.TLSConfig{
func getAuthConfig(argIdx int) (*promauth.Config, error) {
username := basicAuthUsername.GetOptionalArg(argIdx)
password := basicAuthPassword.GetOptionalArg(argIdx)
passwordFile := basicAuthPasswordFile.GetOptionalArg(argIdx)
var basicAuthCfg *promauth.BasicAuthConfig
if username != "" || password != "" || passwordFile != "" {
basicAuthCfg = &promauth.BasicAuthConfig{
Username: username,
Password: password,
PasswordFile: passwordFile,
}
}
token := bearerToken.GetOptionalArg(argIdx)
tokenFile := bearerTokenFile.GetOptionalArg(argIdx)
var oauth2Cfg *promauth.OAuth2Config
clientSecret := oauth2ClientSecret.GetOptionalArg(argIdx)
clientSecretFile := oauth2ClientSecretFile.GetOptionalArg(argIdx)
if clientSecretFile != "" || clientSecret != "" {
oauth2Cfg = &promauth.OAuth2Config{
ClientID: oauth2ClientID.GetOptionalArg(argIdx),
ClientSecret: clientSecret,
ClientSecretFile: clientSecretFile,
TokenURL: oauth2TokenURL.GetOptionalArg(argIdx),
Scopes: strings.Split(oauth2Scopes.GetOptionalArg(argIdx), ";"),
}
}
tlsCfg := &promauth.TLSConfig{
CAFile: tlsCAFile.GetOptionalArg(argIdx),
CertFile: tlsCertFile.GetOptionalArg(argIdx),
KeyFile: tlsKeyFile.GetOptionalArg(argIdx),
ServerName: tlsServerName.GetOptionalArg(argIdx),
InsecureSkipVerify: tlsInsecureSkipVerify.GetOptionalArg(argIdx),
}
if c.CAFile == "" && c.CertFile == "" && c.KeyFile == "" && c.ServerName == "" && !c.InsecureSkipVerify {
return nil, nil
}
cfg, err := promauth.NewConfig(".", nil, "", "", c)
authCfg, err := promauth.NewConfig(".", nil, basicAuthCfg, token, tokenFile, oauth2Cfg, tlsCfg)
if err != nil {
return nil, fmt.Errorf("cannot populate TLS config: %w", err)
return nil, fmt.Errorf("cannot populate OAuth2 config for remoteWrite idx: %d, err: %w", argIdx, err)
}
tlsCfg := cfg.NewTLSConfig()
return tlsCfg, nil
return authCfg, nil
}
func (c *client) runWorker() {
@@ -226,8 +252,8 @@ again:
h.Set("Content-Type", "application/x-protobuf")
h.Set("Content-Encoding", "snappy")
h.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authHeader != "" {
req.Header.Set("Authorization", c.authHeader)
if ah := c.authCfg.GetAuthHeader(); ah != "" {
req.Header.Set("Authorization", ah)
}
startTime := time.Now()
@@ -239,7 +265,7 @@ again:
if retryDuration > time.Minute {
retryDuration = time.Minute
}
logger.Errorf("couldn't send a block with size %d bytes to %q: %s; re-sending the block in %.3f seconds",
logger.Warnf("couldn't send a block with size %d bytes to %q: %s; re-sending the block in %.3f seconds",
len(block), c.sanitizedURL, err, retryDuration.Seconds())
t := timerpool.Get(retryDuration)
select {
@@ -259,13 +285,11 @@ again:
return true
}
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.sanitizedURL, statusCode)).Inc()
if statusCode == 409 {
// Just drop block on 409 status code like Prometheus does.
if statusCode == 409 || statusCode == 400 {
// Just drop block on 409 and 400 status codes like Prometheus does.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/873
body, _ := ioutil.ReadAll(resp.Body)
// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149
_ = resp.Body.Close()
logger.Errorf("unexpected status code received when sending a block with size %d bytes to %q: #%d; dropping the block like Prometheus does; "+
"response body=%q", len(block), c.sanitizedURL, statusCode, body)
c.packetsDropped.Inc()
return true
}

View File

@@ -27,6 +27,9 @@ var (
// the maximum number of rows to send per each block.
const maxRowsPerBlock = 10000
// the maximum number of labels to send per each block.
const maxLabelsPerBlock = 10 * maxRowsPerBlock
type pendingSeries struct {
mu sync.Mutex
wr writeRequest
@@ -153,7 +156,7 @@ func (wr *writeRequest) push(src []prompbmarshal.TimeSeries) {
for i := range src {
tssDst = append(tssDst, prompbmarshal.TimeSeries{})
wr.copyTimeSeries(&tssDst[len(tssDst)-1], &src[i])
if len(wr.samples) >= maxRowsPerBlock {
if len(wr.samples) >= maxRowsPerBlock || len(wr.labels) >= maxLabelsPerBlock {
wr.tss = tssDst
wr.flush()
tssDst = wr.tss

View File

@@ -14,10 +14,15 @@ import (
var (
unparsedLabelsGlobal = flagutil.NewArray("remoteWrite.label", "Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. "+
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
"Pass multiple -remoteWrite.label flags in order to add multiple labels to metrics before sending them to remote storage")
relabelConfigPathGlobal = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
"before sending them to -remoteWrite.url. See https://victoriametrics.github.io/vmagent.html#relabeling for details")
"before sending them to -remoteWrite.url. See https://docs.victoriametrics.com/vmagent.html#relabeling for details")
relabelDebugGlobal = flag.Bool("remoteWrite.relabelDebug", false, "Whether to log metrics before and after relabeling with -remoteWrite.relabelConfig. "+
"If the -remoteWrite.relabelDebug is enabled, then the metrics aren't sent to remote storage. This is useful for debugging the relabeling configs")
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
relabelDebug = flagutil.NewArrayBool("remoteWrite.urlRelabelDebug", "Whether to log metrics before and after relabeling with -remoteWrite.urlRelabelConfig. "+
"If the -remoteWrite.urlRelabelDebug is enabled, then the metrics aren't sent to the corresponding -remoteWrite.url. "+
"This is useful for debugging the relabeling configs")
)
var labelsGlobal []prompbmarshal.Label
@@ -31,7 +36,7 @@ func CheckRelabelConfigs() error {
func loadRelabelConfigs() (*relabelConfigs, error) {
var rcs relabelConfigs
if *relabelConfigPathGlobal != "" {
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal, *relabelDebugGlobal)
if err != nil {
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", *relabelConfigPathGlobal, err)
}
@@ -47,7 +52,7 @@ func loadRelabelConfigs() (*relabelConfigs, error) {
// Skip empty relabel config.
continue
}
prc, err := promrelabel.LoadRelabelConfigs(path)
prc, err := promrelabel.LoadRelabelConfigs(path, relabelDebug.GetOptionalArg(i))
if err != nil {
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
}

View File

@@ -3,9 +3,13 @@ package remotewrite
import (
"flag"
"fmt"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@@ -13,6 +17,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
"github.com/VictoriaMetrics/metrics"
xxhash "github.com/cespare/xxhash/v2"
)
@@ -21,9 +26,10 @@ var (
remoteWriteURLs = flagutil.NewArray("remoteWrite.url", "Remote storage URL to write data to. It must support Prometheus remote_write API. "+
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
queues = flag.Int("remoteWrite.queues", 4, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+
"isn't enough for sending high volume of collected data to remote storage")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored. "+
"See also -remoteWrite.maxDiskUsagePerURL")
queues = flag.Int("remoteWrite.queues", cgroup.AvailableCPUs()*2, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+
"isn't enough for sending high volume of collected data to remote storage. Default value if 2 * numberOfAvailableCPUs")
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
"It is hidden by default, since it can contain sensitive info such as auth key")
maxPendingBytesPerURL = flagutil.NewBytes("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
@@ -37,6 +43,14 @@ var (
"Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. "+
"By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. "+
"This option may be used for improving data compression for the stored metrics")
sortLabels = flag.Bool("sortLabels", false, `Whether to sort labels for incoming samples before writing them to all the configured remote storage systems. `+
`This may be needed for reducing memory usage at remote storage when the order of labels in incoming samples is random. `+
`For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}`+
`Enabled sorting for labels can slow down ingestion performance a bit`)
maxHourlySeries = flag.Int("remoteWrite.maxHourlySeries", 0, "The maximum number of unique series vmagent can send to remote storage systems during the last hour. "+
"Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -remoteWrite.maxDailySeries")
maxDailySeries = flag.Int("remoteWrite.maxDailySeries", 0, "The maximum number of unique series vmagent can send to remote storage systems during the last 24 hours. "+
"Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -remoteWrite.maxHourlySeries")
)
var rwctxs []*remoteWriteCtx
@@ -46,7 +60,7 @@ var allRelabelConfigs atomic.Value
// maxQueues limits the maximum value for `-remoteWrite.queues`. There is no sense in setting too high value,
// since it may lead to high memory usage due to big number of buffers.
var maxQueues = cgroup.AvailableCPUs() * 4
var maxQueues = cgroup.AvailableCPUs() * 16
// InitSecretFlags must be called after flag.Parse and before any logging.
func InitSecretFlags() {
@@ -65,6 +79,24 @@ func Init() {
if len(*remoteWriteURLs) == 0 {
logger.Fatalf("at least one `-remoteWrite.url` command-line flag must be set")
}
if *maxHourlySeries > 0 {
hourlySeriesLimiter = bloomfilter.NewLimiter(*maxHourlySeries, time.Hour)
_ = metrics.NewGauge(`vmagent_hourly_series_limit_max_series`, func() float64 {
return float64(hourlySeriesLimiter.MaxItems())
})
_ = metrics.NewGauge(`vmagent_hourly_series_limit_current_series`, func() float64 {
return float64(hourlySeriesLimiter.CurrentItems())
})
}
if *maxDailySeries > 0 {
dailySeriesLimiter = bloomfilter.NewLimiter(*maxDailySeries, 24*time.Hour)
_ = metrics.NewGauge(`vmagent_daily_series_limit_max_series`, func() float64 {
return float64(dailySeriesLimiter.MaxItems())
})
_ = metrics.NewGauge(`vmagent_daily_series_limit_current_series`, func() float64 {
return float64(dailySeriesLimiter.CurrentItems())
})
}
if *queues > maxQueues {
*queues = maxQueues
}
@@ -72,6 +104,12 @@ func Init() {
*queues = 1
}
initLabelsGlobal()
// Register SIGHUP handler for config reload before loadRelabelConfigs.
// This guarantees that the config will be re-read if the signal arrives just after loadRelabelConfig.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
sighupCh := procutil.NewSighupChan()
rcs, err := loadRelabelConfigs()
if err != nil {
logger.Fatalf("cannot load relabel configs: %s", err)
@@ -79,11 +117,11 @@ func Init() {
allRelabelConfigs.Store(rcs)
maxInmemoryBlocks := memory.Allowed() / len(*remoteWriteURLs) / maxRowsPerBlock / 100
if maxInmemoryBlocks > 200 {
if maxInmemoryBlocks > 400 {
// There is no much sense in keeping higher number of blocks in memory,
// since this means that the producer outperforms consumer and the queue
// will continue growing. It is better storing the queue to file.
maxInmemoryBlocks = 200
maxInmemoryBlocks = 400
}
if maxInmemoryBlocks < 2 {
maxInmemoryBlocks = 2
@@ -98,7 +136,6 @@ func Init() {
}
// Start config reloader.
sighupCh := procutil.NewSighupChan()
configReloaderWG.Add(1)
go func() {
defer configReloaderWG.Done()
@@ -150,11 +187,13 @@ func Push(wr *prompbmarshal.WriteRequest) {
for len(tss) > 0 {
// Process big tss in smaller blocks in order to reduce the maximum memory usage
samplesCount := 0
labelsCount := 0
i := 0
for i < len(tss) {
samplesCount += len(tss[i].Samples)
labelsCount += len(tss[i].Labels)
i++
if samplesCount > maxRowsPerBlock {
if samplesCount >= maxRowsPerBlock || labelsCount >= maxLabelsPerBlock {
break
}
}
@@ -170,8 +209,12 @@ func Push(wr *prompbmarshal.WriteRequest) {
tssBlock = rctx.applyRelabeling(tssBlock, labelsGlobal, pcsGlobal)
globalRelabelMetricsDropped.Add(tssBlockLen - len(tssBlock))
}
for _, rwctx := range rwctxs {
rwctx.Push(tssBlock)
sortLabelsIfNeeded(tssBlock)
tssBlock = limitSeriesCardinality(tssBlock)
if len(tssBlock) > 0 {
for _, rwctx := range rwctxs {
rwctx.Push(tssBlock)
}
}
if rctx != nil {
rctx.reset()
@@ -182,6 +225,87 @@ func Push(wr *prompbmarshal.WriteRequest) {
}
}
// sortLabelsIfNeeded sorts labels if -sortLabels command-line flag is set.
func sortLabelsIfNeeded(tss []prompbmarshal.TimeSeries) {
if !*sortLabels {
return
}
for i := range tss {
promrelabel.SortLabels(tss[i].Labels)
}
}
func limitSeriesCardinality(tss []prompbmarshal.TimeSeries) []prompbmarshal.TimeSeries {
if hourlySeriesLimiter == nil && dailySeriesLimiter == nil {
return tss
}
dst := make([]prompbmarshal.TimeSeries, 0, len(tss))
for i := range tss {
labels := tss[i].Labels
h := getLabelsHash(labels)
if hourlySeriesLimiter != nil && !hourlySeriesLimiter.Add(h) {
hourlySeriesLimitRowsDropped.Add(len(tss[i].Samples))
logSkippedSeries(labels, "-remoteWrite.maxHourlySeries", hourlySeriesLimiter.MaxItems())
continue
}
if dailySeriesLimiter != nil && !dailySeriesLimiter.Add(h) {
dailySeriesLimitRowsDropped.Add(len(tss[i].Samples))
logSkippedSeries(labels, "-remoteWrite.maxDailySeries", dailySeriesLimiter.MaxItems())
continue
}
dst = append(dst, tss[i])
}
return dst
}
var (
hourlySeriesLimiter *bloomfilter.Limiter
dailySeriesLimiter *bloomfilter.Limiter
hourlySeriesLimitRowsDropped = metrics.NewCounter(`vmagent_hourly_series_limit_rows_dropped_total`)
dailySeriesLimitRowsDropped = metrics.NewCounter(`vmagent_daily_series_limit_rows_dropped_total`)
)
func getLabelsHash(labels []prompbmarshal.Label) uint64 {
bb := labelsHashBufPool.Get()
b := bb.B[:0]
for _, label := range labels {
b = append(b, label.Name...)
b = append(b, label.Value...)
}
h := xxhash.Sum64(b)
bb.B = b
labelsHashBufPool.Put(bb)
return h
}
var labelsHashBufPool bytesutil.ByteBufferPool
func logSkippedSeries(labels []prompbmarshal.Label, flagName string, flagValue int) {
select {
case <-logSkippedSeriesTicker.C:
logger.Warnf("skip series %s because %s=%d reached", labelsToString(labels), flagName, flagValue)
default:
}
}
var logSkippedSeriesTicker = time.NewTicker(5 * time.Second)
func labelsToString(labels []prompbmarshal.Label) string {
var b []byte
b = append(b, '{')
for i, label := range labels {
b = append(b, label.Name...)
b = append(b, '=')
b = strconv.AppendQuote(b, label.Value)
if i+1 < len(labels) {
b = append(b, ',')
}
}
b = append(b, '}')
return string(b)
}
var globalRelabelMetricsDropped = metrics.NewCounter("vmagent_remotewrite_global_relabel_metrics_dropped_total")
type remoteWriteCtx struct {
@@ -207,7 +331,13 @@ func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int,
c := newClient(argIdx, remoteWriteURL, sanitizedURL, fq, *queues)
sf := significantFigures.GetOptionalArgOrDefault(argIdx, 0)
rd := roundDigits.GetOptionalArgOrDefault(argIdx, 100)
pss := make([]*pendingSeries, *queues)
pssLen := *queues
if n := cgroup.AvailableCPUs(); pssLen > n {
// There is no sense in running more than availableCPUs concurrent pendingSeries,
// since every pendingSeries can saturate up to a single CPU.
pssLen = n
}
pss := make([]*pendingSeries, pssLen)
for i := range pss {
pss[i] = newPendingSeries(fq.MustWriteBlock, sf, rd)
}

View File

@@ -1,9 +1,7 @@
package remotewrite
import (
"fmt"
"net"
"strings"
"sync/atomic"
"time"
@@ -11,13 +9,8 @@ import (
"github.com/VictoriaMetrics/metrics"
)
func statDial(network, addr string) (conn net.Conn, err error) {
if !strings.HasPrefix(network, "tcp") {
return nil, fmt.Errorf("unexpected network passed to statDial: %q; it must start from `tcp`", network)
}
if !netutil.TCP6Enabled() {
network = "tcp4"
}
func statDial(networkUnused, addr string) (conn net.Conn, err error) {
network := netutil.GetTCPNetwork()
conn, err = net.DialTimeout(network, addr, 5*time.Second)
dialsTotal.Inc()
if err != nil {

View File

@@ -66,7 +66,17 @@ run-vmalert: vmalert
-remoteRead.url=http://localhost:8428 \
-external.label=cluster=east-1 \
-external.label=replica=a \
-evaluationInterval=3s
-evaluationInterval=3s \
-rule.configCheckInterval=10s
replay-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/config/testdata/rules-replay-good.rules \
-datasource.url=http://localhost:8428 \
-remoteWrite.url=http://localhost:8428 \
-external.label=cluster=east-1 \
-external.label=replica=a \
-replay.timeFrom=2021-05-11T07:21:43Z \
-replay.timeTo=2021-05-29T18:40:43Z
vmalert-amd64:
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmalert-local-with-goarch
@@ -88,3 +98,9 @@ vmalert-local-with-goarch:
vmalert-pure:
APP_NAME=vmalert $(MAKE) app-local-pure
vmalert-windows-amd64:
GOARCH=amd64 APP_NAME=vmalert $(MAKE) app-local-windows-with-goarch
vmalert-windows-amd64-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-windows-amd64

View File

@@ -1,22 +1,24 @@
## vmalert
# vmalert
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
`vmalert` executes a list of the given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
rules against configured address.
rules against configured address. It is heavily inspired by [Prometheus](https://prometheus.io/docs/alerting/latest/overview/)
implementation and aims to be compatible with its syntax.
### Features:
## Features
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
* VictoriaMetrics [MetricsQL](https://victoriametrics.github.io/MetricsQL.html)
* VictoriaMetrics [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html)
support and expressions validation;
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
support;
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
* Keeps the alerts [state on restarts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/app/vmalert#alerts-state-on-restarts);
* Graphite datasource can be used for alerting and recording rules. See [these docs](#graphite) for details.
* Keeps the alerts [state on restarts](#alerts-state-on-restarts);
* Graphite datasource can be used for alerting and recording rules. See [these docs](#graphite);
* Recording and Alerting rules backfilling (aka `replay`). See [these docs](#rules-backfilling);
* Lightweight without extra dependencies.
### Limitations:
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
## Limitations
* `vmalert` execute queries against remote datasource which has reliability risks because of network.
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
may fail;
* by default, rules execution is sequential within one group, but persisting of execution results to remote
@@ -24,7 +26,7 @@ storage is asynchronous. Hence, user shouldn't rely on recording rules chaining
recording rule is reused in next one;
* `vmalert` has no UI, just an API for getting groups and rules statuses.
### QuickStart
## QuickStart
To build `vmalert` from sources:
```
@@ -37,75 +39,87 @@ The build binary will be placed to `VictoriaMetrics/bin` folder.
To start using `vmalert` you will need the following things:
* list of rules - PromQL/MetricsQL expressions to execute;
* datasource address - reachable VictoriaMetrics instance for rules execution;
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
aggregating alerts and sending notifications.
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
compatible storage address for storing recording rules results and alerts state in for of timeseries. This is optional.
* remote write address [optional] - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
compatible storage address for storing recording rules results and alerts state in for of timeseries.
Then configure `vmalert` accordingly:
```
./bin/vmalert -rule=alert.rules \
./bin/vmalert -rule=alert.rules \ # Path to the file with rules configuration. Supports wildcard
-datasource.url=http://localhost:8428 \ # PromQL compatible datasource
-notifier.url=http://localhost:9093 \ # AlertManager URL
-notifier.url=http://127.0.0.1:9093 \ # AlertManager replica URL
-remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist rules
-remoteRead.url=http://localhost:8428 \ # PromQL compatible datasource to restore alerts state from
-remoteWrite.url=http://localhost:8428 \ # Remote write compatible storage to persist rules
-remoteRead.url=http://localhost:8428 \ # MetricsQL compatible datasource to restore alerts state from
-external.label=cluster=east-1 \ # External label to be applied for each rule
-external.label=replica=a \ # Multiple external labels may be set
-evaluationInterval=3s # Default evaluation interval if not specified in rules group
-external.label=replica=a # Multiple external labels may be set
```
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.
See the fill list of configuration flags in [configuration](#configuration) section.
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
similar to Prometheus rules and configured using YAML. Configuration examples may be found
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
similar to Prometheus rules and configured using YAML. Configuration examples may be found
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
Every `rule` belongs to a `group` and every configuration file may contain arbitrary number of groups:
```yaml
groups:
[ - <rule_group> ]
```
#### Groups
### Groups
Each group has following attributes:
Each group has the following attributes:
```yaml
# The name of the group. Must be unique within a file.
name: <string>
# How often rules in the group are evaluated.
[ interval: <duration> | default = global.evaluation_interval ]
[ interval: <duration> | default = -evaluationInterval flag ]
# How many rules execute at once. Increasing concurrency may speed
# up round execution speed.
# How many rules execute at once within a group. Increasing concurrency may speed
# up round execution speed.
[ concurrency: <integer> | default = 1 ]
# Optional type for expressions inside the rules. Supported values: "graphite" and "prometheus".
# By default "prometheus" rule type is used.
[ type: <string> ]
# Optional list of label filters applied to every rule's
# request withing a group. Is compatible only with VM datasource.
# See more details at https://docs.victoriametrics.com#prometheus-querying-api-enhancements
extra_filter_labels:
[ <labelname>: <labelvalue> ... ]
rules:
[ - <rule> ... ]
```
#### Rules
### Rules
Every rule contains `expr` field for [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/)
or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expression. Vmalert will execute the configured
expression and then act according to the Rule type.
There are two types of Rules:
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
Alerting rules allows to define alert conditions via [MetricsQL](https://victoriametrics.github.io/MetricsQL.html)
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
Recording rules allow you to precompute frequently needed or computationally expensive expressions
and save their result as a new set of time series.
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
Alerting rules allows to define alert conditions via `expr` field and to send notifications
[Alertmanager](https://github.com/prometheus/alertmanager) if execution result is not empty.
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
Recording rules allows to define `expr` which result will be than backfilled to configured
`-remoteWrite.url`. Recording rules are used to precompute frequently needed or computationally
expensive expressions and save their result as a new set of time series.
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
within one group.
within one group.
##### Alerting rules
#### Alerting rules
The syntax for alerting rule is following:
The syntax for alerting rule is the following:
```yaml
# The name of the alert. Must be a valid metric name.
alert: <string>
@@ -115,12 +129,14 @@ alert: <string>
[ type: <string> ]
# The expression to evaluate. The expression language depends on the type value.
# By default MetricsQL expression is used. If type="graphite", then the expression
# By default PromQL/MetricsQL expression is used. If type="graphite", then the expression
# must contain valid Graphite expression.
expr: <string>
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
# If param is omitted or set to 0 then alerts will be immediately considered
# as firing once they return.
[ for: <duration> | default = 0s ]
# Labels to add or overwrite for each alert.
@@ -130,9 +146,14 @@ labels:
# Annotations to add to each alert.
annotations:
[ <labelname>: <tmpl_string> ]
```
```
##### Recording rules
It is allowed to use [Go templating](https://golang.org/pkg/text/template/) in annotations
to format data, iterate over it or execute expressions.
Additionally, `vmalert` provides some extra templating functions
listed [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/notifier/template_func.go).
#### Recording rules
The syntax for recording rules is following:
```yaml
@@ -153,37 +174,77 @@ labels:
[ <labelname>: <labelvalue> ]
```
For recording rules to work `-remoteWrite.url` must specified.
For recording rules to work `-remoteWrite.url` must be specified.
#### Alerts state on restarts
### Alerts state on restarts
`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after reloading of `vmalert`
`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after restart of `vmalert`
the process alerts state will be lost. To avoid this situation, `vmalert` should be configured via the following flags:
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or VMInsert (Cluster). `vmalert` will persist alerts state
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
These are regular time series and may be queried from VM just as any other time series.
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or vminsert (Cluster). `vmalert` will persist alerts state
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
These are regular time series and may be queried from VM just as any other time series.
The state stored to the configured address on every rule evaluation.
* `-remoteRead.url` - URL to VictoriaMetrics (Single) or VMSelect (Cluster). `vmalert` will try to restore alerts state
* `-remoteRead.url` - URL to VictoriaMetrics (Single) or vmselect (Cluster). `vmalert` will try to restore alerts state
from configured address by querying time series with name `ALERTS_FOR_STATE`.
Both flags are required for the proper state restoring. Restore process may fail if time series are missing
in configured `-remoteRead.url`, weren't updated in the last `1h` or received state doesn't match current `vmalert`
rules configuration.
in configured `-remoteRead.url`, weren't updated in the last `1h` (controlled by `-remoteRead.lookback`)
or received state doesn't match current `vmalert` rules configuration.
#### WEB
### Multitenancy
There are the following approaches for alerting and recording rules across
[multiple tenants](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy):
* To run a separate `vmalert` instance per each tenant.
The corresponding tenant must be specified in `-datasource.url` command-line flag
according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format).
For example, `/path/to/vmalert -datasource.url=http://vmselect:8481/select/123/prometheus`
would run alerts against `AccountID=123`. For recording rules the `-remoteWrite.url` command-line
flag must contain the url for the specific tenant as well.
For example, `-remoteWrite.url=http://vminsert:8480/insert/123/prometheus` would write recording
rules to `AccountID=123`.
* To specify `tenant` parameter per each alerting and recording group if
[enterprise version of vmalert](https://victoriametrics.com/enterprise.html) is used
with `-clusterMode` command-line flag. For example:
```yaml
groups:
- name: rules_for_tenant_123
tenant: "123"
rules:
# Rules for accountID=123
- name: rules_for_tenant_456:789
tenant: "456:789"
rules:
# Rules for accountID=456, projectID=789
```
If `-clusterMode` is enabled, then `-datasource.url`, `-remoteRead.url` and `-remoteWrite.url` must
contain only the hostname without tenant id. For example: `-datasource.url=http://vmselect:8481`.
`vmselect` automatically adds the specified tenant to urls per each recording rule in this case.
The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files
at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise`
tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).
### WEB
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
* `http://<vmalert-addr>/api/v1/<groupID>/<alertID>/status" ` - get alert status by ID.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
### Graphite
## Graphite
vmalert sends requests to `<-datasource.url>/render?format=json` during evaluation of alerting and recording rules
if the corresponding group or rule contains `type: "graphite"` config option. It is expected that the `<-datasource.url>/render`
@@ -191,229 +252,335 @@ implements [Graphite Render API](https://graphite.readthedocs.io/en/stable/rende
When using vmalert with both `graphite` and `prometheus` rules configured against cluster version of VM do not forget
to set `-datasource.appendTypePrefix` flag to `true`, so vmalert can adjust URL prefix automatically based on query type.
## Rules backfilling
### Configuration
vmalert supports alerting and recording rules backfilling (aka `replay`). In replay mode vmalert
can read the same rules configuration as normally, evaluate them on the given time range and backfill
results via remote write to the configured storage. vmalert supports any PromQL/MetricsQL compatible
data source for backfilling.
### How it works
In `replay` mode vmalert works as a cli-tool and exits immediately after work is done.
To run vmalert in `replay` mode:
```
./bin/vmalert -rule=path/to/your.rules \ # path to files with rules you usually use with vmalert
-datasource.url=http://localhost:8428 \ # PromQL/MetricsQL compatible datasource
-remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist results
-replay.timeFrom=2021-05-11T07:21:43Z \ # time from begin replay
-replay.timeTo=2021-05-29T18:40:43Z # time to finish replay
```
The output of the command will look like the following:
```
Replay mode:
from: 2021-05-11 07:21:43 +0000 UTC # set by -replay.timeFrom
to: 2021-05-29 18:40:43 +0000 UTC # set by -replay.timeTo
max data points per request: 1000 # set by -replay.maxDatapointsPerQuery
Group "ReplayGroup"
interval: 1m0s
requests to make: 27
max range per request: 16h40m0s
> Rule "type:vm_cache_entries:rate5m" (ID: 1792509946081842725)
27 / 27 [----------------------------------------------------------------------------------------------------] 100.00% 78 p/s
> Rule "go_cgo_calls_count:rate5m" (ID: 17958425467471411582)
27 / 27 [-----------------------------------------------------------------------------------------------------] 100.00% ? p/s
Group "vmsingleReplay"
interval: 30s
requests to make: 54
max range per request: 8h20m0s
> Rule "RequestErrorsToAPI" (ID: 17645863024999990222)
54 / 54 [-----------------------------------------------------------------------------------------------------] 100.00% ? p/s
> Rule "TooManyLogs" (ID: 9042195394653477652)
54 / 54 [-----------------------------------------------------------------------------------------------------] 100.00% ? p/s
2021-06-07T09:59:12.098Z info app/vmalert/replay.go:68 replay finished! Imported 511734 samples
```
In `replay` mode all groups are executed sequentially one-by-one. Rules within the group are
executed sequentially as well (`concurrency` setting is ignored). Vmalert sends rule's expression
to [/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) endpoint
of the configured `-datasource.url`. Returned data then processed according to the rule type and
backfilled to `-remoteWrite.url` via [Remote Write protocol](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations).
Vmalert respects `evaluationInterval` value set by flag or per-group during the replay.
#### Recording rules
Result of recording rules `replay` should match with results of normal rules evaluation.
#### Alerting rules
Result of alerting rules `replay` is time series reflecting [alert's state](#alerts-state-on-restarts).
To see if `replayed` alert has fired in the past use the following PromQL/MetricsQL expression:
```
ALERTS{alertname="your_alertname", alertstate="firing"}
```
Execute the query against storage which was used for `-remoteWrite.url` during the `replay`.
### Additional configuration
There are following non-required `replay` flags:
* `-replay.maxDatapointsPerQuery` - the max number of data points expected to receive in one request.
In two words, it affects the max time range for every `/query_range` request. The higher the value,
the less requests will be issued during `replay`.
* `-replay.ruleRetryAttempts` - when datasource fails to respond vmalert will make this number of retries
per rule before giving up.
* `-replay.rulesDelay` - delay between sequential rules execution. Important in cases if there are chaining
(rules which depend on each other) rules. It is expected, that remote storage will be able to persist
previously accepted data during the delay, so data will be available for the subsequent queries.
Keep it equal or bigger than `-remoteWrite.flushInterval`.
See full description for these flags in `./vmalert --help`.
### Limitations
* Graphite engine isn't supported yet;
* `query` template function is disabled for performance reasons (might be changed in future);
## Configuration
Pass `-help` to `vmalert` in order to see the full list of supported
command-line flags with their descriptions.
The shortlist of configuration flags is the following:
```
-datasource.appendTypePrefix
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to VMSelect URL.
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.
-datasource.basicAuth.password string
Optional basic auth password for -datasource.url
Optional basic auth password for -datasource.url
-datasource.basicAuth.username string
Optional basic auth username for -datasource.url
Optional basic auth username for -datasource.url
-datasource.lookback duration
Lookback defines how far to look into past when evaluating queries. For example, if datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
-datasource.maxIdleConnections int
Defines the number of idle (keep-alive connections) to configured datasource.Consider to set this value equal to the value: groups_total * group.concurrency. Too low value may result into high number of sockets in TIME_WAIT state. (default 100)
Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
-datasource.queryStep duration
queryStep defines how far a value can fallback to when evaluating queries. For example, if datasource.queryStep=15s then param "step" with value "15s" will be added to every query.
queryStep defines how far a value can fallback to when evaluating queries. For example, if datasource.queryStep=15s then param "step" with value "15s" will be added to every query.If queryStep isn't specified, rule's evaluationInterval will be used instead.
-datasource.roundDigits int
Adds "round_digits" GET param to datasource requests. In VM "round_digits" limits the number of digits after the decimal point in response values.
-datasource.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
-datasource.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
-datasource.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -datasource.url
Whether to skip tls verification when connecting to -datasource.url
-datasource.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
-datasource.tlsServerName string
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used
Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used
-datasource.url string
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428
-dryRun -rule
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
Prefix for environment variables if -envflag.enable is set
-evaluationInterval duration
How often to evaluate the rules (default 1m0s)
How often to evaluate the rules (default 1m0s)
-external.alert.source string
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used
-external.label array
Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
Supports array of values separated by comma or specified via multiple flags.
Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
Supports an array of values separated by comma or specified via multiple flags.
-external.url string
External URL is used as alert's source for sent alerts to the notifier
External URL is used as alert's source for sent alerts to the notifier
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this dealy the servier returns non-OK responses from /health page, so load balancers can route new requests to other servers
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
Address to listen for http connections (default ":8880")
Address to listen for http connections (default ":8880")
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
Output for the logs. Supported values: stderr, stdout (default "stderr")
-loggerTimezone string
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-memory.allowedBytes size
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
Auth key for /metrics. It overrides httpAuth settings
-notifier.basicAuth.password array
Optional basic auth password for -notifier.url
Supports array of values separated by comma or specified via multiple flags.
Optional basic auth password for -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.basicAuth.username array
Optional basic auth username for -notifier.url
Supports array of values separated by comma or specified via multiple flags.
Optional basic auth username for -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.tlsCAFile array
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
Supports array of values separated by comma or specified via multiple flags.
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
Supports an array of values separated by comma or specified via multiple flags.
-notifier.tlsCertFile array
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.tlsInsecureSkipVerify array
Whether to skip tls verification when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
Whether to skip tls verification when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsKeyFile array
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
Supports an array of values separated by comma or specified via multiple flags.
-notifier.tlsServerName array
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
Supports array of values separated by comma or specified via multiple flags.
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
Supports an array of values separated by comma or specified via multiple flags.
-notifier.url array
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
Supports array of values separated by comma or specified via multiple flags.
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
Supports an array of values separated by comma or specified via multiple flags.
-pprofAuthKey string
Auth key for /debug/pprof. It overrides httpAuth settings
Auth key for /debug/pprof. It overrides httpAuth settings
-remoteRead.basicAuth.password string
Optional basic auth password for -remoteRead.url
Optional basic auth password for -remoteRead.url
-remoteRead.basicAuth.username string
Optional basic auth username for -remoteRead.url
Optional basic auth username for -remoteRead.url
-remoteRead.ignoreRestoreErrors
Whether to ignore errors from remote storage when restoring alerts state on startup. (default true)
-remoteRead.lookback duration
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteRead.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
-remoteRead.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
-remoteRead.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteRead.url
Whether to skip tls verification when connecting to -remoteRead.url
-remoteRead.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
-remoteRead.tlsServerName string
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
-remoteRead.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remoteWrite.basicAuth.password string
Optional basic auth password for -remoteWrite.url
Optional basic auth password for -remoteWrite.url
-remoteWrite.basicAuth.username string
Optional basic auth username for -remoteWrite.url
Optional basic auth username for -remoteWrite.url
-remoteWrite.concurrency int
Defines number of writers for concurrent writing into remote querier (default 1)
Defines number of writers for concurrent writing into remote querier (default 1)
-remoteWrite.flushInterval duration
Defines interval of flushes to remote write endpoint (default 5s)
Defines interval of flushes to remote write endpoint (default 5s)
-remoteWrite.maxBatchSize int
Defines defines max number of timeseries to be flushed at once (default 1000)
Defines defines max number of timeseries to be flushed at once (default 1000)
-remoteWrite.maxQueueSize int
Defines the max number of pending datapoints to remote write endpoint (default 100000)
Defines the max number of pending datapoints to remote write endpoint (default 100000)
-remoteWrite.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
-remoteWrite.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
-remoteWrite.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteWrite.url
Whether to skip tls verification when connecting to -remoteWrite.url
-remoteWrite.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
-remoteWrite.tlsServerName string
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
-remoteWrite.url string
Optional URL to Victoria Metrics or VMInsert where to persist alerts state and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428
Optional URL to VictoriaMetrics or vminsert where to persist alerts state and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428
-replay.maxDatapointsPerQuery int
Max number of data points expected in one request. The higher the value, the less requests will be made during replay. (default 1000)
-replay.ruleRetryAttempts int
Defines how many retries to make before giving up on rule if request for it returns an error. (default 5)
-replay.rulesDelay duration
Delay between rules evaluation within the group. Could be important if there are chained rules inside of the groupand processing need to wait for previous rule results to be persisted by remote storage before evaluating the next rule. Keep it equal or bigger than -remoteWrite.flushInterval. (default 1s)
-replay.timeFrom string
The time filter in RFC3339 format to select time series with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'
-replay.timeTo string
The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'
-rule array
Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule="/path/to/file". Path to a single file with alerting rules
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
Supports array of values separated by comma or specified via multiple flags.
Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule="/path/to/file". Path to a single file with alerting rules
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
Supports an array of values separated by comma or specified via multiple flags.
-rule.configCheckInterval duration
Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes
-rule.validateExpressions
Whether to validate rules expressions via MetricsQL engine (default true)
Whether to validate rules expressions via MetricsQL engine (default true)
-rule.validateTemplates
Whether to validate annotation and label templates (default true)
Whether to validate annotation and label templates (default true)
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
Path to file with TLS key. Used only if -tls is set
-version
Show VictoriaMetrics version
Show VictoriaMetrics version
```
Pass `-help` to `vmalert` in order to see the full list of supported
command-line flags with their descriptions.
`vmalert` supports "hot" config reload via the following methods:
* send SIGHUP signal to `vmalert` process;
* send GET request to `/-/reload` endpoint;
* configure `-rule.configCheckInterval` flag for periodic reload
on config change.
To reload configuration without `vmalert` restart send SIGHUP signal
or send GET request to `/-/reload` endpoint.
### Contributing
## Contributing
`vmalert` is mostly designed and built by VictoriaMetrics community.
Feel free to share your experience and ideas for improving this
Feel free to share your experience and ideas for improving this
software. Please keep simplicity as the main priority.
### How to build from sources
## How to build from sources
It is recommended using
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
It is recommended using
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
- `vmalert` is located in `vmutils-*` archives there.
#### Development build
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmalert` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make vmalert` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmalert` binary and puts it into the `bin` folder.
#### Production build
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmalert-prod` from the root folder of the repository.
2. Run `make vmalert-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmalert-prod` binary and puts it into the `bin` folder.
#### ARM build
### ARM build
ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://blog.cloudflare.com/arm-takes-wing/).
#### Development ARM build
### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmalert-arm` or `make vmalert-arm64` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make vmalert-arm` or `make vmalert-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmalert-arm` or `vmalert-arm64` binary respectively and puts it into the `bin` folder.
#### Production ARM build
### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmalert-arm-prod` or `make vmalert-arm64-prod` from the root folder of the repository.
2. Run `make vmalert-arm-prod` or `make vmalert-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmalert-arm-prod` or `vmalert-arm64-prod` binary respectively and puts it into the `bin` folder.

View File

@@ -19,15 +19,18 @@ import (
// AlertingRule is basic alert entity
type AlertingRule struct {
Type datasource.Type
RuleID uint64
Name string
Expr string
For time.Duration
Labels map[string]string
Annotations map[string]string
GroupID uint64
GroupName string
Type datasource.Type
RuleID uint64
Name string
Expr string
For time.Duration
Labels map[string]string
Annotations map[string]string
GroupID uint64
GroupName string
EvalInterval time.Duration
q datasource.Querier
// guard status fields
mu sync.RWMutex
@@ -49,19 +52,25 @@ type alertingRuleMetrics struct {
active *gauge
}
func newAlertingRule(group *Group, cfg config.Rule) *AlertingRule {
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
ar := &AlertingRule{
Type: cfg.Type,
RuleID: cfg.ID,
Name: cfg.Alert,
Expr: cfg.Expr,
For: cfg.For.Duration(),
Labels: cfg.Labels,
Annotations: cfg.Annotations,
GroupID: group.ID(),
GroupName: group.Name,
alerts: make(map[uint64]*notifier.Alert),
metrics: &alertingRuleMetrics{},
Type: cfg.Type,
RuleID: cfg.ID,
Name: cfg.Alert,
Expr: cfg.Expr,
For: cfg.For.Duration(),
Labels: cfg.Labels,
Annotations: cfg.Annotations,
GroupID: group.ID(),
GroupName: group.Name,
EvalInterval: group.Interval,
q: qb.BuildWithParams(datasource.QuerierParams{
DataSourceType: &cfg.Type,
EvaluationInterval: group.Interval,
ExtraLabels: group.ExtraFilterLabels,
}),
alerts: make(map[uint64]*notifier.Alert),
metrics: &alertingRuleMetrics{},
}
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
@@ -119,10 +128,67 @@ func (ar *AlertingRule) ID() uint64 {
return ar.RuleID
}
// ExecRange executes alerting rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
// It returns ALERT and ALERT_FOR_STATE time series as result.
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
series, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
if err != nil {
return nil, err
}
var result []prompbmarshal.TimeSeries
qFn := func(query string) ([]datasource.Metric, error) {
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
}
for _, s := range series {
// extra labels could contain templates, so we expand them first
labels, err := expandLabels(s, qFn, ar)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
for k, v := range labels {
// apply extra labels to datasource
// so the hash key will be consistent on restore
s.SetLabel(k, v)
}
a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert
if err != nil {
return nil, fmt.Errorf("failed to create alert: %s", err)
}
if ar.For == 0 { // if alert is instant
a.State = notifier.StateFiring
for i := range s.Values {
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
}
continue
}
// if alert with For > 0
prevT := time.Time{}
//activeAt := time.Time{}
for i := range s.Values {
at := time.Unix(s.Timestamps[i], 0)
if at.Sub(prevT) > ar.EvalInterval {
// reset to Pending if there are gaps > EvalInterval between DPs
a.State = notifier.StatePending
//activeAt = at
a.Start = at
} else if at.Sub(a.Start) >= ar.For {
a.State = notifier.StateFiring
}
prevT = at
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
}
}
return result, nil
}
// Exec executes AlertingRule expression via the given Querier.
// Based on the Querier results AlertingRule maintains notifier.Alerts
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
qMetrics, err := q.Query(ctx, ar.Expr, ar.Type)
func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error) {
qMetrics, err := ar.q.Query(ctx, ar.Expr)
ar.mu.Lock()
defer ar.mu.Unlock()
@@ -139,7 +205,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
}
}
qFn := func(query string) ([]datasource.Metric, error) { return q.Query(ctx, query, ar.Type) }
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query) }
updated := make(map[uint64]struct{})
// update list of active alerts
for _, m := range qMetrics {
@@ -161,9 +227,9 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
}
updated[h] = struct{}{}
if a, ok := ar.alerts[h]; ok {
if a.Value != m.Value {
if a.Value != m.Values[0] {
// update Value field with latest value
a.Value = m.Value
a.Value = m.Values[0]
// and re-exec template since Value can be used
// in annotations
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
@@ -201,10 +267,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
alertsFired.Inc()
}
}
if series {
return ar.toTimeSeries(ar.lastExecTime), nil
}
return nil, nil
return ar.toTimeSeries(ar.lastExecTime.Unix()), nil
}
func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) {
@@ -214,13 +277,13 @@ func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (ma
}
tpl := notifier.AlertTplData{
Labels: metricLabels,
Value: m.Value,
Value: m.Values[0],
Expr: ar.Expr,
}
return notifier.ExecTemplate(q, ar.Labels, tpl)
}
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries
for _, a := range ar.alerts {
if a.State == notifier.StateInactive {
@@ -244,6 +307,8 @@ func (ar *AlertingRule) UpdateWith(r Rule) error {
ar.For = nr.For
ar.Labels = nr.Labels
ar.Annotations = nr.Annotations
ar.EvalInterval = nr.EvalInterval
ar.q = nr.q
return nil
}
@@ -271,13 +336,15 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notif
GroupID: ar.GroupID,
Name: ar.Name,
Labels: map[string]string{},
Value: m.Value,
Value: m.Values[0],
Start: start,
Expr: ar.Expr,
}
// label defined here to make override possible by
// time series labels.
a.Labels[alertGroupNameLabel] = ar.GroupName
if ar.GroupName != "" {
a.Labels[alertGroupNameLabel] = ar.GroupName
}
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
@@ -366,7 +433,7 @@ const (
)
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries
tss = append(tss, alertToTimeSeries(ar.Name, a, timestamp))
if ar.For > 0 {
@@ -375,7 +442,7 @@ func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time
return tss
}
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
func alertToTimeSeries(name string, a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
labels := make(map[string]string)
for k, v := range a.Labels {
labels[k] = v
@@ -383,19 +450,19 @@ func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prom
labels["__name__"] = alertMetricName
labels[alertNameLabel] = name
labels[alertStateLabel] = a.State.String()
return newTimeSeries(1, labels, timestamp)
return newTimeSeries([]float64{1}, []int64{timestamp}, labels)
}
// alertForToTimeSeries returns a timeseries that represents
// state of active alerts, where value is time when alert become active
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
labels := make(map[string]string)
for k, v := range a.Labels {
labels[k] = v
}
labels["__name__"] = alertForStateMetricName
labels[alertNameLabel] = name
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
return newTimeSeries([]float64{float64(a.Start.Unix())}, []int64{timestamp}, labels)
}
// Restore restores the state of active alerts basing on previously written timeseries.
@@ -407,7 +474,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
return fmt.Errorf("querier is nil")
}
qFn := func(query string) ([]datasource.Metric, error) { return q.Query(ctx, query, ar.Type) }
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query) }
// account for external labels in filter
var labelsFilter string
@@ -420,7 +487,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
// remote write protocol which is used for state persistence in vmalert.
expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr, ar.Type)
qMetrics, err := q.Query(ctx, expr)
if err != nil {
return err
}
@@ -437,7 +504,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
m.Labels = append(m.Labels, l)
}
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0), qFn)
a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn)
if err != nil {
return fmt.Errorf("failed to create alert: %w", err)
}

View File

@@ -24,11 +24,11 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
newTestAlertingRule("instant", 0),
&notifier.Alert{State: notifier.StateFiring},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant",
}, timestamp),
}),
},
},
{
@@ -38,13 +38,13 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
"instance": "bar",
}},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant extra labels",
"job": "foo",
"instance": "bar",
}, timestamp),
}),
},
},
{
@@ -54,48 +54,52 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
"__name__": "bar",
}},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant labels override",
}, timestamp),
}),
},
},
{
newTestAlertingRule("for", time.Second),
&notifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "for",
}, timestamp),
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
"__name__": alertForStateMetricName,
alertNameLabel: "for",
}, timestamp),
}),
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
[]int64{timestamp.UnixNano()},
map[string]string{
"__name__": alertForStateMetricName,
alertNameLabel: "for",
}),
},
},
{
newTestAlertingRule("for pending", 10*time.Second),
&notifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StatePending.String(),
alertNameLabel: "for pending",
}, timestamp),
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
"__name__": alertForStateMetricName,
alertNameLabel: "for pending",
}, timestamp),
}),
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
[]int64{timestamp.UnixNano()},
map[string]string{
"__name__": alertForStateMetricName,
alertNameLabel: "for pending",
}),
},
},
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
tc.rule.alerts[tc.alert.ID] = tc.alert
tss := tc.rule.toTimeSeries(timestamp)
tss := tc.rule.toTimeSeries(timestamp.Unix())
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
t.Fatalf("timeseries missmatch: %s", err)
}
@@ -118,7 +122,7 @@ func TestAlertingRule_Exec(t *testing.T) {
{
newTestAlertingRule("empty labels", 0),
[][]datasource.Metric{
{datasource.Metric{}},
{datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}},
},
map[uint64]*notifier.Alert{
hash(datasource.Metric{}): {State: notifier.StateFiring},
@@ -294,11 +298,12 @@ func TestAlertingRule_Exec(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
for _, step := range tc.steps {
fq.reset()
fq.add(step...)
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
if _, err := tc.rule.Exec(context.TODO()); err != nil {
t.Fatalf("unexpected err: %s", err)
}
// artificial delay between applying steps
@@ -320,6 +325,166 @@ func TestAlertingRule_Exec(t *testing.T) {
}
}
func TestAlertingRule_ExecRange(t *testing.T) {
testCases := []struct {
rule *AlertingRule
data []datasource.Metric
expAlerts []*notifier.Alert
}{
{
newTestAlertingRule("empty", 0),
[]datasource.Metric{},
nil,
},
{
newTestAlertingRule("empty labels", 0),
[]datasource.Metric{
{Values: []float64{1}, Timestamps: []int64{1}},
},
[]*notifier.Alert{
{State: notifier.StateFiring},
},
},
{
newTestAlertingRule("single-firing", 0),
[]datasource.Metric{
metricWithLabels(t, "name", "foo"),
},
[]*notifier.Alert{
{
Labels: map[string]string{"name": "foo"},
State: notifier.StateFiring,
},
},
},
{
newTestAlertingRule("single-firing-on-range", 0),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}},
},
[]*notifier.Alert{
{State: notifier.StateFiring},
{State: notifier.StateFiring},
{State: notifier.StateFiring},
},
},
{
newTestAlertingRule("for-pending", time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
},
[]*notifier.Alert{
{State: notifier.StatePending, Start: time.Unix(1, 0)},
{State: notifier.StatePending, Start: time.Unix(3, 0)},
{State: notifier.StatePending, Start: time.Unix(5, 0)},
},
},
{
newTestAlertingRule("for-firing", 3*time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
},
[]*notifier.Alert{
{State: notifier.StatePending, Start: time.Unix(1, 0)},
{State: notifier.StatePending, Start: time.Unix(1, 0)},
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
},
},
{
newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1, 1, 1}, Timestamps: []int64{1, 2, 5, 6, 20}},
},
[]*notifier.Alert{
{State: notifier.StatePending, Start: time.Unix(1, 0)},
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
{State: notifier.StatePending, Start: time.Unix(5, 0)},
{State: notifier.StateFiring, Start: time.Unix(5, 0)},
{State: notifier.StatePending, Start: time.Unix(20, 0)},
},
},
{
newTestAlertingRule("multi-series-for=>pending=>pending=>firing", 3*time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
{Values: []float64{1, 1}, Timestamps: []int64{1, 5},
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
},
},
[]*notifier.Alert{
{State: notifier.StatePending, Start: time.Unix(1, 0)},
{State: notifier.StatePending, Start: time.Unix(1, 0)},
{State: notifier.StateFiring, Start: time.Unix(1, 0)},
//
{State: notifier.StatePending, Start: time.Unix(1, 0),
Labels: map[string]string{
"foo": "bar",
}},
{State: notifier.StatePending, Start: time.Unix(5, 0),
Labels: map[string]string{
"foo": "bar",
}},
},
},
{
newTestRuleWithLabels("multi-series-firing", "source", "vm"),
[]datasource.Metric{
{Values: []float64{1, 1}, Timestamps: []int64{1, 100}},
{Values: []float64{1, 1}, Timestamps: []int64{1, 5},
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
},
},
[]*notifier.Alert{
{State: notifier.StateFiring, Labels: map[string]string{
"source": "vm",
}},
{State: notifier.StateFiring, Labels: map[string]string{
"source": "vm",
}},
//
{State: notifier.StateFiring, Labels: map[string]string{
"foo": "bar",
"source": "vm",
}},
{State: notifier.StateFiring, Labels: map[string]string{
"foo": "bar",
"source": "vm",
}},
},
},
}
fakeGroup := Group{Name: "TestRule_ExecRange"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
fq.add(tc.data...)
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
var expTS []prompbmarshal.TimeSeries
var j int
for _, series := range tc.data {
for _, timestamp := range series.Timestamps {
expTS = append(expTS, tc.rule.alertToTimeSeries(tc.expAlerts[j], timestamp)...)
j++
}
}
if len(gotTS) != len(expTS) {
t.Fatalf("expected %d time series; got %d", len(expTS), len(gotTS))
}
for i := range expTS {
got, exp := gotTS[i], expTS[i]
if !reflect.DeepEqual(got, exp) {
t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got)
}
}
})
}
}
func TestAlertingRule_Restore(t *testing.T) {
testCases := []struct {
rule *AlertingRule
@@ -410,6 +575,7 @@ func TestAlertingRule_Restore(t *testing.T) {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
tc.rule.q = fq
fq.add(tc.metrics...)
if err := tc.rule.Restore(context.TODO(), fq, time.Hour, nil); err != nil {
t.Fatalf("unexpected err: %s", err)
@@ -437,17 +603,18 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
fq := &fakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
// successful attempt
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
_, err := ar.Exec(context.TODO(), fq, false)
_, err := ar.Exec(context.TODO())
if err != nil {
t.Fatal(err)
}
// label `job` will collide with rule extra label and will make both time series equal
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err = ar.Exec(context.TODO(), fq, false)
_, err = ar.Exec(context.TODO())
if !errors.Is(err, errDuplicate) {
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
}
@@ -456,7 +623,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
_, err = ar.Exec(context.TODO(), fq, false)
_, err = ar.Exec(context.TODO())
if err == nil {
t.Fatalf("expected to get err; got nil")
}
@@ -481,17 +648,15 @@ func TestAlertingRule_Template(t *testing.T) {
hash(metricWithLabels(t, "region", "east", "instance", "foo")): {
Annotations: map[string]string{},
Labels: map[string]string{
alertGroupNameLabel: "",
"region": "east",
"instance": "foo",
"region": "east",
"instance": "foo",
},
},
hash(metricWithLabels(t, "region", "east", "instance", "bar")): {
Annotations: map[string]string{},
Labels: map[string]string{
alertGroupNameLabel: "",
"region": "east",
"instance": "bar",
"region": "east",
"instance": "bar",
},
},
},
@@ -516,9 +681,8 @@ func TestAlertingRule_Template(t *testing.T) {
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "region", "east", "instance", "foo")): {
Labels: map[string]string{
alertGroupNameLabel: "",
"instance": "foo",
"region": "east",
"instance": "foo",
"region": "east",
},
Annotations: map[string]string{
"summary": `Too high connection number for "foo" for region east`,
@@ -527,9 +691,8 @@ func TestAlertingRule_Template(t *testing.T) {
},
hash(metricWithLabels(t, "region", "east", "instance", "bar")): {
Labels: map[string]string{
alertGroupNameLabel: "",
"instance": "bar",
"region": "east",
"instance": "bar",
"region": "east",
},
Annotations: map[string]string{
"summary": `Too high connection number for "bar" for region east`,
@@ -544,8 +707,9 @@ func TestAlertingRule_Template(t *testing.T) {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
tc.rule.q = fq
fq.add(tc.metrics...)
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
if _, err := tc.rule.Exec(context.TODO()); err != nil {
t.Fatalf("unexpected err: %s", err)
}
for hash, expAlert := range tc.expAlerts {
@@ -575,5 +739,5 @@ func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
}
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor}
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor, EvalInterval: waitFor}
}

View File

@@ -8,7 +8,6 @@ import (
"path/filepath"
"sort"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
@@ -16,7 +15,6 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metricsql"
"gopkg.in/yaml.v2"
)
@@ -25,10 +23,14 @@ import (
type Group struct {
Type datasource.Type `yaml:"type,omitempty"`
File string
Name string `yaml:"name"`
Interval time.Duration `yaml:"interval,omitempty"`
Rules []Rule `yaml:"rules"`
Concurrency int `yaml:"concurrency"`
Name string `yaml:"name"`
Interval utils.PromDuration `yaml:"interval,omitempty"`
Rules []Rule `yaml:"rules"`
Concurrency int `yaml:"concurrency"`
// ExtraFilterLabels is a list label filters applied to every rule
// request withing a group. Is compatible only with VM datasources.
// See https://docs.victoriametrics.com#prometheus-querying-api-enhancements
ExtraFilterLabels map[string]string `yaml:"extra_filter_labels"`
// Checksum stores the hash of yaml definition for this group.
// May be used to detect any changes like rules re-ordering etc.
Checksum string
@@ -115,54 +117,18 @@ func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
// recording rule or alerting rule.
type Rule struct {
ID uint64
Type datasource.Type `yaml:"type,omitempty"`
Record string `yaml:"record,omitempty"`
Alert string `yaml:"alert,omitempty"`
Expr string `yaml:"expr"`
For PromDuration `yaml:"for"`
Labels map[string]string `yaml:"labels,omitempty"`
Annotations map[string]string `yaml:"annotations,omitempty"`
Type datasource.Type `yaml:"type,omitempty"`
Record string `yaml:"record,omitempty"`
Alert string `yaml:"alert,omitempty"`
Expr string `yaml:"expr"`
For utils.PromDuration `yaml:"for"`
Labels map[string]string `yaml:"labels,omitempty"`
Annotations map[string]string `yaml:"annotations,omitempty"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
// PromDuration is Prometheus duration.
type PromDuration struct {
milliseconds int64
}
// NewPromDuration returns PromDuration for given d.
func NewPromDuration(d time.Duration) PromDuration {
return PromDuration{
milliseconds: d.Milliseconds(),
}
}
// MarshalYAML implements yaml.Marshaler interface.
func (pd PromDuration) MarshalYAML() (interface{}, error) {
return pd.Duration().String(), nil
}
// UnmarshalYAML implements yaml.Unmarshaler interface.
func (pd *PromDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
var s string
if err := unmarshal(&s); err != nil {
return err
}
ms, err := metricsql.DurationValue(s, 0)
if err != nil {
return err
}
pd.milliseconds = ms
return nil
}
// Duration returns duration for pd.
func (pd *PromDuration) Duration() time.Duration {
return time.Duration(pd.milliseconds) * time.Millisecond
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
type rule Rule

View File

@@ -8,10 +8,9 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"gopkg.in/yaml.v2"
)
func TestMain(m *testing.M) {
@@ -264,7 +263,7 @@ func TestGroup_Validate(t *testing.T) {
Rules: []Rule{
{
Expr: "sumSeries(time('foo.bar',10))",
For: PromDuration{milliseconds: 10},
For: utils.NewPromDuration(10 * time.Millisecond),
},
{
Expr: "sum(up == 0 ) by (host)",
@@ -280,7 +279,7 @@ func TestGroup_Validate(t *testing.T) {
Rules: []Rule{
{
Expr: "sum(up == 0 ) by (host)",
For: PromDuration{milliseconds: 10},
For: utils.NewPromDuration(10 * time.Millisecond),
},
{
Expr: "sumSeries(time('foo.bar',10))",
@@ -348,7 +347,7 @@ func TestHashRule(t *testing.T) {
true,
},
{
Rule{Alert: "alert", Expr: "up == 1", For: NewPromDuration(time.Minute)},
Rule{Alert: "alert", Expr: "up == 1", For: utils.NewPromDuration(time.Minute)},
Rule{Alert: "alert", Expr: "up == 1"},
true,
},

View File

@@ -0,0 +1,15 @@
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: |
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
{{ end }}
expr: |
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="openshift-monitoring"})) != 1
for: 5m
labels:
severity: critical

View File

@@ -0,0 +1,39 @@
groups:
- name: ReplayGroup
interval: 1m
concurrency: 1
rules:
- record: type:vm_cache_entries:rate5m
expr: sum(rate(vm_cache_entries[5m])) by (type)
labels:
recording: true
- record: go_cgo_calls_count:rate5m
expr: rate(go_cgo_calls_count{job="vmdb"}[5m])
labels:
recording: true
- name: vmsingleReplay
interval: 30s
concurrency: 2
rules:
- alert: RequestErrorsToAPI
expr: increase(vm_http_request_errors_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }}"
summary: "Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})"
description: "Requests to path {{ $labels.path }} are receiving errors.
Please verify if clients are sending correct requests."
- alert: TooManyLogs
expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) by (job, instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=67&var-instance={{ $labels.instance }}"
summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n
Worth to check logs for specific error messages."

View File

@@ -2,6 +2,8 @@ groups:
- name: TestGroup
interval: 2s
concurrency: 2
extra_filter_labels:
job: victoriametrics
rules:
- alert: Conns
expr: sum(vm_tcplistener_conns) by(instance) > 1

View File

@@ -2,21 +2,33 @@ package datasource
import (
"context"
"time"
)
// Querier interface wraps Query method which
// executes given query and returns list of Metrics
// as result
// Querier interface wraps Query and QueryRange methods
type Querier interface {
Query(ctx context.Context, query string, engine Type) ([]Metric, error)
Query(ctx context.Context, query string) ([]Metric, error)
QueryRange(ctx context.Context, query string, from, to time.Time) ([]Metric, error)
}
// QuerierBuilder builds Querier with given params.
type QuerierBuilder interface {
BuildWithParams(params QuerierParams) Querier
}
// QuerierParams params for Querier.
type QuerierParams struct {
DataSourceType *Type
EvaluationInterval time.Duration
// see https://docs.victoriametrics.com/#prometheus-querying-api-enhancements
ExtraLabels map[string]string
}
// Metric is the basic entity which should be return by datasource
// It represents single data point with full list of labels
type Metric struct {
Labels []Label
Timestamp int64
Value float64
Labels []Label
Timestamps []int64
Values []float64
}
// SetLabel adds or updates existing one label

View File

@@ -0,0 +1,18 @@
package datasource
import "testing"
func TestMetric_Label(t *testing.T) {
m := &Metric{}
m.AddLabel("foo", "bar")
checkEqualString(t, "bar", m.Label("foo"))
m.SetLabel("foo", "baz")
checkEqualString(t, "baz", m.Label("foo"))
m.SetLabel("qux", "quux")
checkEqualString(t, "quux", m.Label("qux"))
checkEqualString(t, "", m.Label("non-existing"))
}

View File

@@ -4,43 +4,59 @@ import (
"flag"
"fmt"
"net/http"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
var (
addr = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
" E.g. http://127.0.0.1:8428")
appendTypePrefix = flag.Bool("datasource.appendTypePrefix", false, "Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to VMSelect URL.")
addr = flag.String("datasource.url", "", "VictoriaMetrics or vmselect url. Required parameter. "+
"E.g. http://127.0.0.1:8428")
appendTypePrefix = flag.Bool("datasource.appendTypePrefix", false, "Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.")
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
tlsCertFile = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
tlsKeyFile = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
tlsCAFile = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
"By default system CA is used")
tlsServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
"By default the server name from -datasource.url is used")
tlsCAFile = flag.String("datasource.tlsCAFile", "", `Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used`)
tlsServerName = flag.String("datasource.tlsServerName", "", `Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used`)
lookBack = flag.Duration("datasource.lookback", 0, "Lookback defines how far to look into past when evaluating queries. "+
"For example, if datasource.lookback=5m then param \"time\" with value now()-5m will be added to every query.")
lookBack = flag.Duration("datasource.lookback", 0, `Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.`)
queryStep = flag.Duration("datasource.queryStep", 0, "queryStep defines how far a value can fallback to when evaluating queries. "+
"For example, if datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query.")
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, "Defines the number of idle (keep-alive connections) to configured datasource."+
"Consider to set this value equal to the value: groups_total * group.concurrency. Too low value may result into high number of sockets in TIME_WAIT state.")
"For example, if datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query."+
"If queryStep isn't specified, rule's evaluationInterval will be used instead.")
maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
roundDigits = flag.Int("datasource.roundDigits", 0, `Adds "round_digits" GET param to datasource requests. `+
`In VM "round_digits" limits the number of digits after the decimal point in response values.`)
)
// Init creates a Querier from provided flag values.
func Init() (Querier, error) {
func Init() (QuerierBuilder, error) {
if *addr == "" {
return nil, fmt.Errorf("datasource.url is empty")
}
tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
if err != nil {
return nil, fmt.Errorf("failed to create transport: %w", err)
}
tr.MaxIdleConns = *maxIdleConnections
c := &http.Client{Transport: tr}
return NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, *lookBack, *queryStep, *appendTypePrefix, c), nil
tr.MaxIdleConnsPerHost = *maxIdleConnections
var rd string
if *roundDigits > 0 {
rd = fmt.Sprintf("%d", *roundDigits)
}
return &VMStorage{
c: &http.Client{Transport: tr},
basicAuthUser: *basicAuthUsername,
basicAuthPass: *basicAuthPassword,
datasourceURL: strings.TrimSuffix(*addr, "/"),
appendTypePrefix: *appendTypePrefix,
lookBack: *lookBack,
queryStep: *queryStep,
roundDigits: rd,
dataSourceType: NewPrometheusType(),
}, nil
}

View File

@@ -2,76 +2,13 @@ package datasource
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"strconv"
"strings"
"time"
)
type response struct {
Status string `json:"status"`
Data struct {
ResultType string `json:"resultType"`
Result []struct {
Labels map[string]string `json:"metric"`
TV [2]interface{} `json:"value"`
} `json:"result"`
} `json:"data"`
ErrorType string `json:"errorType"`
Error string `json:"error"`
}
func (r response) metrics() ([]Metric, error) {
var ms []Metric
var m Metric
var f float64
var err error
for i, res := range r.Data.Result {
f, err = strconv.ParseFloat(res.TV[1].(string), 64)
if err != nil {
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
}
m.Labels = nil
for k, v := range r.Data.Result[i].Labels {
m.AddLabel(k, v)
}
m.Timestamp = int64(res.TV[0].(float64))
m.Value = f
ms = append(ms, m)
}
return ms, nil
}
type graphiteResponse []graphiteResponseTarget
type graphiteResponseTarget struct {
Target string `json:"target"`
Tags map[string]string `json:"tags"`
DataPoints [][2]float64 `json:"datapoints"`
}
func (r graphiteResponse) metrics() []Metric {
var ms []Metric
for _, res := range r {
if len(res.DataPoints) < 1 {
continue
}
var m Metric
// add only last value to the result.
last := res.DataPoints[len(res.DataPoints)-1]
m.Value = last[0]
m.Timestamp = int64(last[1])
for k, v := range res.Tags {
m.AddLabel(k, v)
}
ms = append(ms, m)
}
return ms
}
// VMStorage represents vmstorage entity with ability to read and write metrics
type VMStorage struct {
c *http.Client
@@ -81,13 +18,43 @@ type VMStorage struct {
appendTypePrefix bool
lookBack time.Duration
queryStep time.Duration
roundDigits string
dataSourceType Type
evaluationInterval time.Duration
extraLabels []string
}
const queryPath = "/api/v1/query"
const graphitePath = "/render"
// Clone makes clone of VMStorage, shares http client.
func (s *VMStorage) Clone() *VMStorage {
return &VMStorage{
c: s.c,
datasourceURL: s.datasourceURL,
basicAuthUser: s.basicAuthUser,
basicAuthPass: s.basicAuthPass,
lookBack: s.lookBack,
queryStep: s.queryStep,
appendTypePrefix: s.appendTypePrefix,
dataSourceType: s.dataSourceType,
}
}
const prometheusPrefix = "/prometheus"
const graphitePrefix = "/graphite"
// ApplyParams - changes given querier params.
func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage {
if params.DataSourceType != nil {
s.dataSourceType = *params.DataSourceType
}
s.evaluationInterval = params.EvaluationInterval
for k, v := range params.ExtraLabels {
s.extraLabels = append(s.extraLabels, fmt.Sprintf("%s=%s", k, v))
}
return s
}
// BuildWithParams - implements interface.
func (s *VMStorage) BuildWithParams(params QuerierParams) Querier {
return s.Clone().ApplyParams(params)
}
// NewVMStorage is a constructor for VMStorage
func NewVMStorage(baseURL, basicAuthUser, basicAuthPass string, lookBack time.Duration, queryStep time.Duration, appendTypePrefix bool, c *http.Client) *VMStorage {
@@ -99,27 +66,84 @@ func NewVMStorage(baseURL, basicAuthUser, basicAuthPass string, lookBack time.Du
appendTypePrefix: appendTypePrefix,
lookBack: lookBack,
queryStep: queryStep,
dataSourceType: NewPrometheusType(),
}
}
// Query reads metrics from datasource by given query and type
func (s *VMStorage) Query(ctx context.Context, query string, dataSourceType Type) ([]Metric, error) {
switch dataSourceType.name {
// Query executes the given query and returns parsed response
func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
req, err := s.newRequestPOST()
if err != nil {
return nil, err
}
ts := time.Now()
switch s.dataSourceType.name {
case "", prometheusType:
return s.queryDataSource(ctx, query, s.setPrometheusReqParams, parsePrometheusResponse)
s.setPrometheusInstantReqParams(req, query, ts)
case graphiteType:
return s.queryDataSource(ctx, query, s.setGraphiteReqParams, parseGraphiteResponse)
s.setGraphiteReqParams(req, query, ts)
default:
return nil, fmt.Errorf("engine not found: %q", dataSourceType)
return nil, fmt.Errorf("engine not found: %q", s.dataSourceType.name)
}
resp, err := s.do(ctx, req)
if err != nil {
return nil, err
}
defer func() {
_ = resp.Body.Close()
}()
parseFn := parsePrometheusResponse
if s.dataSourceType.name != prometheusType {
parseFn = parseGraphiteResponse
}
return parseFn(req, resp)
}
func (s *VMStorage) queryDataSource(
ctx context.Context,
query string,
setReqParams func(r *http.Request, query string),
processResponse func(r *http.Request, resp *http.Response,
) ([]Metric, error)) ([]Metric, error) {
// QueryRange executes the given query on the given time range.
// For Prometheus type see https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries
// Graphite type isn't supported.
func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end time.Time) ([]Metric, error) {
if s.dataSourceType.name != prometheusType {
return nil, fmt.Errorf("%q is not supported for QueryRange", s.dataSourceType.name)
}
req, err := s.newRequestPOST()
if err != nil {
return nil, err
}
if start.IsZero() {
return nil, fmt.Errorf("start param is missing")
}
if end.IsZero() {
return nil, fmt.Errorf("end param is missing")
}
s.setPrometheusRangeReqParams(req, query, start, end)
resp, err := s.do(ctx, req)
if err != nil {
return nil, err
}
defer func() {
_ = resp.Body.Close()
}()
return parsePrometheusResponse(req, resp)
}
func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) {
resp, err := s.c.Do(req.WithContext(ctx))
if err != nil {
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
}
if resp.StatusCode != http.StatusOK {
body, _ := ioutil.ReadAll(resp.Body)
_ = resp.Body.Close()
return nil, fmt.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
}
return resp, nil
}
func (s *VMStorage) newRequestPOST() (*http.Request, error) {
req, err := http.NewRequest("POST", s.datasourceURL, nil)
if err != nil {
return nil, err
@@ -128,79 +152,5 @@ func (s *VMStorage) queryDataSource(
if s.basicAuthPass != "" {
req.SetBasicAuth(s.basicAuthUser, s.basicAuthPass)
}
setReqParams(req, query)
resp, err := s.c.Do(req.WithContext(ctx))
if err != nil {
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK {
body, _ := ioutil.ReadAll(resp.Body)
return nil, fmt.Errorf("datasource returns unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
}
return processResponse(req, resp)
}
func (s *VMStorage) setPrometheusReqParams(r *http.Request, query string) {
if s.appendTypePrefix {
r.URL.Path += prometheusPrefix
}
r.URL.Path += queryPath
q := r.URL.Query()
q.Set("query", query)
if s.lookBack > 0 {
lookBack := time.Now().Add(-s.lookBack)
q.Set("time", fmt.Sprintf("%d", lookBack.Unix()))
}
if s.queryStep > 0 {
q.Set("step", s.queryStep.String())
}
r.URL.RawQuery = q.Encode()
}
func (s *VMStorage) setGraphiteReqParams(r *http.Request, query string) {
if s.appendTypePrefix {
r.URL.Path += graphitePrefix
}
r.URL.Path += graphitePath
q := r.URL.Query()
q.Set("format", "json")
q.Set("target", query)
from := "-5min"
if s.lookBack > 0 {
lookBack := time.Now().Add(-s.lookBack)
from = strconv.FormatInt(lookBack.Unix(), 10)
}
q.Set("from", from)
q.Set("until", "now")
r.URL.RawQuery = q.Encode()
}
const (
statusSuccess, statusError, rtVector = "success", "error", "vector"
)
func parsePrometheusResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
r := &response{}
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
return nil, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL, err)
}
if r.Status == statusError {
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
}
if r.Status != statusSuccess {
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
}
if r.Data.ResultType != rtVector {
return nil, fmt.Errorf("unknown result type:%s. Expected vector", r.Data.ResultType)
}
return r.metrics()
}
func parseGraphiteResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
r := &graphiteResponse{}
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
return nil, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL, err)
}
return r.metrics(), nil
return req, nil
}

View File

@@ -0,0 +1,67 @@
package datasource
import (
"encoding/json"
"fmt"
"net/http"
"strconv"
"time"
)
type graphiteResponse []graphiteResponseTarget
type graphiteResponseTarget struct {
Target string `json:"target"`
Tags map[string]string `json:"tags"`
DataPoints [][2]float64 `json:"datapoints"`
}
func (r graphiteResponse) metrics() []Metric {
var ms []Metric
for _, res := range r {
if len(res.DataPoints) < 1 {
continue
}
var m Metric
// add only last value to the result.
last := res.DataPoints[len(res.DataPoints)-1]
m.Values = append(m.Values, last[0])
m.Timestamps = append(m.Timestamps, int64(last[1]))
for k, v := range res.Tags {
m.AddLabel(k, v)
}
ms = append(ms, m)
}
return ms
}
func parseGraphiteResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
r := &graphiteResponse{}
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
return nil, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL, err)
}
return r.metrics(), nil
}
const (
graphitePath = "/render"
graphitePrefix = "/graphite"
)
func (s *VMStorage) setGraphiteReqParams(r *http.Request, query string, timestamp time.Time) {
if s.appendTypePrefix {
r.URL.Path += graphitePrefix
}
r.URL.Path += graphitePath
q := r.URL.Query()
q.Set("format", "json")
q.Set("target", query)
from := "-5min"
if s.lookBack > 0 {
lookBack := timestamp.Add(-s.lookBack)
from = strconv.FormatInt(lookBack.Unix(), 10)
}
q.Set("from", from)
q.Set("until", "now")
r.URL.RawQuery = q.Encode()
}

View File

@@ -0,0 +1,165 @@
package datasource
import (
"encoding/json"
"fmt"
"net/http"
"strconv"
"time"
)
type promResponse struct {
Status string `json:"status"`
ErrorType string `json:"errorType"`
Error string `json:"error"`
Data struct {
ResultType string `json:"resultType"`
Result json.RawMessage `json:"result"`
} `json:"data"`
}
type promInstant struct {
Result []struct {
Labels map[string]string `json:"metric"`
TV [2]interface{} `json:"value"`
} `json:"result"`
}
type promRange struct {
Result []struct {
Labels map[string]string `json:"metric"`
TVs [][2]interface{} `json:"values"`
} `json:"result"`
}
func (r promInstant) metrics() ([]Metric, error) {
var result []Metric
for i, res := range r.Result {
f, err := strconv.ParseFloat(res.TV[1].(string), 64)
if err != nil {
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
}
var m Metric
for k, v := range r.Result[i].Labels {
m.AddLabel(k, v)
}
m.Timestamps = append(m.Timestamps, int64(res.TV[0].(float64)))
m.Values = append(m.Values, f)
result = append(result, m)
}
return result, nil
}
func (r promRange) metrics() ([]Metric, error) {
var result []Metric
for i, res := range r.Result {
var m Metric
for _, tv := range res.TVs {
f, err := strconv.ParseFloat(tv[1].(string), 64)
if err != nil {
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, tv[1], err)
}
m.Values = append(m.Values, f)
m.Timestamps = append(m.Timestamps, int64(tv[0].(float64)))
}
if len(m.Values) < 1 || len(m.Timestamps) < 1 {
return nil, fmt.Errorf("metric %v contains no values", res)
}
m.Labels = nil
for k, v := range r.Result[i].Labels {
m.AddLabel(k, v)
}
result = append(result, m)
}
return result, nil
}
const (
statusSuccess, statusError = "success", "error"
rtVector, rtMatrix = "vector", "matrix"
)
func parsePrometheusResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
r := &promResponse{}
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
return nil, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL, err)
}
if r.Status == statusError {
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
}
if r.Status != statusSuccess {
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
}
switch r.Data.ResultType {
case rtVector:
var pi promInstant
if err := json.Unmarshal(r.Data.Result, &pi.Result); err != nil {
return nil, fmt.Errorf("umarshal err %s; \n %#v", err, string(r.Data.Result))
}
return pi.metrics()
case rtMatrix:
var pr promRange
if err := json.Unmarshal(r.Data.Result, &pr.Result); err != nil {
return nil, err
}
return pr.metrics()
default:
return nil, fmt.Errorf("unknown result type %q", r.Data.ResultType)
}
}
const (
prometheusInstantPath = "/api/v1/query"
prometheusRangePath = "/api/v1/query_range"
prometheusPrefix = "/prometheus"
)
func (s *VMStorage) setPrometheusInstantReqParams(r *http.Request, query string, timestamp time.Time) {
if s.appendTypePrefix {
r.URL.Path += prometheusPrefix
}
r.URL.Path += prometheusInstantPath
q := r.URL.Query()
if s.lookBack > 0 {
timestamp = timestamp.Add(-s.lookBack)
}
if s.evaluationInterval > 0 {
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1232
timestamp = timestamp.Truncate(s.evaluationInterval)
}
q.Set("time", fmt.Sprintf("%d", timestamp.Unix()))
r.URL.RawQuery = q.Encode()
s.setPrometheusReqParams(r, query)
}
func (s *VMStorage) setPrometheusRangeReqParams(r *http.Request, query string, start, end time.Time) {
if s.appendTypePrefix {
r.URL.Path += prometheusPrefix
}
r.URL.Path += prometheusRangePath
q := r.URL.Query()
q.Add("start", fmt.Sprintf("%d", start.Unix()))
q.Add("end", fmt.Sprintf("%d", end.Unix()))
r.URL.RawQuery = q.Encode()
s.setPrometheusReqParams(r, query)
}
func (s *VMStorage) setPrometheusReqParams(r *http.Request, query string) {
q := r.URL.Query()
q.Set("query", query)
if s.evaluationInterval > 0 {
// set step as evaluationInterval by default
q.Set("step", s.evaluationInterval.String())
}
if s.queryStep > 0 {
// override step with user-specified value
q.Set("step", s.queryStep.String())
}
if s.roundDigits != "" {
q.Set("round_digits", s.roundDigits)
}
for _, l := range s.extraLabels {
q.Add("extra_label", l)
}
r.URL.RawQuery = q.Encode()
}

View File

@@ -2,9 +2,12 @@ package datasource
import (
"context"
"fmt"
"net/http"
"net/http/httptest"
"reflect"
"strconv"
"strings"
"testing"
"time"
)
@@ -17,7 +20,7 @@ var (
queryRender = "constantLine(10)"
)
func TestVMSelectQuery(t *testing.T) {
func TestVMInstantQuery(t *testing.T) {
mux := http.NewServeMux()
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
t.Errorf("should not be called")
@@ -63,32 +66,133 @@ func TestVMSelectQuery(t *testing.T) {
case 5:
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix"}}`))
case 6:
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]}]}}`))
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]},{"metric":{"__name__":"vm_requests"},"value":[1583786140,"2000"]}]}}`))
}
})
srv := httptest.NewServer(mux)
defer srv.Close()
am := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, time.Minute, 0, false, srv.Client())
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
s := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, time.Minute, 0, false, srv.Client())
p := NewPrometheusType()
pq := s.BuildWithParams(QuerierParams{DataSourceType: &p, EvaluationInterval: 15 * time.Second})
if _, err := pq.Query(ctx, query); err == nil {
t.Fatalf("expected connection error got nil")
}
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
if _, err := pq.Query(ctx, query); err == nil {
t.Fatalf("expected invalid response status error got nil")
}
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
if _, err := pq.Query(ctx, query); err == nil {
t.Fatalf("expected response body error got nil")
}
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
if _, err := pq.Query(ctx, query); err == nil {
t.Fatalf("expected error status got nil")
}
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
if _, err := pq.Query(ctx, query); err == nil {
t.Fatalf("expected unknown status got nil")
}
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
if _, err := pq.Query(ctx, query); err == nil {
t.Fatalf("expected non-vector resultType error got nil")
}
m, err := am.Query(ctx, query, NewPrometheusType())
m, err := pq.Query(ctx, query)
if err != nil {
t.Fatalf("unexpected %s", err)
}
if len(m) != 2 {
t.Fatalf("expected 2 metrics got %d in %+v", len(m), m)
}
expected := []Metric{
{
Labels: []Label{{Value: "vm_rows", Name: "__name__"}},
Timestamps: []int64{1583786142},
Values: []float64{13763},
},
{
Labels: []Label{{Value: "vm_requests", Name: "__name__"}},
Timestamps: []int64{1583786140},
Values: []float64{2000},
},
}
if !reflect.DeepEqual(m, expected) {
t.Fatalf("unexpected metric %+v want %+v", m, expected)
}
g := NewGraphiteType()
gq := s.BuildWithParams(QuerierParams{DataSourceType: &g})
m, err = gq.Query(ctx, queryRender)
if err != nil {
t.Fatalf("unexpected %s", err)
}
if len(m) != 1 {
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
}
exp := Metric{
Labels: []Label{{Value: "constantLine(10)", Name: "name"}},
Timestamps: []int64{1611758403},
Values: []float64{10},
}
if !reflect.DeepEqual(m[0], exp) {
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
}
}
func TestVMRangeQuery(t *testing.T) {
mux := http.NewServeMux()
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
t.Errorf("should not be called")
})
c := -1
mux.HandleFunc("/api/v1/query_range", func(w http.ResponseWriter, r *http.Request) {
c++
if r.Method != http.MethodPost {
t.Errorf("expected POST method got %s", r.Method)
}
if name, pass, _ := r.BasicAuth(); name != basicAuthName || pass != basicAuthPass {
t.Errorf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass)
}
if r.URL.Query().Get("query") != query {
t.Errorf("expected %s in query param, got %s", query, r.URL.Query().Get("query"))
}
startTS := r.URL.Query().Get("start")
if startTS == "" {
t.Errorf("expected 'start' in query param, got nil instead")
}
if _, err := strconv.ParseInt(startTS, 10, 64); err != nil {
t.Errorf("failed to parse 'start' query param: %s", err)
}
endTS := r.URL.Query().Get("end")
if endTS == "" {
t.Errorf("expected 'end' in query param, got nil instead")
}
if _, err := strconv.ParseInt(endTS, 10, 64); err != nil {
t.Errorf("failed to parse 'end' query param: %s", err)
}
switch c {
case 0:
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"__name__":"vm_rows"},"values":[[1583786142,"13763"]]}]}}`))
}
})
srv := httptest.NewServer(mux)
defer srv.Close()
s := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, time.Minute, 0, false, srv.Client())
p := NewPrometheusType()
pq := s.BuildWithParams(QuerierParams{DataSourceType: &p, EvaluationInterval: 15 * time.Second})
_, err := pq.QueryRange(ctx, query, time.Now(), time.Time{})
expectError(t, err, "is missing")
_, err = pq.QueryRange(ctx, query, time.Time{}, time.Now())
expectError(t, err, "is missing")
start, end := time.Now().Add(-time.Minute), time.Now()
m, err := pq.QueryRange(ctx, query, start, end)
if err != nil {
t.Fatalf("unexpected %s", err)
}
@@ -96,32 +200,263 @@ func TestVMSelectQuery(t *testing.T) {
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
}
expected := Metric{
Labels: []Label{{Value: "vm_rows", Name: "__name__"}},
Timestamp: 1583786142,
Value: 13763,
Labels: []Label{{Value: "vm_rows", Name: "__name__"}},
Timestamps: []int64{1583786142},
Values: []float64{13763},
}
if m[0].Timestamp != expected.Timestamp &&
m[0].Value != expected.Value &&
m[0].Labels[0].Value != expected.Labels[0].Value &&
m[0].Labels[0].Name != expected.Labels[0].Name {
if !reflect.DeepEqual(m[0], expected) {
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
}
m, err = am.Query(ctx, queryRender, NewGraphiteType())
if err != nil {
t.Fatalf("unexpected %s", err)
g := NewGraphiteType()
gq := s.BuildWithParams(QuerierParams{DataSourceType: &g})
_, err = gq.QueryRange(ctx, queryRender, start, end)
expectError(t, err, "is not supported")
}
func TestRequestParams(t *testing.T) {
query := "up"
timestamp := time.Date(2001, 2, 3, 4, 5, 6, 0, time.UTC)
testCases := []struct {
name string
queryRange bool
vm *VMStorage
checkFn func(t *testing.T, r *http.Request)
}{
{
"prometheus path",
false,
&VMStorage{
dataSourceType: NewPrometheusType(),
},
func(t *testing.T, r *http.Request) {
checkEqualString(t, prometheusInstantPath, r.URL.Path)
},
},
{
"prometheus prefix",
false,
&VMStorage{
dataSourceType: NewPrometheusType(),
appendTypePrefix: true,
},
func(t *testing.T, r *http.Request) {
checkEqualString(t, prometheusPrefix+prometheusInstantPath, r.URL.Path)
},
},
{
"prometheus range path",
true,
&VMStorage{
dataSourceType: NewPrometheusType(),
},
func(t *testing.T, r *http.Request) {
checkEqualString(t, prometheusRangePath, r.URL.Path)
},
},
{
"prometheus range prefix",
true,
&VMStorage{
dataSourceType: NewPrometheusType(),
appendTypePrefix: true,
},
func(t *testing.T, r *http.Request) {
checkEqualString(t, prometheusPrefix+prometheusRangePath, r.URL.Path)
},
},
{
"graphite path",
false,
&VMStorage{
dataSourceType: NewGraphiteType(),
},
func(t *testing.T, r *http.Request) {
checkEqualString(t, graphitePath, r.URL.Path)
},
},
{
"graphite prefix",
false,
&VMStorage{
dataSourceType: NewGraphiteType(),
appendTypePrefix: true,
},
func(t *testing.T, r *http.Request) {
checkEqualString(t, graphitePrefix+graphitePath, r.URL.Path)
},
},
{
"default params",
false,
&VMStorage{},
func(t *testing.T, r *http.Request) {
exp := fmt.Sprintf("query=%s&time=%d", query, timestamp.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"default range params",
true,
&VMStorage{},
func(t *testing.T, r *http.Request) {
exp := fmt.Sprintf("end=%d&query=%s&start=%d", timestamp.Unix(), query, timestamp.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"basic auth",
false,
&VMStorage{
basicAuthUser: "foo",
basicAuthPass: "bar",
},
func(t *testing.T, r *http.Request) {
u, p, _ := r.BasicAuth()
checkEqualString(t, "foo", u)
checkEqualString(t, "bar", p)
},
},
{
"basic auth range",
true,
&VMStorage{
basicAuthUser: "foo",
basicAuthPass: "bar",
},
func(t *testing.T, r *http.Request) {
u, p, _ := r.BasicAuth()
checkEqualString(t, "foo", u)
checkEqualString(t, "bar", p)
},
},
{
"lookback",
false,
&VMStorage{
lookBack: time.Minute,
},
func(t *testing.T, r *http.Request) {
exp := fmt.Sprintf("query=%s&time=%d", query, timestamp.Add(-time.Minute).Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"evaluation interval",
false,
&VMStorage{
evaluationInterval: 15 * time.Second,
},
func(t *testing.T, r *http.Request) {
evalInterval := 15 * time.Second
tt := timestamp.Truncate(evalInterval)
exp := fmt.Sprintf("query=%s&step=%v&time=%d", query, evalInterval, tt.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"lookback + evaluation interval",
false,
&VMStorage{
lookBack: time.Minute,
evaluationInterval: 15 * time.Second,
},
func(t *testing.T, r *http.Request) {
evalInterval := 15 * time.Second
tt := timestamp.Add(-time.Minute)
tt = tt.Truncate(evalInterval)
exp := fmt.Sprintf("query=%s&step=%v&time=%d", query, evalInterval, tt.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"step override",
false,
&VMStorage{
queryStep: time.Minute,
},
func(t *testing.T, r *http.Request) {
exp := fmt.Sprintf("query=%s&step=%v&time=%d", query, time.Minute, timestamp.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"round digits",
false,
&VMStorage{
roundDigits: "10",
},
func(t *testing.T, r *http.Request) {
exp := fmt.Sprintf("query=%s&round_digits=10&time=%d", query, timestamp.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"extra labels",
false,
&VMStorage{
extraLabels: []string{
"env=prod",
"query=es=cape",
},
},
func(t *testing.T, r *http.Request) {
exp := fmt.Sprintf("extra_label=env%%3Dprod&extra_label=query%%3Des%%3Dcape&query=%s&time=%d", query, timestamp.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
{
"extra labels range",
true,
&VMStorage{
extraLabels: []string{
"env=prod",
"query=es=cape",
},
},
func(t *testing.T, r *http.Request) {
exp := fmt.Sprintf("end=%d&extra_label=env%%3Dprod&extra_label=query%%3Des%%3Dcape&query=%s&start=%d",
timestamp.Unix(), query, timestamp.Unix())
checkEqualString(t, exp, r.URL.RawQuery)
},
},
}
if len(m) != 1 {
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
}
expected = Metric{
Labels: []Label{{Value: "constantLine(10)", Name: "name"}},
Timestamp: 1611758403,
Value: 10,
}
if m[0].Timestamp != expected.Timestamp &&
m[0].Value != expected.Value &&
m[0].Labels[0].Value != expected.Labels[0].Value &&
m[0].Labels[0].Name != expected.Labels[0].Name {
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
req, err := tc.vm.newRequestPOST()
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
switch tc.vm.dataSourceType.name {
case "", prometheusType:
if tc.queryRange {
tc.vm.setPrometheusRangeReqParams(req, query, timestamp, timestamp)
} else {
tc.vm.setPrometheusInstantReqParams(req, query, timestamp)
}
case graphiteType:
tc.vm.setGraphiteReqParams(req, query, timestamp)
}
tc.checkFn(t, req)
})
}
}
func checkEqualString(t *testing.T, exp, got string) {
t.Helper()
if got != exp {
t.Errorf("expected to get %q; got %q", exp, got)
}
}
func expectError(t *testing.T, err error, exp string) {
t.Helper()
if err == nil {
t.Errorf("expected non-nil error")
}
if !strings.Contains(err.Error(), exp) {
t.Errorf("expected error %q to contain %q", err, exp)
}
}

View File

@@ -18,14 +18,15 @@ import (
// Group is an entity for grouping rules
type Group struct {
mu sync.RWMutex
Name string
File string
Rules []Rule
Type datasource.Type
Interval time.Duration
Concurrency int
Checksum string
mu sync.RWMutex
Name string
File string
Rules []Rule
Type datasource.Type
Interval time.Duration
Concurrency int
Checksum string
ExtraFilterLabels map[string]string
doneCh chan struct{}
finishedCh chan struct{}
@@ -49,17 +50,19 @@ func newGroupMetrics(name, file string) *groupMetrics {
return m
}
func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string]string) *Group {
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
g := &Group{
Type: cfg.Type,
Name: cfg.Name,
File: cfg.File,
Interval: cfg.Interval,
Concurrency: cfg.Concurrency,
Checksum: cfg.Checksum,
doneCh: make(chan struct{}),
finishedCh: make(chan struct{}),
updateCh: make(chan *Group),
Type: cfg.Type,
Name: cfg.Name,
File: cfg.File,
Interval: cfg.Interval.Duration(),
Concurrency: cfg.Concurrency,
Checksum: cfg.Checksum,
ExtraFilterLabels: cfg.ExtraFilterLabels,
doneCh: make(chan struct{}),
finishedCh: make(chan struct{}),
updateCh: make(chan *Group),
}
g.metrics = newGroupMetrics(g.Name, g.File)
if g.Interval == 0 {
@@ -81,17 +84,17 @@ func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string
}
r.Labels[k] = v
}
rules[i] = g.newRule(r)
rules[i] = g.newRule(qb, r)
}
g.Rules = rules
return g
}
func (g *Group) newRule(rule config.Rule) Rule {
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
if rule.Alert != "" {
return newAlertingRule(g, rule)
return newAlertingRule(qb, g, rule)
}
return newRecordingRule(g, rule)
return newRecordingRule(qb, g, rule)
}
// ID return unique group ID that consists of
@@ -106,7 +109,7 @@ func (g *Group) ID() uint64 {
}
// Restore restores alerts state for group rules
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, lookback time.Duration, labels map[string]string) error {
for _, rule := range g.Rules {
rr, ok := rule.(*AlertingRule)
if !ok {
@@ -115,6 +118,9 @@ func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time
if rr.For < 1 {
continue
}
// ignore g.ExtraFilterLabels on purpose, so it
// won't affect the restore procedure.
q := qb.BuildWithParams(datasource.QuerierParams{})
if err := rr.Restore(ctx, q, lookback, labels); err != nil {
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
}
@@ -162,6 +168,7 @@ func (g *Group) updateWith(newGroup *Group) error {
}
g.Type = newGroup.Type
g.Concurrency = newGroup.Concurrency
g.ExtraFilterLabels = newGroup.ExtraFilterLabels
g.Checksum = newGroup.Checksum
g.Rules = newRules
return nil
@@ -189,7 +196,7 @@ func (g *Group) close() {
var skipRandSleepOnGroupStart bool
func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []notifier.Notifier, rw *remotewrite.Client) {
func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewrite.Client) {
defer func() { close(g.finishedCh) }()
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
@@ -213,7 +220,7 @@ func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []not
}
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
e := &executor{querier, nts, rw}
e := &executor{nts, rw}
t := time.NewTicker(g.Interval)
defer t.Stop()
for {
@@ -256,22 +263,16 @@ func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []not
}
type executor struct {
querier datasource.Querier
notifiers []notifier.Notifier
rw *remotewrite.Client
}
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
res := make(chan error, len(rules))
var returnSeries bool
if e.rw != nil {
returnSeries = true
}
if concurrency == 1 {
// fast path
for _, rule := range rules {
res <- e.exec(ctx, rule, returnSeries, interval)
res <- e.exec(ctx, rule, interval)
}
close(res)
return res
@@ -284,7 +285,7 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
sem <- struct{}{}
wg.Add(1)
go func(r Rule) {
res <- e.exec(ctx, r, returnSeries, interval)
res <- e.exec(ctx, r, interval)
<-sem
wg.Done()
}(rule)
@@ -303,14 +304,14 @@ var (
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
)
func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, interval time.Duration) error {
func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error {
execTotal.Inc()
execStart := time.Now()
defer func() {
execDuration.UpdateDuration(execStart)
}()
tss, err := rule.Exec(ctx, e.querier, returnSeries)
tss, err := rule.Exec(ctx)
if err != nil {
execErrors.Inc()
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)

View File

@@ -7,7 +7,9 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
func init() {
@@ -32,7 +34,7 @@ func TestUpdateWith(t *testing.T) {
[]config.Rule{{
Alert: "foo",
Expr: "up > 0",
For: config.NewPromDuration(time.Second),
For: utils.NewPromDuration(time.Second),
Labels: map[string]string{
"bar": "baz",
},
@@ -44,7 +46,7 @@ func TestUpdateWith(t *testing.T) {
[]config.Rule{{
Alert: "foo",
Expr: "up > 10",
For: config.NewPromDuration(time.Second),
For: utils.NewPromDuration(time.Second),
Labels: map[string]string{
"baz": "bar",
},
@@ -105,20 +107,32 @@ func TestUpdateWith(t *testing.T) {
{Record: "foo5"},
},
},
{
"update datasource type",
[]config.Rule{
{Alert: "foo1", Type: datasource.NewPrometheusType()},
{Alert: "foo3", Type: datasource.NewGraphiteType()},
},
[]config.Rule{
{Alert: "foo1", Type: datasource.NewGraphiteType()},
{Alert: "foo10", Type: datasource.NewPrometheusType()},
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := &Group{Name: "test"}
qb := &fakeQuerier{}
for _, r := range tc.currentRules {
r.ID = config.HashRule(r)
g.Rules = append(g.Rules, g.newRule(r))
g.Rules = append(g.Rules, g.newRule(qb, r))
}
ng := &Group{Name: "test"}
for _, r := range tc.newRules {
r.ID = config.HashRule(r)
ng.Rules = append(ng.Rules, ng.newRule(r))
ng.Rules = append(ng.Rules, ng.newRule(qb, r))
}
err := g.updateWith(ng)
@@ -156,11 +170,11 @@ func TestGroupStart(t *testing.T) {
t.Fatalf("failed to parse rules: %s", err)
}
const evalInterval = time.Millisecond
g := newGroup(groups[0], evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2
fn := &fakeNotifier{}
fs := &fakeQuerier{}
fn := &fakeNotifier{}
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2
const inst1, inst2, job = "foo", "bar", "baz"
m1 := metricWithLabels(t, "instance", inst1, "job", job)
@@ -195,7 +209,7 @@ func TestGroupStart(t *testing.T) {
fs.add(m1)
fs.add(m2)
go func() {
g.start(context.Background(), fs, []notifier.Notifier{fn}, nil)
g.start(context.Background(), []notifier.Notifier{fn}, nil)
close(finished)
}()

View File

@@ -7,6 +7,7 @@ import (
"sort"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
@@ -38,7 +39,15 @@ func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
fq.Unlock()
}
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ datasource.Type) ([]datasource.Metric, error) {
func (fq *fakeQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fq
}
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) ([]datasource.Metric, error) {
return fq.Query(ctx, q)
}
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {
@@ -68,9 +77,16 @@ func (fn *fakeNotifier) getAlerts() []notifier.Alert {
}
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
return metricWithValuesAndLabels(t, []float64{value}, labels...)
}
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
t.Helper()
m := metricWithLabels(t, labels...)
m.Value = value
m.Values = values
for i := range values {
m.Timestamps = append(m.Timestamps, int64(i))
}
return m
}
@@ -79,7 +95,7 @@ func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
m := datasource.Metric{}
m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
for i := 0; i < len(labels); i += 2 {
m.Labels = append(m.Labels, datasource.Label{
Name: labels[i],
@@ -160,6 +176,9 @@ func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
if !reflect.DeepEqual(a.Labels, b.Labels) {
return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
}
if a.Type.String() != b.Type.String() {
return fmt.Errorf("expected to have Type %#v; got %#v", a.Type.String(), b.Type.String())
}
return nil
}

View File

@@ -26,14 +26,17 @@ import (
)
var (
rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule="/path/to/file". Path to a single file with alerting rules
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.`)
rulesCheckInterval = flag.Duration("rule.configCheckInterval", 0, "Interval for checking for changes in '-rule' files. "+
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes")
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
@@ -41,12 +44,13 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'name=value' to add to all generated recording rules and alerts. "+
"Pass multiple -label flags in order to add multiple label sets.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup.")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The `-rule` flag must be specified.")
)
@@ -64,42 +68,54 @@ func main() {
notifier.InitTemplateFunc(u)
groups, err := config.Parse(*rulePath, true, true)
if err != nil {
logger.Fatalf(err.Error())
logger.Fatalf("failed to parse %q: %s", *rulePath, err)
}
if len(groups) == 0 {
logger.Fatalf("No rules for validation. Please specify path to file(s) with alerting and/or recording rules using `-rule` flag")
}
return
}
if *replayFrom != "" || *replayTo != "" {
rw, err := remotewrite.Init(context.Background())
if err != nil {
logger.Fatalf("failed to init remoteWrite: %s", err)
}
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
if err != nil {
logger.Fatalf("failed to init `external.url`: %s", err)
}
notifier.InitTemplateFunc(eu)
groupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions)
if err != nil {
logger.Fatalf("cannot parse configuration file: %s", err)
}
q, err := datasource.Init()
if err != nil {
logger.Fatalf("failed to init datasource: %s", err)
}
if err := replay(groupsCfg, q, rw); err != nil {
logger.Fatalf("replay failed: %s", err)
}
return
}
ctx, cancel := context.WithCancel(context.Background())
manager, err := newManager(ctx)
if err != nil {
logger.Fatalf("failed to init: %s", err)
}
if err := manager.start(ctx, *rulePath, *validateTemplates, *validateExpressions); err != nil {
logger.Infof("reading rules configuration file from %q", strings.Join(*rulePath, ";"))
groupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions)
if err != nil {
logger.Fatalf("cannot parse configuration file: %s", err)
}
if err := manager.start(ctx, groupsCfg); err != nil {
logger.Fatalf("failed to start: %s", err)
}
go func() {
// init reload metrics with positive values to improve alerting conditions
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
sigHup := procutil.NewSighupChan()
for {
<-sigHup
configReloads.Inc()
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
if err := manager.update(ctx, *rulePath, *validateTemplates, *validateExpressions, false); err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("error while reloading rules: %s", err)
continue
}
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
logger.Infof("Rules reloaded successfully from %q", *rulePath)
}
}()
go configReload(ctx, manager, groupsCfg)
rh := &requestHandler{m: manager}
go httpserver.Serve(*httpListenAddr, rh.handler)
@@ -140,10 +156,10 @@ func newManager(ctx context.Context) (*manager, error) {
}
manager := &manager{
groups: make(map[uint64]*Group),
querier: q,
notifiers: nts,
labels: map[string]string{},
groups: make(map[uint64]*Group),
querierBuilder: q,
notifiers: nts,
labels: map[string]string{},
}
rw, err := remotewrite.Init(ctx)
if err != nil {
@@ -218,7 +234,66 @@ func usage() {
const s = `
vmalert processes alerts and recording rules.
See the docs at https://victoriametrics.github.io/vmalert.html .
See the docs at https://docs.victoriametrics.com/vmalert.html .
`
flagutil.Usage(s)
}
func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) {
// Register SIGHUP handler for config re-read just before manager.start call.
// This guarantees that the config will be re-read if the signal arrives during manager.start call.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
sighupCh := procutil.NewSighupChan()
var configCheckCh <-chan time.Time
if *rulesCheckInterval > 0 {
ticker := time.NewTicker(*rulesCheckInterval)
configCheckCh = ticker.C
defer ticker.Stop()
}
// init reload metrics with positive values to improve alerting conditions
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
for {
select {
case <-ctx.Done():
return
case <-sighupCh:
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
configReloads.Inc()
case <-configCheckCh:
}
newGroupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions)
if err != nil {
logger.Errorf("cannot parse configuration file: %s", err)
continue
}
if configsEqual(newGroupsCfg, groupsCfg) {
// config didn't change - skip it
continue
}
groupsCfg = newGroupsCfg
if err := m.update(ctx, groupsCfg, false); err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("error while reloading rules: %s", err)
continue
}
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
logger.Infof("Rules reloaded successfully from %q", *rulePath)
}
}
func configsEqual(a, b []config.Group) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if a[i].Checksum != b[i].Checksum {
return false
}
}
return true
}

View File

@@ -1,12 +1,16 @@
package main
import (
"context"
"fmt"
"io/ioutil"
"net/url"
"os"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
func TestGetExternalURL(t *testing.T) {
@@ -51,3 +55,95 @@ func TestGetAlertURLGenerator(t *testing.T) {
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
}
}
func TestConfigReload(t *testing.T) {
originalRulePath := *rulePath
defer func() {
*rulePath = originalRulePath
}()
const (
rules1 = `
groups:
- name: group-1
rules:
- alert: ExampleAlertAlwaysFiring
expr: sum by(job) (up == 1)
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
`
rules2 = `
groups:
- name: group-1
rules:
- alert: ExampleAlertAlwaysFiring
expr: sum by(job) (up == 1)
- name: group-2
rules:
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
`
)
f, err := ioutil.TempFile("", "")
if err != nil {
t.Fatal(err)
}
writeToFile(t, f.Name(), rules1)
*rulesCheckInterval = 200 * time.Millisecond
*rulePath = []string{f.Name()}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
m := &manager{
querierBuilder: &fakeQuerier{},
groups: make(map[uint64]*Group),
labels: map[string]string{},
}
go configReload(ctx, m, nil)
lenLocked := func(m *manager) int {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
return len(m.groups)
}
time.Sleep(*rulesCheckInterval * 2)
groupsLen := lenLocked(m)
if groupsLen != 1 {
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
}
writeToFile(t, f.Name(), rules2)
time.Sleep(*rulesCheckInterval * 2)
groupsLen = lenLocked(m)
if groupsLen != 2 {
fmt.Println(m.groups)
t.Fatalf("expected to have exactly 2 groups loaded; got %d", groupsLen)
}
writeToFile(t, f.Name(), rules1)
procutil.SelfSIGHUP()
time.Sleep(*rulesCheckInterval / 2)
groupsLen = lenLocked(m)
if groupsLen != 1 {
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
}
writeToFile(t, f.Name(), `corrupted`)
procutil.SelfSIGHUP()
time.Sleep(*rulesCheckInterval / 2)
groupsLen = lenLocked(m)
if groupsLen != 1 { // should remain unchanged
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
}
}
func writeToFile(t *testing.T, file, b string) {
t.Helper()
err := ioutil.WriteFile(file, []byte(b), 0644)
if err != nil {
t.Fatal(err)
}
}

View File

@@ -3,7 +3,6 @@ package main
import (
"context"
"fmt"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
@@ -15,11 +14,12 @@ import (
// manager controls group states
type manager struct {
querier datasource.Querier
notifiers []notifier.Notifier
querierBuilder datasource.QuerierBuilder
notifiers []notifier.Notifier
rw *remotewrite.Client
rr datasource.Querier
// remote read builder.
rr datasource.QuerierBuilder
wg sync.WaitGroup
labels map[string]string
@@ -49,8 +49,8 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
}
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
return m.update(ctx, path, validateTpl, validateExpr, true)
func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {
return m.update(ctx, groupsCfg, true)
}
func (m *manager) close() {
@@ -63,10 +63,13 @@ func (m *manager) close() {
m.wg.Wait()
}
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) error {
if restore && m.rr != nil {
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
if err != nil {
if !*remoteReadIgnoreRestoreErrors {
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
}
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
}
}
@@ -74,22 +77,17 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
m.wg.Add(1)
id := group.ID()
go func() {
group.start(ctx, m.querier, m.notifiers, m.rw)
group.start(ctx, m.notifiers, m.rw)
m.wg.Done()
}()
m.groups[id] = group
return nil
}
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
groupsCfg, err := config.Parse(path, validateTpl, validateExpr)
if err != nil {
return fmt.Errorf("cannot parse configuration file: %w", err)
}
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
groupsRegistry := make(map[uint64]*Group)
for _, cfg := range groupsCfg {
ng := newGroup(cfg, *evaluationInterval, m.labels)
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
groupsRegistry[ng.ID()] = ng
}
@@ -116,7 +114,9 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida
}
}
for _, ng := range groupsRegistry {
m.startGroup(ctx, ng, restore)
if err := m.startGroup(ctx, ng, restore); err != nil {
return err
}
}
m.groupsMu.Unlock()
@@ -140,12 +140,14 @@ func (g *Group) toAPI() APIGroup {
ag := APIGroup{
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.String(),
Concurrency: g.Concurrency,
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.String(),
Concurrency: g.Concurrency,
ExtraFilterLabels: g.ExtraFilterLabels,
}
for _, r := range g.Rules {
switch v := r.(type) {

View File

@@ -9,8 +9,8 @@ import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
@@ -25,9 +25,8 @@ func TestMain(m *testing.M) {
// starting with empty rules folder
func TestManagerEmptyRulesDir(t *testing.T) {
m := &manager{groups: make(map[uint64]*Group)}
path := []string{"foo/bar"}
err := m.update(context.Background(), path, true, true, false)
if err != nil {
cfg := loadCfg(t, []string{"foo/bar"}, true, true)
if err := m.update(context.Background(), cfg, false); err != nil {
t.Fatalf("expected to load succesfully with empty rules dir; got err instead: %v", err)
}
}
@@ -37,9 +36,9 @@ func TestManagerEmptyRulesDir(t *testing.T) {
// Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
querier: &fakeQuerier{},
notifiers: []notifier.Notifier{&fakeNotifier{}},
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
notifiers: []notifier.Notifier{&fakeNotifier{}},
}
paths := []string{
"config/testdata/dir/rules0-good.rules",
@@ -50,8 +49,11 @@ func TestManagerUpdateConcurrent(t *testing.T) {
"config/testdata/rules1-good.rules",
"config/testdata/rules2-good.rules",
}
evalInterval := *evaluationInterval
defer func() { *evaluationInterval = evalInterval }()
*evaluationInterval = time.Millisecond
if err := m.start(context.Background(), []string{paths[0]}, true, true); err != nil {
cfg := loadCfg(t, []string{paths[0]}, true, true)
if err := m.start(context.Background(), cfg); err != nil {
t.Fatalf("failed to start: %s", err)
}
@@ -64,8 +66,11 @@ func TestManagerUpdateConcurrent(t *testing.T) {
defer wg.Done()
for i := 0; i < iterations; i++ {
rnd := rand.Intn(len(paths))
path := []string{paths[rnd]}
_ = m.update(context.Background(), path, true, true, false)
cfg, err := config.Parse([]string{paths[rnd]}, true, true)
if err != nil { // update can fail and this is expected
continue
}
_ = m.update(context.Background(), cfg, false)
}
}()
}
@@ -242,14 +247,17 @@ func TestManagerUpdate(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
m := &manager{groups: make(map[uint64]*Group), querier: &fakeQuerier{}}
path := []string{tc.initPath}
if err := m.update(ctx, path, true, true, false); err != nil {
m := &manager{groups: make(map[uint64]*Group), querierBuilder: &fakeQuerier{}}
cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
if err := m.update(ctx, cfgInit, false); err != nil {
t.Fatalf("failed to complete initial rules update: %s", err)
}
path = []string{tc.updatePath}
_ = m.update(ctx, path, true, true, false)
cfgUpdate, err := config.Parse([]string{tc.updatePath}, true, true)
if err == nil { // update can fail and that's expected
_ = m.update(ctx, cfgUpdate, false)
}
if len(tc.want) != len(m.groups) {
t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups))
}
@@ -267,3 +275,12 @@ func TestManagerUpdate(t *testing.T) {
})
}
}
func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressions bool) []config.Group {
t.Helper()
cfg, err := config.Parse(path, validateAnnotations, validateExpressions)
if err != nil {
t.Fatal(err)
}
return cfg
}

View File

@@ -83,14 +83,16 @@ func TestAlert_ExecTemplate(t *testing.T) {
{Name: "foo", Value: "bar"},
{Name: "baz", Value: "qux"},
},
Value: 1,
Values: []float64{1},
Timestamps: []int64{1},
},
{
Labels: []datasource.Label{
{Name: "foo", Value: "garply"},
{Name: "baz", Value: "fred"},
},
Value: 2,
Values: []float64{2},
Timestamps: []int64{1},
},
}, nil
}

View File

@@ -28,44 +28,73 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
// metric is private copy of datasource.Metric,
// it is used for templating annotations,
// Labels as map simplifies templates evaluation.
type metric struct {
Labels map[string]string
Timestamp int64
Value float64
}
// datasourceMetricsToTemplateMetrics converts Metrics from datasource package to private copy for templating.
func datasourceMetricsToTemplateMetrics(ms []datasource.Metric) []metric {
mss := make([]metric, 0, len(ms))
for _, m := range ms {
labelsMap := make(map[string]string, len(m.Labels))
for _, labelValue := range m.Labels {
labelsMap[labelValue.Name] = labelValue.Value
}
mss = append(mss, metric{
Labels: labelsMap,
Timestamp: m.Timestamps[0],
Value: m.Values[0]})
}
return mss
}
// QueryFn is used to wrap a call to datasource into simple-to-use function
// for templating functions.
type QueryFn func(query string) ([]datasource.Metric, error)
func funcsWithQuery(query QueryFn) textTpl.FuncMap {
fm := make(textTpl.FuncMap)
for k, fn := range tmplFunc {
fm[k] = fn
}
fm["query"] = func(q string) ([]datasource.Metric, error) {
return query(q)
}
return fm
}
var tmplFunc textTpl.FuncMap
// InitTemplateFunc initiates template helper functions
func InitTemplateFunc(externalURL *url.URL) {
tmplFunc = textTpl.FuncMap{
"args": func(args ...interface{}) map[string]interface{} {
result := make(map[string]interface{})
for i, a := range args {
result[fmt.Sprintf("arg%d", i)] = a
}
return result
},
/* Strings */
// reReplaceAll ReplaceAllString returns a copy of src, replacing matches of the Regexp with
// the replacement string repl. Inside repl, $ signs are interpreted as in Expand,
// so for instance $1 represents the text of the first submatch.
// alias for https://golang.org/pkg/regexp/#Regexp.ReplaceAllString
"reReplaceAll": func(pattern, repl, text string) string {
re := regexp.MustCompile(pattern)
return re.ReplaceAllString(text, repl)
},
"safeHtml": func(text string) htmlTpl.HTML {
return htmlTpl.HTML(text)
},
"match": regexp.MatchString,
"title": strings.Title,
// match reports whether the string s
// contains any match of the regular expression pattern.
// alias for https://golang.org/pkg/regexp/#MatchString
"match": regexp.MatchString,
// title returns a copy of the string s with all Unicode letters
// that begin words mapped to their Unicode title case.
// alias for https://golang.org/pkg/strings/#Title
"title": strings.Title,
// toUpper returns s with all Unicode letters mapped to their upper case.
// alias for https://golang.org/pkg/strings/#ToUpper
"toUpper": strings.ToUpper,
// toLower returns s with all Unicode letters mapped to their lower case.
// alias for https://golang.org/pkg/strings/#ToLower
"toLower": strings.ToLower,
/* Numbers */
// humanize converts given number to a human readable format
// by adding metric prefixes https://en.wikipedia.org/wiki/Metric_prefix
"humanize": func(v float64) string {
if v == 0 || math.IsNaN(v) || math.IsInf(v, 0) {
return fmt.Sprintf("%.4g", v)
@@ -91,6 +120,8 @@ func InitTemplateFunc(externalURL *url.URL) {
}
return fmt.Sprintf("%.4g%s", v, prefix)
},
// humanize1024 converts given number to a human readable format with 1024 as base
"humanize1024": func(v float64) string {
if math.Abs(v) <= 1 || math.IsNaN(v) || math.IsInf(v, 0) {
return fmt.Sprintf("%.4g", v)
@@ -105,6 +136,8 @@ func InitTemplateFunc(externalURL *url.URL) {
}
return fmt.Sprintf("%.4g%s", v, prefix)
},
// humanizeDuration converts given seconds to a human readable duration
"humanizeDuration": func(v float64) string {
if math.IsNaN(v) || math.IsInf(v, 0) {
return fmt.Sprintf("%.4g", v)
@@ -145,9 +178,13 @@ func InitTemplateFunc(externalURL *url.URL) {
}
return fmt.Sprintf("%.4g%ss", v, prefix)
},
// humanizePercentage converts given ratio value to a fraction of 100
"humanizePercentage": func(v float64) string {
return fmt.Sprintf("%.4g%%", v*100)
},
// humanizeTimestamp converts given timestamp to a human readable time equivalent
"humanizeTimestamp": func(v float64) string {
if math.IsNaN(v) || math.IsInf(v, 0) {
return fmt.Sprintf("%.4g", v)
@@ -155,48 +192,115 @@ func InitTemplateFunc(externalURL *url.URL) {
t := TimeFromUnixNano(int64(v * 1e9)).Time().UTC()
return fmt.Sprint(t)
},
"pathPrefix": func() string {
return externalURL.Path
},
/* URLs */
// externalURL returns value of `external.url` flag
"externalURL": func() string {
return externalURL.String()
},
// pathPrefix returns a Path segment from the URL value in `external.url` flag
"pathPrefix": func() string {
return externalURL.Path
},
// pathEscape escapes the string so it can be safely placed inside a URL path segment,
// replacing special characters (including /) with %XX sequences as needed.
// alias for https://golang.org/pkg/net/url/#PathEscape
"pathEscape": func(u string) string {
return url.PathEscape(u)
},
// queryEscape escapes the string so it can be safely placed
// inside a URL query.
// alias for https://golang.org/pkg/net/url/#QueryEscape
"queryEscape": func(q string) string {
return url.QueryEscape(q)
},
// crlfEscape replaces new line chars to skip URL encoding.
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/890
"crlfEscape": func(q string) string {
q = strings.Replace(q, "\n", `\n`, -1)
return strings.Replace(q, "\r", `\r`, -1)
},
// quotesEscape escapes quote char
"quotesEscape": func(q string) string {
return strings.Replace(q, `"`, `\"`, -1)
},
// query function supposed to be substituted at funcsWithQuery().
// it is present here only for validation purposes, when there is no
// provided datasource.
"query": func(q string) ([]datasource.Metric, error) {
// query executes the MetricsQL/PromQL query against
// configured `datasource.url` address.
// For example, {{ query "foo" | first | value }} will
// execute "/api/v1/query?query=foo" request and will return
// the first value in response.
"query": func(q string) ([]metric, error) {
// query function supposed to be substituted at funcsWithQuery().
// it is present here only for validation purposes, when there is no
// provided datasource.
//
// return non-empty slice to pass validation with chained functions in template
// see issue #989 for details
return []datasource.Metric{{}}, nil
return []metric{{}}, nil
},
"first": func(metrics []datasource.Metric) (datasource.Metric, error) {
// first returns the first by order element from the given metrics list.
// usually used alongside with `query` template function.
"first": func(metrics []metric) (metric, error) {
if len(metrics) > 0 {
return metrics[0], nil
}
return datasource.Metric{}, errors.New("first() called on vector with no elements")
return metric{}, errors.New("first() called on vector with no elements")
},
"label": func(label string, m datasource.Metric) string {
return m.Label(label)
// label returns the value of the given label name for the given metric.
// usually used alongside with `query` template function.
"label": func(label string, m metric) string {
return m.Labels[label]
},
"value": func(m datasource.Metric) float64 {
// value returns the value of the given metric.
// usually used alongside with `query` template function.
"value": func(m metric) float64 {
return m.Value
},
/* Helpers */
// Converts a list of objects to a map with keys arg0, arg1 etc.
// This is intended to allow multiple arguments to be passed to templates.
"args": func(args ...interface{}) map[string]interface{} {
result := make(map[string]interface{})
for i, a := range args {
result[fmt.Sprintf("arg%d", i)] = a
}
return result
},
// safeHtml marks string as HTML not requiring auto-escaping.
"safeHtml": func(text string) htmlTpl.HTML {
return htmlTpl.HTML(text)
},
}
}
func funcsWithQuery(query QueryFn) textTpl.FuncMap {
fm := make(textTpl.FuncMap)
for k, fn := range tmplFunc {
fm[k] = fn
}
fm["query"] = func(q string) ([]metric, error) {
result, err := query(q)
if err != nil {
return nil, err
}
return datasourceMetricsToTemplateMetrics(result), nil
}
return fm
}
// Time is the number of milliseconds since the epoch
// (1970-01-01 00:00 UTC) excluding leap seconds.
type Time int64

View File

@@ -3,8 +3,8 @@ package main
import (
"context"
"fmt"
"hash/fnv"
"sort"
"strings"
"sync"
"time"
@@ -25,6 +25,8 @@ type RecordingRule struct {
Labels map[string]string
GroupID uint64
q datasource.Querier
// guard status fields
mu sync.RWMutex
// stores last moment of time Exec was called
@@ -52,7 +54,7 @@ func (rr *RecordingRule) ID() uint64 {
return rr.RuleID
}
func newRecordingRule(group *Group, cfg config.Rule) *RecordingRule {
func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
rr := &RecordingRule{
Type: cfg.Type,
RuleID: cfg.ID,
@@ -61,6 +63,11 @@ func newRecordingRule(group *Group, cfg config.Rule) *RecordingRule {
Labels: cfg.Labels,
GroupID: group.ID(),
metrics: &recordingRuleMetrics{},
q: qb.BuildWithParams(datasource.QuerierParams{
DataSourceType: &cfg.Type,
EvaluationInterval: group.Interval,
ExtraLabels: group.ExtraFilterLabels,
}),
}
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
@@ -81,13 +88,31 @@ func (rr *RecordingRule) Close() {
metrics.UnregisterMetric(rr.metrics.errors.name)
}
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
if !series {
return nil, nil
// ExecRange executes recording rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
series, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
if err != nil {
return nil, err
}
duplicates := make(map[string]struct{}, len(series))
var tss []prompbmarshal.TimeSeries
for _, s := range series {
ts := rr.toTimeSeries(s)
key := stringifyLabels(ts)
if _, ok := duplicates[key]; ok {
return nil, fmt.Errorf("original metric %v; resulting labels %q: %w", s.Labels, key, errDuplicate)
}
duplicates[key] = struct{}{}
tss = append(tss, ts)
}
return tss, nil
}
qMetrics, err := q.Query(ctx, rr.Expr, rr.Type)
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error) {
qMetrics, err := rr.q.Query(ctx, rr.Expr)
rr.mu.Lock()
defer rr.mu.Unlock()
@@ -97,36 +122,41 @@ func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
}
duplicates := make(map[uint64]prompbmarshal.TimeSeries, len(qMetrics))
duplicates := make(map[string]struct{}, len(qMetrics))
var tss []prompbmarshal.TimeSeries
for _, r := range qMetrics {
ts := rr.toTimeSeries(r, rr.lastExecTime)
h := hashTimeSeries(ts)
if _, ok := duplicates[h]; ok {
ts := rr.toTimeSeries(r)
key := stringifyLabels(ts)
if _, ok := duplicates[key]; ok {
rr.lastExecError = errDuplicate
return nil, errDuplicate
return nil, fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
}
duplicates[h] = ts
duplicates[key] = struct{}{}
tss = append(tss, ts)
}
return tss, nil
}
func hashTimeSeries(ts prompbmarshal.TimeSeries) uint64 {
hash := fnv.New64a()
func stringifyLabels(ts prompbmarshal.TimeSeries) string {
labels := ts.Labels
sort.Slice(labels, func(i, j int) bool {
return labels[i].Name < labels[j].Name
})
for _, l := range labels {
hash.Write([]byte(l.Name))
hash.Write([]byte(l.Value))
hash.Write([]byte("\xff"))
if len(labels) > 1 {
sort.Slice(labels, func(i, j int) bool {
return labels[i].Name < labels[j].Name
})
}
return hash.Sum64()
b := strings.Builder{}
for i, l := range labels {
b.WriteString(l.Name)
b.WriteString("=")
b.WriteString(l.Value)
if i != len(labels)-1 {
b.WriteString(",")
}
}
return b.String()
}
func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time) prompbmarshal.TimeSeries {
func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSeries {
labels := make(map[string]string)
for _, l := range m.Labels {
labels[l.Name] = l.Value
@@ -136,12 +166,10 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time)
for k, v := range rr.Labels {
labels[k] = v
}
return newTimeSeries(m.Value, labels, timestamp)
return newTimeSeries(m.Values, m.Timestamps, labels)
}
// UpdateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (rr *RecordingRule) UpdateWith(r Rule) error {
nr, ok := r.(*RecordingRule)
if !ok {
@@ -149,6 +177,7 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
}
rr.Expr = nr.Expr
rr.Labels = nr.Labels
rr.q = nr.q
return nil
}

View File

@@ -11,7 +11,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func TestRecoridngRule_ToTimeSeries(t *testing.T) {
func TestRecoridngRule_Exec(t *testing.T) {
timestamp := time.Now()
testCases := []struct {
rule *RecordingRule
@@ -24,9 +24,9 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
"__name__", "bar",
)},
[]prompbmarshal.TimeSeries{
newTimeSeries(10, map[string]string{
newTimeSeries([]float64{10}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "foo",
}, timestamp),
}),
},
},
{
@@ -37,18 +37,18 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "foobarbaz",
"job": "foo",
}, timestamp),
newTimeSeries(2, map[string]string{
}),
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "foobarbaz",
"job": "bar",
}, timestamp),
newTimeSeries(3, map[string]string{
}),
newTimeSeries([]float64{3}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "foobarbaz",
"job": "baz",
}, timestamp),
}),
},
},
{
@@ -59,16 +59,16 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
[]prompbmarshal.TimeSeries{
newTimeSeries(2, map[string]string{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
"job": "foo",
"source": "test",
}, timestamp),
newTimeSeries(1, map[string]string{
}),
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
"job": "bar",
"source": "test",
}, timestamp),
}),
},
},
}
@@ -76,7 +76,8 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
tss, err := tc.rule.Exec(context.TODO(), fq, true)
tc.rule.q = fq
tss, err := tc.rule.Exec(context.TODO())
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
@@ -87,7 +88,88 @@ func TestRecoridngRule_ToTimeSeries(t *testing.T) {
}
}
func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
func TestRecoridngRule_ExecRange(t *testing.T) {
timestamp := time.Now()
testCases := []struct {
rule *RecordingRule
metrics []datasource.Metric
expTS []prompbmarshal.TimeSeries
}{
{
&RecordingRule{Name: "foo"},
[]datasource.Metric{metricWithValuesAndLabels(t, []float64{10, 20, 30},
"__name__", "bar",
)},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{10, 20, 30},
[]int64{timestamp.UnixNano(), timestamp.UnixNano(), timestamp.UnixNano()},
map[string]string{
"__name__": "foo",
}),
},
},
{
&RecordingRule{Name: "foobarbaz"},
[]datasource.Metric{
metricWithValuesAndLabels(t, []float64{1}, "__name__", "foo", "job", "foo"),
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "foobarbaz",
"job": "foo",
}),
newTimeSeries([]float64{2, 3}, []int64{timestamp.UnixNano(), timestamp.UnixNano()}, map[string]string{
"__name__": "foobarbaz",
"job": "bar",
}),
newTimeSeries([]float64{4, 5, 6},
[]int64{timestamp.UnixNano(), timestamp.UnixNano(), timestamp.UnixNano()},
map[string]string{
"__name__": "foobarbaz",
"job": "baz",
}),
},
},
{
&RecordingRule{Name: "job:foo", Labels: map[string]string{
"source": "test",
}},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
"job": "foo",
"source": "test",
}),
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
"job": "bar",
"source": "test",
}),
},
},
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
tc.rule.q = fq
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
t.Fatalf("timeseries missmatch: %s", err)
}
})
}
}
func TestRecoridngRule_ExecNegative(t *testing.T) {
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
"job": "test",
}}
@@ -95,8 +177,8 @@ func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
fq := &fakeQuerier{}
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
_, err := rr.Exec(context.TODO(), fq, true)
rr.q = fq
_, err := rr.Exec(context.TODO())
if err == nil {
t.Fatalf("expected to get err; got nil")
}
@@ -111,7 +193,7 @@ func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
_, err = rr.Exec(context.TODO(), fq, true)
_, err = rr.Exec(context.TODO())
if err == nil {
t.Fatalf("expected to get err; got nil")
}

View File

@@ -10,9 +10,9 @@ import (
)
var (
addr = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
" E.g. http://127.0.0.1:8428")
addr = flag.String("remoteRead.url", "", "Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts "+
"state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state. "+
"E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
@@ -26,7 +26,7 @@ var (
// Init creates a Querier from provided flag values.
// Returns nil if addr flag wasn't set.
func Init() (datasource.Querier, error) {
func Init() (datasource.QuerierBuilder, error) {
if *addr == "" {
return nil, nil
}

View File

@@ -10,8 +10,8 @@ import (
)
var (
addr = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
addr = flag.String("remoteWrite.url", "", "Optional URL to VictoriaMetrics or vminsert where to persist alerts state "+
"and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")

160
app/vmalert/replay.go Normal file
View File

@@ -0,0 +1,160 @@
package main
import (
"context"
"flag"
"fmt"
"strings"
"time"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
var (
replayFrom = flag.String("replay.timeFrom", "",
"The time filter in RFC3339 format to select time series with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'")
replayTo = flag.String("replay.timeTo", "",
"The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'")
replayRulesDelay = flag.Duration("replay.rulesDelay", time.Second,
"Delay between rules evaluation within the group. Could be important if there are chained rules inside of the group"+
"and processing need to wait for previous rule results to be persisted by remote storage before evaluating the next rule."+
"Keep it equal or bigger than -remoteWrite.flushInterval.")
replayMaxDatapoints = flag.Int("replay.maxDatapointsPerQuery", 1e3,
"Max number of data points expected in one request. The higher the value, the less requests will be made during replay.")
replayRuleRetryAttempts = flag.Int("replay.ruleRetryAttempts", 5,
"Defines how many retries to make before giving up on rule if request for it returns an error.")
)
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error {
if *replayMaxDatapoints < 1 {
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
}
tFrom, err := time.Parse(time.RFC3339, *replayFrom)
if err != nil {
return fmt.Errorf("failed to parse %q: %s", *replayFrom, err)
}
tTo, err := time.Parse(time.RFC3339, *replayTo)
if err != nil {
return fmt.Errorf("failed to parse %q: %s", *replayTo, err)
}
if !tTo.After(tFrom) {
return fmt.Errorf("replay.timeTo must be bigger than replay.timeFrom")
}
labels := make(map[string]string)
for _, s := range *externalLabels {
if len(s) == 0 {
continue
}
n := strings.IndexByte(s, '=')
if n < 0 {
return fmt.Errorf("missing '=' in `-label`. It must contain label in the form `name=value`; got %q", s)
}
labels[s[:n]] = s[n+1:]
}
fmt.Printf("Replay mode:"+
"\nfrom: \t%v "+
"\nto: \t%v "+
"\nmax data points per request: %d\n",
tFrom, tTo, *replayMaxDatapoints)
var total int
for _, cfg := range groupsCfg {
ng := newGroup(cfg, qb, *evaluationInterval, labels)
total += ng.replay(tFrom, tTo, rw)
}
logger.Infof("replay finished! Imported %d samples", total)
if rw != nil {
return rw.Close()
}
return nil
}
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
var total int
step := g.Interval * time.Duration(*replayMaxDatapoints)
ri := rangeIterator{start: start, end: end, step: step}
iterations := int(end.Sub(start)/step) + 1
fmt.Printf("\nGroup %q"+
"\ninterval: \t%v"+
"\nrequests to make: \t%d"+
"\nmax range per request: \t%v\n",
g.Name, g.Interval, iterations, step)
for _, rule := range g.Rules {
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
bar := pb.StartNew(iterations)
ri.reset()
for ri.next() {
n, err := replayRule(rule, ri.s, ri.e, rw)
if err != nil {
logger.Fatalf("rule %q: %s", rule, err)
}
total += n
bar.Increment()
}
bar.Finish()
// sleep to let remote storage to flush data on-disk
// so chained rules could be calculated correctly
time.Sleep(*replayRulesDelay)
}
return total
}
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
var err error
var tss []prompbmarshal.TimeSeries
for i := 0; i < *replayRuleRetryAttempts; i++ {
tss, err = rule.ExecRange(context.Background(), start, end)
if err == nil {
break
}
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, rule, err)
time.Sleep(time.Second)
}
if err != nil { // means all attempts failed
return 0, err
}
if len(tss) < 1 {
return 0, nil
}
var n int
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
return n, fmt.Errorf("remote write failure: %s", err)
}
n += len(ts.Samples)
}
return n, nil
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}

250
app/vmalert/replay_test.go Normal file
View File

@@ -0,0 +1,250 @@
package main
import (
"context"
"fmt"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
type fakeReplayQuerier struct {
fakeQuerier
registry map[string]map[string]struct{}
}
func (fr *fakeReplayQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fr
}
func (fr *fakeReplayQuerier) QueryRange(_ context.Context, q string, from, to time.Time) ([]datasource.Metric, error) {
key := fmt.Sprintf("%s+%s", from.Format("15:04:05"), to.Format("15:04:05"))
dps, ok := fr.registry[q]
if !ok {
return nil, fmt.Errorf("unexpected query received: %q", q)
}
_, ok = dps[key]
if !ok {
return nil, fmt.Errorf("unexpected time range received: %q", key)
}
delete(dps, key)
if len(fr.registry[q]) < 1 {
delete(fr.registry, q)
}
return nil, nil
}
func TestReplay(t *testing.T) {
testCases := []struct {
name string
from, to string
maxDP int
cfg []config.Group
qb *fakeReplayQuerier
}{
{
name: "one rule + one response",
from: "2021-01-01T12:00:00.000Z",
to: "2021-01-01T12:02:00.000Z",
maxDP: 10,
cfg: []config.Group{
{Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
},
qb: &fakeReplayQuerier{
registry: map[string]map[string]struct{}{
"sum(up)": {"12:00:00+12:02:00": {}},
},
},
},
{
name: "one rule + multiple responses",
from: "2021-01-01T12:00:00.000Z",
to: "2021-01-01T12:02:30.000Z",
maxDP: 1,
cfg: []config.Group{
{Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
},
qb: &fakeReplayQuerier{
registry: map[string]map[string]struct{}{
"sum(up)": {
"12:00:00+12:01:00": {},
"12:01:00+12:02:00": {},
"12:02:00+12:02:30": {},
},
},
},
},
{
name: "datapoints per step",
from: "2021-01-01T12:00:00.000Z",
to: "2021-01-01T15:02:30.000Z",
maxDP: 60,
cfg: []config.Group{
{Interval: utils.NewPromDuration(time.Minute), Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
},
qb: &fakeReplayQuerier{
registry: map[string]map[string]struct{}{
"sum(up)": {
"12:00:00+13:00:00": {},
"13:00:00+14:00:00": {},
"14:00:00+15:00:00": {},
"15:00:00+15:02:30": {},
},
},
},
},
{
name: "multiple recording rules + multiple responses",
from: "2021-01-01T12:00:00.000Z",
to: "2021-01-01T12:02:30.000Z",
maxDP: 1,
cfg: []config.Group{
{Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}},
{Rules: []config.Rule{{Record: "bar", Expr: "max(up)"}}},
},
qb: &fakeReplayQuerier{
registry: map[string]map[string]struct{}{
"sum(up)": {
"12:00:00+12:01:00": {},
"12:01:00+12:02:00": {},
"12:02:00+12:02:30": {},
},
"max(up)": {
"12:00:00+12:01:00": {},
"12:01:00+12:02:00": {},
"12:02:00+12:02:30": {},
},
},
},
},
{
name: "multiple alerting rules + multiple responses",
from: "2021-01-01T12:00:00.000Z",
to: "2021-01-01T12:02:30.000Z",
maxDP: 1,
cfg: []config.Group{
{Rules: []config.Rule{{Alert: "foo", Expr: "sum(up) > 1"}}},
{Rules: []config.Rule{{Alert: "bar", Expr: "max(up) < 1"}}},
},
qb: &fakeReplayQuerier{
registry: map[string]map[string]struct{}{
"sum(up) > 1": {
"12:00:00+12:01:00": {},
"12:01:00+12:02:00": {},
"12:02:00+12:02:30": {},
},
"max(up) < 1": {
"12:00:00+12:01:00": {},
"12:01:00+12:02:00": {},
"12:02:00+12:02:30": {},
},
},
},
},
}
from, to, maxDP := *replayFrom, *replayTo, *replayMaxDatapoints
retries, delay := *replayRuleRetryAttempts, *replayRulesDelay
defer func() {
*replayFrom, *replayTo = from, to
*replayMaxDatapoints, *replayRuleRetryAttempts = maxDP, retries
*replayRulesDelay = delay
}()
*replayRuleRetryAttempts = 1
*replayRulesDelay = time.Millisecond
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
*replayFrom = tc.from
*replayTo = tc.to
*replayMaxDatapoints = tc.maxDP
if err := replay(tc.cfg, tc.qb, nil); err != nil {
t.Fatalf("replay failed: %s", err)
}
if len(tc.qb.registry) > 0 {
t.Fatalf("not all requests were sent: %#v", tc.qb.registry)
}
})
}
}
func TestRangeIterator(t *testing.T) {
testCases := []struct {
ri rangeIterator
result [][2]time.Time
}{
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 5 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 45 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
step: time.Second,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
var j int
for tc.ri.next() {
if len(tc.result) < j+1 {
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
j, tc.ri.s, tc.ri.e)
}
s, e := tc.ri.s, tc.ri.e
expS, expE := tc.result[j][0], tc.result[j][1]
if s != expS {
t.Fatalf("expected to get start=%v; got %v", expS, s)
}
if e != expE {
t.Fatalf("expected to get end=%v; got %v", expE, e)
}
j++
}
})
}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View File

@@ -3,22 +3,21 @@ package main
import (
"context"
"errors"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"time"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// Returns unique ID that may be used for
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// Exec executes the rule with given context
// and Querier. If returnSeries is true, Exec
// may return TimeSeries as result of execution
Exec(ctx context.Context, q datasource.Querier, returnSeries bool) ([]prompbmarshal.TimeSeries, error)
Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error)
// ExecRange executes the rule on the given time range
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// UpdateWith performs modification of current Rule
// with fields of the given Rule.
UpdateWith(Rule) error

View File

@@ -7,17 +7,21 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
ts := prompbmarshal.TimeSeries{}
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
Value: value,
Timestamp: timestamp.UnixNano() / 1e6,
})
func newTimeSeries(values []float64, timestamps []int64, labels map[string]string) prompbmarshal.TimeSeries {
ts := prompbmarshal.TimeSeries{
Samples: make([]prompbmarshal.Sample, len(values)),
}
for i := range values {
ts.Samples[i] = prompbmarshal.Sample{
Value: values[i],
Timestamp: time.Unix(timestamps[i], 0).UnixNano() / 1e6,
}
}
keys := make([]string, 0, len(labels))
for k := range labels {
keys = append(keys, k)
}
sort.Strings(keys)
sort.Strings(keys) // make order deterministic
for _, key := range keys {
ts.Labels = append(ts.Labels, prompbmarshal.Label{
Name: key,

View File

@@ -0,0 +1,43 @@
package utils
import (
"time"
"github.com/VictoriaMetrics/metricsql"
)
// PromDuration is Prometheus duration.
type PromDuration struct {
milliseconds int64
}
// NewPromDuration returns PromDuration for given d.
func NewPromDuration(d time.Duration) PromDuration {
return PromDuration{
milliseconds: d.Milliseconds(),
}
}
// MarshalYAML implements yaml.Marshaler interface.
func (pd PromDuration) MarshalYAML() (interface{}, error) {
return pd.Duration().String(), nil
}
// UnmarshalYAML implements yaml.Unmarshaler interface.
func (pd *PromDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
var s string
if err := unmarshal(&s); err != nil {
return err
}
ms, err := metricsql.DurationValue(s, 0)
if err != nil {
return err
}
pd.milliseconds = ms
return nil
}
// Duration returns duration for pd.
func (pd *PromDuration) Duration() time.Duration {
return time.Duration(pd.milliseconds) * time.Millisecond
}

View File

@@ -13,6 +13,7 @@ func TestTLSConfig(t *testing.T) {
}
if tlsCfg == nil {
t.Errorf("expected tlsConfig to be set, got nil")
return
}
if tlsCfg.ServerName != serverName {
t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)

View File

@@ -17,27 +17,24 @@ type requestHandler struct {
m *manager
}
var pathList = [][]string{
{"/api/v1/groups", "list all loaded groups and rules"},
{"/api/v1/alerts", "list all active alerts"},
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
// /metrics is served by httpserver by default
{"/metrics", "list of application metrics"},
{"/-/reload", "reload configuration"},
}
func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
switch r.URL.Path {
case "/":
for _, path := range pathList {
p, doc := path[0], path[1]
fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", p, p, doc)
if r.Method != "GET" {
return false
}
httpserver.WriteAPIHelp(w, [][2]string{
{"/api/v1/groups", "list all loaded groups and rules"},
{"/api/v1/alerts", "list all active alerts"},
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
{"/metrics", "list of application metrics"},
{"/-/reload", "reload configuration"},
})
return true
case "/api/v1/groups":
data, err := rh.listGroups()
if err != nil {
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.Header().Set("Content-Type", "application/json; charset=utf-8")
@@ -46,7 +43,7 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
case "/api/v1/alerts":
data, err := rh.listAlerts()
if err != nil {
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.Header().Set("Content-Type", "application/json; charset=utf-8")
@@ -64,7 +61,7 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
// /api/v1/<groupName>/<alertID>/status
data, err := rh.alert(r.URL.Path)
if err != nil {
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.Header().Set("Content-Type", "application/json; charset=utf-8")

View File

@@ -20,14 +20,15 @@ type APIAlert struct {
// APIGroup represents Group for WEB view
type APIGroup struct {
Name string `json:"name"`
Type string `json:"type"`
ID string `json:"id"`
File string `json:"file"`
Interval string `json:"interval"`
Concurrency int `json:"concurrency"`
AlertingRules []APIAlertingRule `json:"alerting_rules"`
RecordingRules []APIRecordingRule `json:"recording_rules"`
Name string `json:"name"`
Type string `json:"type"`
ID string `json:"id"`
File string `json:"file"`
Interval string `json:"interval"`
Concurrency int `json:"concurrency"`
ExtraFilterLabels map[string]string `json:"extra_filter_labels"`
AlertingRules []APIAlertingRule `json:"alerting_rules"`
RecordingRules []APIRecordingRule `json:"recording_rules"`
}
// APIAlertingRule represents AlertingRule for WEB view

View File

@@ -77,3 +77,9 @@ vmauth-local-with-goarch:
vmauth-pure:
APP_NAME=vmauth $(MAKE) app-local-pure
vmauth-windows-amd64:
GOARCH=amd64 APP_NAME=vmauth $(MAKE) app-local-windows-with-goarch
vmauth-windows-amd64-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-windows-amd64

View File

@@ -1,8 +1,8 @@
## vmauth
# vmauth
`vmauth` is a simple auth proxy and router for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
It reads username and password from [Basic Auth headers](https://en.wikipedia.org/wiki/Basic_access_authentication),
matches them against configs pointed by `-auth.config` command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
`vmauth` is a simple auth proxy, router and [load balancer](#load-balancing) for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
It reads auth credentials from `Authorization` http header ([Basic Auth](https://en.wikipedia.org/wiki/Basic_access_authentication) and `Bearer token` is supported),
matches them against configs pointed by [-auth.config](#auth-config) command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
## Quick start
@@ -17,18 +17,24 @@ and pass the following flag to `vmauth` binary in order to start authorizing and
After that `vmauth` starts accepting HTTP requests on port `8427` and routing them according to the provided [-auth.config](#auth-config).
The port can be modified via `-httpListenAddr` command-line flag.
The auth config can be reloaded by passing `SIGHUP` signal to `vmauth`.
The auth config can be reloaded either by passing `SIGHUP` signal to `vmauth` or by querying `/-/reload` http endpoint.
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML, accounting, limits, etc.
Feel free [contacting us](mailto:info@victoriametrics.com) if you need customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, SAML,
accounting and rate limiting such as [vmgateway](https://docs.victoriametrics.com/vmgateway.html).
## Load balancing
Each `url_prefix` in the [-auth.config](#auth-config) may contain either a single url or a list of urls. In the latter case `vmauth` balances load among the configured urls in a round-robin manner. This feature is useful for balancing the load among multiple `vmselect` and/or `vminsert` nodes in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html).
## Auth config
Auth config is represented in the following simple `yml` format:
`-auth.config` is represented in the following simple `yml` format:
```yml
@@ -36,43 +42,71 @@ Auth config is represented in the following simple `yml` format:
# Usernames must be unique.
users:
# Requests with the 'Authorization: Bearer XXXX' header are proxied to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
- bearer_token: "XXXX"
url_prefix: "http://localhost:8428"
# The user for querying local single-node VictoriaMetrics.
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
# will be proxied to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
- username: "local-single-node"
password: "***"
url_prefix: "http://localhost:8428"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# The user for querying local single-node VictoriaMetrics with extra_label team=dev.
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vmselect:8481/select/123/prometheus .
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
# will be routed to http://localhost:8428 with extra_label=team=dev query arg.
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query?extra_label=team=dev
- username: "local-single-node"
password: "***"
url_prefix: "http://localhost:8428?extra_label=team=dev"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be load-balanced among http://vmselect1:8481/select/123/prometheus and http://vmselect2:8481/select/123/prometheus
# For example, http://vmauth:8427/api/v1/query is proxied to the following urls in a round-robin manner:
# - http://vmselect1:8481/select/123/prometheus/api/v1/select
# - http://vmselect2:8481/select/123/prometheus/api/v1/select
- username: "cluster-select-account-123"
password: "***"
url_prefix: "http://vmselect:8481/select/123/prometheus"
url_prefix:
- "http://vmselect1:8481/select/123/prometheus"
- "http://vmselect2:8481/select/123/prometheus"
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vminsert:8480/insert/42/prometheus .
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
# will be load-balanced between http://vminsert1:8480/insert/42/prometheus and http://vminsert2:8480/insert/42/prometheus
# For example, http://vmauth:8427/api/v1/write is proxied to the following urls in a round-robin manner:
# - http://vminsert1:8480/insert/42/prometheus/api/v1/write
# - http://vminsert2:8480/insert/42/prometheus/api/v1/write
- username: "cluster-insert-account-42"
password: "***"
url_prefix: "http://vminsert:8480/insert/42/prometheus"
url_prefix:
- "http://vminsert1:8480/insert/42/prometheus"
- "http://vminsert2:8480/insert/42/prometheus"
# A single user for querying and inserting data:
# - Requests to http://vmauth:8427/api/v1/query or http://vmauth:8427/api/v1/query_range
# are routed to http://vmselect:8481/select/42/prometheus.
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8480/select/42/prometheus/api/v1/query
# - Requests to http://vmauth:8427/api/v1/write are routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
# - Requests to http://vmauth:8427/api/v1/query, http://vmauth:8427/api/v1/query_range
# and http://vmauth:8427/api/v1/label/<label_name>/values are proxied to the following urls in a round-robin manner:
# - http://vmselect1:8481/select/42/prometheus
# - http://vmselect2:8481/select/42/prometheus
# For example, http://vmauth:8427/api/v1/query is proxied to http://vmselect1:8480/select/42/prometheus/api/v1/query
# or to http://vmselect2:8480/select/42/prometheus/api/v1/query .
# - Requests to http://vmauth:8427/api/v1/write are proxied to http://vminsert:8480/insert/42/prometheus/api/v1/write
- username: "foobar"
url_map:
- src_paths: ["/api/v1/query", "/api/v1/query_range"]
url_prefix: "http://vmselect:8481/select/42/prometheus"
- src_paths:
- "/api/v1/query"
- "/api/v1/query_range"
- "/api/v1/label/[^/]+/values"
url_prefix:
- "http://vmselect1:8481/select/42/prometheus"
- "http://vmselect2:8481/select/42/prometheus"
- src_paths: ["/api/v1/write"]
url_prefix: "http://vminsert:8480/insert/42/prometheus"
```
@@ -96,11 +130,13 @@ Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable
Alternatively, [https termination proxy](https://en.wikipedia.org/wiki/TLS_termination_proxy) may be put in front of `vmauth`.
It is recommended protecting `/-/reload` endpoint with `-reloadAuthKey` command-line flag, so external users couldn't trigger config reload.
## Monitoring
`vmauth` exports various metrics in Prometheus exposition format at `http://vmauth-host:8427/metrics` page. It is recommended setting up regular scraping of this page
either via [vmagent](https://victoriametrics.github.io/vmagent.html) or via Prometheus, so the exported metrics could be analyzed later.
either via [vmagent](https://docs.victoriametrics.com/vmagent.html) or via Prometheus, so the exported metrics could be analyzed later.
## How to build from sources
@@ -110,14 +146,14 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmauth` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make vmauth` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmauth` binary and puts it into the `bin` folder.
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmauth-prod` from the root folder of the repository.
2. Run `make vmauth-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmauth-prod` binary and puts it into the `bin` folder.
### Building docker images
@@ -164,12 +200,12 @@ Pass `-help` command-line arg to `vmauth` in order to see all the configuration
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
See the docs at https://victoriametrics.github.io/vmauth.html .
See the docs at https://docs.victoriametrics.com/vmauth.html .
-auth.config string
Path to auth config. See https://victoriametrics.github.io/vmauth.html for details on the format of this auth config
Path to auth config. See https://docs.victoriametrics.com/vmauth.html for details on the format of this auth config
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
@@ -177,17 +213,17 @@ See the docs at https://victoriametrics.github.io/vmauth.html .
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this dealy the servier returns non-OK responses from /health page, so load balancers can route new requests to other servers
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
@@ -197,7 +233,7 @@ See the docs at https://victoriametrics.github.io/vmauth.html .
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
@@ -207,20 +243,24 @@ See the docs at https://victoriametrics.github.io/vmauth.html .
-loggerTimezone string
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-maxIdleConnsPerBackend int
The maximum number of idle connections vmauth can open per each backend host (default 100)
-memory.allowedBytes size
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-pprofAuthKey string
Auth key for /debug/pprof. It overrides httpAuth settings
-reloadAuthKey string
Auth key for /-/reload http endpoint. It must be passed as authKey=...
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
-version

View File

@@ -1,10 +1,14 @@
package main
import (
"encoding/base64"
"flag"
"fmt"
"io/ioutil"
"net/url"
"os"
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
@@ -17,7 +21,7 @@ import (
)
var (
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://victoriametrics.github.io/vmauth.html "+
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://docs.victoriametrics.com/vmauth.html "+
"for details on the format of this auth config")
)
@@ -28,24 +32,140 @@ type AuthConfig struct {
// UserInfo is user information read from authConfigPath
type UserInfo struct {
Username string `yaml:"username"`
Password string `yaml:"password"`
URLPrefix string `yaml:"url_prefix"`
URLMap []URLMap `yaml:"url_map"`
BearerToken string `yaml:"bearer_token"`
Username string `yaml:"username"`
Password string `yaml:"password"`
URLPrefix *URLPrefix `yaml:"url_prefix"`
URLMap []URLMap `yaml:"url_map"`
requests *metrics.Counter
}
// URLMap is a mapping from source paths to target urls.
type URLMap struct {
SrcPaths []string `yaml:"src_paths"`
URLPrefix string `yaml:"url_prefix"`
SrcPaths []*SrcPath `yaml:"src_paths"`
URLPrefix *URLPrefix `yaml:"url_prefix"`
}
// SrcPath represents an src path
type SrcPath struct {
sOriginal string
re *regexp.Regexp
}
// URLPrefix represents pased `url_prefix`
type URLPrefix struct {
n uint32
urls []*url.URL
}
func (up *URLPrefix) getNextURL() *url.URL {
n := atomic.AddUint32(&up.n, 1)
idx := n % uint32(len(up.urls))
return up.urls[idx]
}
// UnmarshalYAML unmarshals up from yaml.
func (up *URLPrefix) UnmarshalYAML(f func(interface{}) error) error {
var v interface{}
if err := f(&v); err != nil {
return err
}
var urls []string
switch x := v.(type) {
case string:
urls = []string{x}
case []interface{}:
if len(x) == 0 {
return fmt.Errorf("`url_prefix` must contain at least a single url")
}
us := make([]string, len(x))
for i, xx := range x {
s, ok := xx.(string)
if !ok {
return fmt.Errorf("`url_prefix` must contain array of strings; got %T", xx)
}
us[i] = s
}
urls = us
default:
return fmt.Errorf("unexpected type for `url_prefix`: %T; want string or []string", v)
}
pus := make([]*url.URL, len(urls))
for i, u := range urls {
pu, err := url.Parse(u)
if err != nil {
return fmt.Errorf("cannot unmarshal %q into url: %w", u, err)
}
pus[i] = pu
}
up.urls = pus
return nil
}
// MarshalYAML marshals up to yaml.
func (up *URLPrefix) MarshalYAML() (interface{}, error) {
var b []byte
if len(up.urls) == 1 {
u := up.urls[0].String()
b = strconv.AppendQuote(b, u)
return string(b), nil
}
b = append(b, '[')
for i, pu := range up.urls {
u := pu.String()
b = strconv.AppendQuote(b, u)
if i+1 < len(up.urls) {
b = append(b, ',')
}
}
b = append(b, ']')
return string(b), nil
}
func (sp *SrcPath) match(s string) bool {
prefix, ok := sp.re.LiteralPrefix()
if ok {
// Fast path - literal match
return s == prefix
}
if !strings.HasPrefix(s, prefix) {
return false
}
return sp.re.MatchString(s)
}
// UnmarshalYAML implements yaml.Unmarshaler
func (sp *SrcPath) UnmarshalYAML(f func(interface{}) error) error {
var s string
if err := f(&s); err != nil {
return err
}
sAnchored := "^(?:" + s + ")$"
re, err := regexp.Compile(sAnchored)
if err != nil {
return fmt.Errorf("cannot build regexp from %q: %w", s, err)
}
sp.sOriginal = s
sp.re = re
return nil
}
// MarshalYAML implements yaml.Marshaler.
func (sp *SrcPath) MarshalYAML() (interface{}, error) {
return sp.sOriginal, nil
}
func initAuthConfig() {
if len(*authConfigPath) == 0 {
logger.Fatalf("missing required `-auth.config` command-line flag")
}
// Register SIGHUP handler for config re-read just before readAuthConfig call.
// This guarantees that the config will be re-read if the signal arrives during readAuthConfig call.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
sighupCh := procutil.NewSighupChan()
m, err := readAuthConfig(*authConfigPath)
if err != nil {
logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", *authConfigPath, err)
@@ -55,7 +175,7 @@ func initAuthConfig() {
authConfigWG.Add(1)
go func() {
defer authConfigWG.Done()
authConfigReloader()
authConfigReloader(sighupCh)
}()
}
@@ -64,8 +184,7 @@ func stopAuthConfig() {
authConfigWG.Wait()
}
func authConfigReloader() {
sighupCh := procutil.NewSighupChan()
func authConfigReloader(sighupCh <-chan os.Signal) {
for {
select {
case <-stopCh:
@@ -110,58 +229,93 @@ func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
if len(uis) == 0 {
return nil, fmt.Errorf("`users` section cannot be empty in AuthConfig")
}
m := make(map[string]*UserInfo, len(uis))
byAuthToken := make(map[string]*UserInfo, len(uis))
byUsername := make(map[string]bool, len(uis))
byBearerToken := make(map[string]bool, len(uis))
for i := range uis {
ui := &uis[i]
if m[ui.Username] != nil {
if ui.BearerToken == "" && ui.Username == "" {
return nil, fmt.Errorf("either bearer_token or username must be set")
}
if ui.BearerToken != "" && ui.Username != "" {
return nil, fmt.Errorf("bearer_token=%q and username=%q cannot be set simultaneously", ui.BearerToken, ui.Username)
}
if byBearerToken[ui.BearerToken] {
return nil, fmt.Errorf("duplicate bearer_token found; bearer_token: %q", ui.BearerToken)
}
if byUsername[ui.Username] {
return nil, fmt.Errorf("duplicate username found; username: %q", ui.Username)
}
if len(ui.URLPrefix) > 0 {
urlPrefix, err := sanitizeURLPrefix(ui.URLPrefix)
if err != nil {
authToken := getAuthToken(ui.BearerToken, ui.Username, ui.Password)
if byAuthToken[authToken] != nil {
return nil, fmt.Errorf("duplicate auth token found for bearer_token=%q, username=%q: %q", authToken, ui.BearerToken, ui.Username)
}
if ui.URLPrefix != nil {
if err := ui.URLPrefix.sanitize(); err != nil {
return nil, err
}
ui.URLPrefix = urlPrefix
}
for _, e := range ui.URLMap {
if len(e.SrcPaths) == 0 {
return nil, fmt.Errorf("missing `src_paths`")
return nil, fmt.Errorf("missing `src_paths` in `url_map`")
}
for _, path := range e.SrcPaths {
if !strings.HasPrefix(path, "/") {
return nil, fmt.Errorf("`src_path`=%q must start with `/`", path)
}
if e.URLPrefix == nil {
return nil, fmt.Errorf("missing `url_prefix` in `url_map`")
}
urlPrefix, err := sanitizeURLPrefix(e.URLPrefix)
if err != nil {
if err := e.URLPrefix.sanitize(); err != nil {
return nil, err
}
e.URLPrefix = urlPrefix
}
if len(ui.URLMap) == 0 && len(ui.URLPrefix) == 0 {
if len(ui.URLMap) == 0 && ui.URLPrefix == nil {
return nil, fmt.Errorf("missing `url_prefix`")
}
ui.requests = metrics.GetOrCreateCounter(fmt.Sprintf(`vmauth_user_requests_total{username=%q}`, ui.Username))
m[ui.Username] = ui
if ui.BearerToken != "" {
if ui.Password != "" {
return nil, fmt.Errorf("password shouldn't be set for bearer_token %q", ui.BearerToken)
}
ui.requests = metrics.GetOrCreateCounter(`vmauth_user_requests_total{username="bearer_token"}`)
byBearerToken[ui.BearerToken] = true
}
if ui.Username != "" {
ui.requests = metrics.GetOrCreateCounter(fmt.Sprintf(`vmauth_user_requests_total{username=%q}`, ui.Username))
byUsername[ui.Username] = true
}
byAuthToken[authToken] = ui
}
return m, nil
return byAuthToken, nil
}
func sanitizeURLPrefix(urlPrefix string) (string, error) {
func getAuthToken(bearerToken, username, password string) string {
if bearerToken != "" {
return "Bearer " + bearerToken
}
token := username + ":" + password
token64 := base64.StdEncoding.EncodeToString([]byte(token))
return "Basic " + token64
}
func (up *URLPrefix) sanitize() error {
for i, pu := range up.urls {
puNew, err := sanitizeURLPrefix(pu)
if err != nil {
return err
}
up.urls[i] = puNew
}
return nil
}
func sanitizeURLPrefix(urlPrefix *url.URL) (*url.URL, error) {
// Remove trailing '/' from urlPrefix
for strings.HasSuffix(urlPrefix, "/") {
urlPrefix = urlPrefix[:len(urlPrefix)-1]
for strings.HasSuffix(urlPrefix.Path, "/") {
urlPrefix.Path = urlPrefix.Path[:len(urlPrefix.Path)-1]
}
// Validate urlPrefix
target, err := url.Parse(urlPrefix)
if err != nil {
return "", fmt.Errorf("invalid `url_prefix: %q`: %w", urlPrefix, err)
if urlPrefix.Scheme != "http" && urlPrefix.Scheme != "https" {
return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, urlPrefix.Scheme)
}
if target.Scheme != "http" && target.Scheme != "https" {
return "", fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, target.Scheme)
}
if target.Host == "" {
return "", fmt.Errorf("missing hostname in `url_prefix %q`", urlPrefix)
if urlPrefix.Host == "" {
return nil, fmt.Errorf("missing hostname in `url_prefix %q`", urlPrefix.Host)
}
return urlPrefix, nil
}

View File

@@ -1,8 +1,13 @@
package main
import (
"reflect"
"bytes"
"fmt"
"net/url"
"regexp"
"testing"
"gopkg.in/yaml.v2"
)
func TestParseAuthConfigFailure(t *testing.T) {
@@ -51,6 +56,41 @@ users:
- username: foo
url_prefix: http:///bar
`)
f(`
users:
- username: foo
url_prefix:
bar: baz
`)
f(`
users:
- username: foo
url_prefix:
- [foo]
`)
// empty url_prefix
f(`
users:
- username: foo
url_prefix: []
`)
// Username and bearer_token in a single config
f(`
users:
- username: foo
bearer_token: bbb
url_prefix: http://foo.bar
`)
// Bearer_token and password in a single config
f(`
users:
- password: foo
bearer_token: bbb
url_prefix: http://foo.bar
`)
// Duplicate users
f(`
@@ -63,6 +103,17 @@ users:
url_prefix: https://sss.sss
`)
// Duplicate bearer_tokens
f(`
users:
- bearer_token: foo
url_prefix: http://foo.bar
- username: bar
url_prefix: http://xxx.yyy
- bearer_token: foo
url_prefix: https://sss.sss
`)
// Missing url_prefix in url_map
f(`
users:
@@ -71,6 +122,24 @@ users:
- src_paths: ["/foo/bar"]
`)
// Invalid url_prefix in url_map
f(`
users:
- username: a
url_map:
- src_paths: ["/foo/bar"]
url_prefix: foo.bar
`)
// empty url_prefix in url_map
f(`
users:
- username: a
url_map:
- src_paths: ['/foo/bar']
url_prefix: []
`)
// Missing src_paths in url_map
f(`
users:
@@ -79,12 +148,12 @@ users:
- url_prefix: http://foobar
`)
// src_path not starting with `/`
// Invalid regexp in src_path.
f(`
users:
- username: a
url_map:
- src_paths: [foobar]
- src_paths: ['fo[obar']
url_prefix: http://foobar
`)
}
@@ -97,8 +166,8 @@ func TestParseAuthConfigSuccess(t *testing.T) {
t.Fatalf("unexpected error: %s", err)
}
removeMetrics(m)
if !reflect.DeepEqual(m, expectedAuthConfig) {
t.Fatalf("unexpected auth config\ngot\n%v\nwant\n%v", m, expectedAuthConfig)
if err := areEqualConfigs(m, expectedAuthConfig); err != nil {
t.Fatal(err)
}
}
@@ -109,10 +178,29 @@ users:
password: bar
url_prefix: http://aaa:343/bbb
`, map[string]*UserInfo{
"foo": {
getAuthToken("", "foo", "bar"): {
Username: "foo",
Password: "bar",
URLPrefix: "http://aaa:343/bbb",
URLPrefix: mustParseURL("http://aaa:343/bbb"),
},
})
// Multiple url_prefix entries
f(`
users:
- username: foo
password: bar
url_prefix:
- http://node1:343/bbb
- http://node2:343/bbb
`, map[string]*UserInfo{
getAuthToken("", "foo", "bar"): {
Username: "foo",
Password: "bar",
URLPrefix: mustParseURLs([]string{
"http://node1:343/bbb",
"http://node2:343/bbb",
}),
},
})
@@ -124,44 +212,91 @@ users:
- username: bar
url_prefix: https://bar/x///
`, map[string]*UserInfo{
"foo": {
getAuthToken("", "foo", ""): {
Username: "foo",
URLPrefix: "http://foo",
URLPrefix: mustParseURL("http://foo"),
},
"bar": {
getAuthToken("", "bar", ""): {
Username: "bar",
URLPrefix: "https://bar/x",
URLPrefix: mustParseURL("https://bar/x"),
},
})
// non-empty URLMap
f(`
users:
- username: foo
- bearer_token: foo
url_map:
- src_paths: ["/api/v1/query","/api/v1/query_range"]
- src_paths: ["/api/v1/query","/api/v1/query_range","/api/v1/label/[^./]+/.+"]
url_prefix: http://vmselect/select/0/prometheus
- src_paths: ["/api/v1/write"]
url_prefix: http://vminsert/insert/0/prometheus
url_prefix: ["http://vminsert1/insert/0/prometheus","http://vminsert2/insert/0/prometheus"]
`, map[string]*UserInfo{
"foo": {
Username: "foo",
getAuthToken("foo", "", ""): {
BearerToken: "foo",
URLMap: []URLMap{
{
SrcPaths: []string{"/api/v1/query", "/api/v1/query_range"},
URLPrefix: "http://vmselect/select/0/prometheus",
SrcPaths: getSrcPaths([]string{"/api/v1/query", "/api/v1/query_range", "/api/v1/label/[^./]+/.+"}),
URLPrefix: mustParseURL("http://vmselect/select/0/prometheus"),
},
{
SrcPaths: []string{"/api/v1/write"},
URLPrefix: "http://vminsert/insert/0/prometheus",
SrcPaths: getSrcPaths([]string{"/api/v1/write"}),
URLPrefix: mustParseURLs([]string{
"http://vminsert1/insert/0/prometheus",
"http://vminsert2/insert/0/prometheus",
}),
},
},
},
})
}
func getSrcPaths(paths []string) []*SrcPath {
var sps []*SrcPath
for _, path := range paths {
sps = append(sps, &SrcPath{
sOriginal: path,
re: regexp.MustCompile("^(?:" + path + ")$"),
})
}
return sps
}
func removeMetrics(m map[string]*UserInfo) {
for _, info := range m {
info.requests = nil
}
}
func areEqualConfigs(a, b map[string]*UserInfo) error {
aData, err := yaml.Marshal(a)
if err != nil {
return fmt.Errorf("cannot marshal a: %w", err)
}
bData, err := yaml.Marshal(b)
if err != nil {
return fmt.Errorf("cannot marshal b: %w", err)
}
if !bytes.Equal(aData, bData) {
return fmt.Errorf("unexpected configs;\ngot\n%s\nwant\n%s", aData, bData)
}
return nil
}
func mustParseURL(u string) *URLPrefix {
return mustParseURLs([]string{u})
}
func mustParseURLs(us []string) *URLPrefix {
pus := make([]*url.URL, len(us))
for i, u := range us {
pu, err := url.Parse(u)
if err != nil {
panic(fmt.Errorf("BUG: cannot parse %q: %w", u, err))
}
pus[i] = pu
}
return &URLPrefix{
urls: pus,
}
}

View File

@@ -2,30 +2,70 @@
# Usernames must be unique.
users:
# Requests with the 'Authorization: Bearer XXXX' header are proxied to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
- bearer_token: "XXXX"
url_prefix: "http://localhost:8428"
# The user for querying local single-node VictoriaMetrics.
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
# will be proxied to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is proxied to http://localhost:8428/api/v1/query
- username: "local-single-node"
password: "***"
url_prefix: "http://localhost:8428"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# The user for querying local single-node VictoriaMetrics with extra_label team=dev.
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vmselect:8481/select/123/prometheus .
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/select
# will be routed to http://localhost:8428 with extra_label=team=dev query arg.
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query?extra_label=team=dev
- username: "local-single-node"
password: "***"
url_prefix: "http://localhost:8428?extra_label=team=dev"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be load-balanced among http://vmselect1:8481/select/123/prometheus and http://vmselect2:8481/select/123/prometheus
# For example, http://vmauth:8427/api/v1/query is proxied to the following urls in a round-robin manner:
# - http://vmselect1:8481/select/123/prometheus/api/v1/select
# - http://vmselect2:8481/select/123/prometheus/api/v1/select
- username: "cluster-select-account-123"
password: "***"
url_prefix: "http://vmselect:8481/select/123/prometheus"
url_prefix:
- "http://vmselect1:8481/select/123/prometheus"
- "http://vmselect2:8481/select/123/prometheus"
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the reuqests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vminsert:8480/insert/42/prometheus .
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
# See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be load-balanced between http://vminsert1:8480/insert/42/prometheus and http://vminsert2:8480/insert/42/prometheus
# For example, http://vmauth:8427/api/v1/write is proxied to the following urls in a round-robin manner:
# - http://vminsert1:8480/insert/42/prometheus/api/v1/write
# - http://vminsert2:8480/insert/42/prometheus/api/v1/write
- username: "cluster-insert-account-42"
password: "***"
url_prefix: "http://vminsert:8480/insert/42/prometheus"
url_prefix:
- "http://vminsert1:8480/insert/42/prometheus"
- "http://vminsert2:8480/insert/42/prometheus"
# A single user for querying and inserting data:
# - Requests to http://vmauth:8427/api/v1/query, http://vmauth:8427/api/v1/query_range
# and http://vmauth:8427/api/v1/label/<label_name>/values are proxied to the following urls in a round-robin manner:
# - http://vmselect1:8481/select/42/prometheus
# - http://vmselect2:8481/select/42/prometheus
# For example, http://vmauth:8427/api/v1/query is proxied to http://vmselect1:8480/select/42/prometheus/api/v1/query
# or to http://vmselect2:8480/select/42/prometheus/api/v1/query .
# - Requests to http://vmauth:8427/api/v1/write are proxied to http://vminsert:8480/insert/42/prometheus/api/v1/write
- username: "foobar"
url_map:
- src_paths:
- "/api/v1/query"
- "/api/v1/query_range"
- "/api/v1/label/[^/]+/values"
url_prefix:
- "http://vmselect1:8481/select/42/prometheus"
- "http://vmselect2:8481/select/42/prometheus"
- src_paths: ["/api/v1/write"]
url_prefix: "http://vminsert:8480/insert/42/prometheus"

View File

@@ -14,10 +14,13 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/metrics"
)
var (
httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections")
httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections")
maxIdleConnsPerBackend = flag.Int("maxIdleConnsPerBackend", 100, "The maximum number of idle connections vmauth can open per each backend host")
reloadAuthKey = flag.String("reloadAuthKey", "", "Auth key for /-/reload http endpoint. It must be passed as authKey=...")
)
func main() {
@@ -47,16 +50,28 @@ func main() {
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
username, password, ok := r.BasicAuth()
if !ok {
switch r.URL.Path {
case "/-/reload":
authKey := r.FormValue("authKey")
if authKey != *reloadAuthKey {
httpserver.Errorf(w, r, "invalid authKey %q. It must match the value from -reloadAuthKey command line flag", authKey)
return true
}
configReloadRequests.Inc()
procutil.SelfSIGHUP()
w.WriteHeader(http.StatusOK)
return true
}
authToken := r.Header.Get("Authorization")
if authToken == "" {
w.Header().Set("WWW-Authenticate", `Basic realm="Restricted"`)
http.Error(w, "missing `Authorization: Basic *` header", http.StatusUnauthorized)
http.Error(w, "missing `Authorization` request header", http.StatusUnauthorized)
return true
}
ac := authConfig.Load().(map[string]*UserInfo)
ui := ac[username]
if ui == nil || ui.Password != password {
httpserver.Errorf(w, r, "cannot find the provided username %q or password in config", username)
ui := ac[authToken]
if ui == nil {
httpserver.Errorf(w, r, "cannot find the provided auth token %q in config", authToken)
return true
}
ui.requests.Inc()
@@ -65,15 +80,27 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
httpserver.Errorf(w, r, "cannot determine targetURL: %s", err)
return true
}
if _, err := url.Parse(targetURL); err != nil {
httpserver.Errorf(w, r, "invalid targetURL=%q: %s", targetURL, err)
return true
}
r.Header.Set("vm-target-url", targetURL)
reverseProxy.ServeHTTP(w, r)
r.Header.Set("vm-target-url", targetURL.String())
proxyRequest(w, r)
return true
}
func proxyRequest(w http.ResponseWriter, r *http.Request) {
defer func() {
err := recover()
if err == nil || err == http.ErrAbortHandler {
// Suppress http.ErrAbortHandler panic.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1353
return
}
// Forward other panics to the caller.
panic(err)
}()
reverseProxy.ServeHTTP(w, r)
}
var configReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
var reverseProxy = &httputil.ReverseProxy{
Director: func(r *http.Request) {
targetURL := r.Header.Get("vm-target-url")
@@ -89,6 +116,7 @@ var reverseProxy = &httputil.ReverseProxy{
tr.DisableCompression = true
// Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this).
tr.ForceAttemptHTTP2 = false
tr.MaxIdleConnsPerHost = *maxIdleConnsPerBackend
return tr
}(),
FlushInterval: time.Second,
@@ -99,7 +127,7 @@ func usage() {
const s = `
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
See the docs at https://victoriametrics.github.io/vmauth.html .
See the docs at https://docs.victoriametrics.com/vmauth.html .
`
flagutil.Usage(s)
}

View File

@@ -7,25 +7,50 @@ import (
"strings"
)
func createTargetURL(ui *UserInfo, uOrig *url.URL) (string, error) {
u, err := url.Parse(uOrig.String())
if err != nil {
return "", fmt.Errorf("cannot make a copy of %q: %w", u, err)
func (up *URLPrefix) mergeURLs(requestURI *url.URL) *url.URL {
pu := up.getNextURL()
return mergeURLs(pu, requestURI)
}
func mergeURLs(uiURL, requestURI *url.URL) *url.URL {
targetURL := *uiURL
targetURL.Path += requestURI.Path
requestParams := requestURI.Query()
// fast path
if len(requestParams) == 0 {
return &targetURL
}
// merge query parameters from requests.
uiParams := targetURL.Query()
for k, v := range requestParams {
// skip clashed query params from original request
if exist := uiParams.Get(k); len(exist) > 0 {
continue
}
for i := range v {
uiParams.Add(k, v[i])
}
}
targetURL.RawQuery = uiParams.Encode()
return &targetURL
}
func createTargetURL(ui *UserInfo, uOrig *url.URL) (*url.URL, error) {
u := *uOrig
// Prevent from attacks with using `..` in r.URL.Path
u.Path = path.Clean(u.Path)
if !strings.HasPrefix(u.Path, "/") {
u.Path = "/" + u.Path
}
for _, e := range ui.URLMap {
for _, path := range e.SrcPaths {
if u.Path == path {
return e.URLPrefix + u.RequestURI(), nil
for _, sp := range e.SrcPaths {
if sp.match(u.Path) {
return e.URLPrefix.mergeURLs(&u), nil
}
}
}
if len(ui.URLPrefix) > 0 {
return ui.URLPrefix + u.RequestURI(), nil
if ui.URLPrefix != nil {
return ui.URLPrefix.mergeURLs(&u), nil
}
return "", fmt.Errorf("missing route for %q", u)
return nil, fmt.Errorf("missing route for %q", u.String())
}

View File

@@ -16,47 +16,74 @@ func TestCreateTargetURLSuccess(t *testing.T) {
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
if target != expectedTarget {
if target.String() != expectedTarget {
t.Fatalf("unexpected target; got %q; want %q", target, expectedTarget)
}
}
// Simple routing with `url_prefix`
f(&UserInfo{
URLPrefix: "http://foo.bar",
URLPrefix: mustParseURL("http://foo.bar"),
}, "", "http://foo.bar/.")
f(&UserInfo{
URLPrefix: "http://foo.bar",
URLPrefix: mustParseURL("http://foo.bar"),
}, "/", "http://foo.bar/")
f(&UserInfo{
URLPrefix: "http://foo.bar",
URLPrefix: mustParseURL("http://foo.bar"),
}, "a/b?c=d", "http://foo.bar/a/b?c=d")
f(&UserInfo{
URLPrefix: "https://sss:3894/x/y",
URLPrefix: mustParseURL("https://sss:3894/x/y"),
}, "/z", "https://sss:3894/x/y/z")
f(&UserInfo{
URLPrefix: "https://sss:3894/x/y",
URLPrefix: mustParseURL("https://sss:3894/x/y"),
}, "/../../aaa", "https://sss:3894/x/y/aaa")
f(&UserInfo{
URLPrefix: "https://sss:3894/x/y",
}, "/./asd/../../aaa?a=d&s=s/../d", "https://sss:3894/x/y/aaa?a=d&s=s/../d")
URLPrefix: mustParseURL("https://sss:3894/x/y"),
}, "/./asd/../../aaa?a=d&s=s/../d", "https://sss:3894/x/y/aaa?a=d&s=s%2F..%2Fd")
// Complex routing with `url_map`
ui := &UserInfo{
URLMap: []URLMap{
{
SrcPaths: []string{"/api/v1/query"},
URLPrefix: "http://vmselect/0/prometheus",
SrcPaths: getSrcPaths([]string{"/api/v1/query"}),
URLPrefix: mustParseURL("http://vmselect/0/prometheus"),
},
{
SrcPaths: []string{"/api/v1/write"},
URLPrefix: "http://vminsert/0/prometheus",
SrcPaths: getSrcPaths([]string{"/api/v1/write"}),
URLPrefix: mustParseURL("http://vminsert/0/prometheus"),
},
},
URLPrefix: "http://default-server",
URLPrefix: mustParseURL("http://default-server"),
}
f(ui, "/api/v1/query?query=up", "http://vmselect/0/prometheus/api/v1/query?query=up")
f(ui, "/api/v1/write", "http://vminsert/0/prometheus/api/v1/write")
f(ui, "/api/v1/query_range", "http://default-server/api/v1/query_range")
// Complex routing regexp paths in `url_map`
ui = &UserInfo{
URLMap: []URLMap{
{
SrcPaths: getSrcPaths([]string{"/api/v1/query(_range)?", "/api/v1/label/[^/]+/values"}),
URLPrefix: mustParseURL("http://vmselect/0/prometheus"),
},
{
SrcPaths: getSrcPaths([]string{"/api/v1/write"}),
URLPrefix: mustParseURL("http://vminsert/0/prometheus"),
},
},
URLPrefix: mustParseURL("http://default-server"),
}
f(ui, "/api/v1/query?query=up", "http://vmselect/0/prometheus/api/v1/query?query=up")
f(ui, "/api/v1/query_range?query=up", "http://vmselect/0/prometheus/api/v1/query_range?query=up")
f(ui, "/api/v1/label/foo/values", "http://vmselect/0/prometheus/api/v1/label/foo/values")
f(ui, "/api/v1/write", "http://vminsert/0/prometheus/api/v1/write")
f(ui, "/api/v1/foo/bar", "http://default-server/api/v1/foo/bar")
f(&UserInfo{
URLPrefix: mustParseURL("http://foo.bar?extra_label=team=dev"),
}, "/api/v1/query", "http://foo.bar/api/v1/query?extra_label=team=dev")
f(&UserInfo{
URLPrefix: mustParseURL("http://foo.bar?extra_label=team=mobile"),
}, "/api/v1/query?extra_label=team=dev", "http://foo.bar/api/v1/query?extra_label=team%3Dmobile")
}
func TestCreateTargetURLFailure(t *testing.T) {
@@ -70,7 +97,7 @@ func TestCreateTargetURLFailure(t *testing.T) {
if err == nil {
t.Fatalf("expecting non-nil error")
}
if target != "" {
if target != nil {
t.Fatalf("unexpected target=%q; want empty string", target)
}
}
@@ -78,8 +105,8 @@ func TestCreateTargetURLFailure(t *testing.T) {
f(&UserInfo{
URLMap: []URLMap{
{
SrcPaths: []string{"/api/v1/query"},
URLPrefix: "http://foobar/baz",
SrcPaths: getSrcPaths([]string{"/api/v1/query"}),
URLPrefix: mustParseURL("http://foobar/baz"),
},
},
}, "/api/v1/write")

View File

@@ -1,6 +1,6 @@
## vmbackup
# vmbackup
`vmbackup` creates VictoriaMetrics data backups from [instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
`vmbackup` creates VictoriaMetrics data backups from [instant snapshots](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
Supported storage systems for backups:
@@ -15,7 +15,7 @@ data between the existing backup and new backup. It saves time and costs on data
Backup process can be interrupted at any time. It is automatically resumed from the interruption point when restarting `vmbackup` with the same args.
Backed up data can be restored with [vmrestore](https://victoriametrics.github.io/vmrestore.html).
Backed up data can be restored with [vmrestore](https://docs.victoriametrics.com/vmrestore.html).
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
@@ -34,8 +34,8 @@ vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-
```
* `</path/to/victoria-metrics-data>` - path to VictoriaMetrics data pointed by `-storageDataPath` command-line flag in single-node VictoriaMetrics or in cluster `vmstorage`.
There is no need to stop VictoriaMetrics for creating backups, since they are performed from immutable [instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
* `<local-snapshot>` is the snapshot to back up. See [how to create instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
There is no need to stop VictoriaMetrics for creating backups, since they are performed from immutable [instant snapshots](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
* `<local-snapshot>` is the snapshot to back up. See [how to create instant snapshots](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots). `vmbackup` can create the snapshot on itself if `-snapshot.createURL` command-line flag is set to an url for creating snapshots. In this case `-snapshotName` flag isn't needed.
* `<bucket>` is an already existing name for [GCS bucket](https://cloud.google.com/storage/docs/creating-buckets).
* `<path/to/new/backup>` is the destination path where new backup will be placed.
@@ -72,7 +72,7 @@ Smart backups mean storing full daily backups into `YYYYMMDD` folders and creati
vmbackup -snapshotName=<latest-snapshot> -dst=gcs://<bucket>/latest
```
Where `<latest-snapshot>` is the latest [snapshot](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
Where `<latest-snapshot>` is the latest [snapshot](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
The command will upload only changed data to `gcs://<bucket>/latest`.
* Run the following command once a day:
@@ -123,8 +123,8 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
* Backups created from [single-node VictoriaMetrics](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html) cannot be restored
at [cluster VictoriaMetrics](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html) and vice versa.
* Backups created from [single-node VictoriaMetrics](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html) cannot be restored
at [cluster VictoriaMetrics](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) and vice versa.
## Advanced usage
@@ -194,7 +194,7 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
@@ -204,23 +204,23 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
-loggerTimezone string
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
-maxBytesPerSecond value
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-maxBytesPerSecond size
The maximum upload speed. There is no limit if it is set to 0
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedBytes size
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
-origin string
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
-snapshot.createURL string
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snaphsot/create
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create . There is no need in setting -snapshotName if -snapshot.createURL is set
-snapshot.deleteURL string
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snaphosts will be automatically deleted. Example: http://victoriametrics:8428/snaphsot/delete
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete
-snapshotName string
Name for the snapshot to backup. See https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots
Name for the snapshot to backup. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots. There is no need in setting -snapshotName if -snapshot.createURL is set
-storageDataPath string
Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage (default "victoria-metrics-data")
-version
@@ -235,14 +235,14 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmbackup` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make vmbackup` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmbackup` binary and puts it into the `bin` folder.
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmbackup-prod` from the root folder of the repository.
2. Run `make vmbackup-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmbackup-prod` binary and puts it into the `bin` folder.
### Building docker images

View File

@@ -19,11 +19,11 @@ import (
var (
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots")
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots. There is no need in setting -snapshotName if -snapshot.createURL is set")
snapshotCreateURL = flag.String("snapshot.createURL", "", "VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. "+
"Example: http://victoriametrics:8428/snaphsot/create")
"Example: http://victoriametrics:8428/snapshot/create . There is no need in setting -snapshotName if -snapshot.createURL is set")
snapshotDeleteURL = flag.String("snapshot.deleteURL", "", "VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. "+
"All created snaphosts will be automatically deleted. Example: http://victoriametrics:8428/snaphsot/delete")
"All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete")
dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir\n"+
"-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded")
@@ -41,7 +41,9 @@ func main() {
logger.Init()
if len(*snapshotCreateURL) > 0 {
logger.Infof("Snapshots enabled")
if len(*snapshotName) > 0 {
logger.Fatalf("-snapshotName shouldn't be set if -snapshot.createURL is set, since snapshots are created automatically in this case")
}
logger.Infof("Snapshot create url %s", *snapshotCreateURL)
if len(*snapshotDeleteURL) <= 0 {
err := flag.Set("snapshot.deleteURL", strings.Replace(*snapshotCreateURL, "/create", "/delete", 1))
@@ -99,7 +101,7 @@ func usage() {
vmbackup performs backups for VictoriaMetrics data from instant snapshots to gcs, s3
or local filesystem. Backed up data can be restored with vmrestore.
See the docs at https://victoriametrics.github.io/vbackup.html .
See the docs at https://docs.victoriametrics.com/vmbackup.html .
`
flagutil.Usage(s)
}

View File

@@ -1,13 +1,15 @@
## Victoria Metrics Backup Manager
## vmbackupmanager
This service automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed.
***vmbackupmanager is a part of [enterprise package](https://victoriametrics.com/enterprise.html)***
The VictoriaMetrics backup manager automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed.
The backup service makes a backup every hour and puts it to the latest folder and then copies data to the folders which represent the backup intervals (hourly, daily, weekly and monthly)
The required flags for running the service are as follows:
* -eula - should be true and means that you have the legal right to run a backup manager. That can either be a signed contract or an email with confirmation to run the service in a trial period
* -storageDataPath - path to VictoriaMetrics or vmstorage data path to make backup from
* -snapshot.createURL - VictoriaMetrics creates snapshot URL which will automatically be created during backup. Example: http://victoriametrics:8428/snaphsot/create
* -snapshot.createURL - VictoriaMetrics creates snapshot URL which will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create
* -dst - backup destination at s3, gcs or local filesystem
* -credsFilePath - path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set. See [https://cloud.google.com/iam/docs/creating-managing-service-account-keys](https://cloud.google.com/iam/docs/creating-managing-service-account-keys) and [https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html](https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html)
@@ -47,7 +49,7 @@ There are two flags which could help with performance tuning:
* -concurrency - The number of concurrent workers. Higher concurrency may improve upload speed (default 10)
### Example of Usage
## Example of Usage
GCS and cluster version. You need to have a credentials file in json format with following structure

View File

@@ -71,3 +71,9 @@ vmctl-local-with-goarch:
vmctl-pure:
APP_NAME=vmctl $(MAKE) app-local-pure
vmctl-windows-amd64:
GOARCH=amd64 APP_NAME=vmctl $(MAKE) app-local-windows-with-goarch
vmctl-windows-amd64-prod:
APP_NAME=vmctl $(MAKE) app-via-docker-windows-amd64

View File

@@ -1,41 +1,15 @@
# vmctl
Victoria metrics command-line tool
VictoriaMetrics command-line tool
Features:
- [x] Prometheus: migrate data from Prometheus to VictoriaMetrics using snapshot API
- [x] Thanos: migrate data from Thanos to VictoriaMetrics
- [ ] ~~Prometheus: migrate data from Prometheus to VictoriaMetrics by query~~(discarded)
- [x] InfluxDB: migrate data from InfluxDB to VictoriaMetrics
- [x] OpenTSDB: migrate data from OpenTSDB to VictoriaMetrics
- [ ] Storage Management: data re-balancing between nodes
# Table of contents
* [Articles](#articles)
* [How to build](#how-to-build)
* [Migrating data from InfluxDB 1.x](#migrating-data-from-influxdb-1x)
* [Data mapping](#data-mapping)
* [Configuration](#configuration)
* [Filtering](#filtering)
* [Migrating data from InfluxDB 2.x](#migrating-data-from-influxdb-2x)
* [Migrating data from Prometheus](#migrating-data-from-prometheus)
* [Data mapping](#data-mapping-1)
* [Configuration](#configuration-1)
* [Filtering](#filtering-1)
* [Migrating data from Thanos](#migrating-data-from-thanos)
* [Current data](#current-data)
* [Historical data](#historical-data)
* [Migrating data from VictoriaMetrics](#migrating-data-from-victoriametrics)
* [Native protocol](#native-protocol)
* [Tuning](#tuning)
* [Influx mode](#influx-mode)
* [Prometheus mode](#prometheus-mode)
* [VictoriaMetrics importer](#victoriametrics-importer)
* [Importer stats](#importer-stats)
* [Significant figures](#significant-figures)
* [Adding extra labels](#adding-extra-labels)
## Articles
* [How to migrate data from Prometheus](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043)
@@ -48,14 +22,14 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmctl` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make vmctl` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmctl` binary and puts it into the `bin` folder.
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmctl-prod` from the root folder of the repository.
2. Run `make vmctl-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmctl-prod` binary and puts it into the `bin` folder.
### Building docker images
@@ -77,16 +51,121 @@ ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://b
#### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmctl-arm` or `make vmctl-arm64` from the root folder of the repository.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.16.
2. Run `make vmctl-arm` or `make vmctl-arm64` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmctl-arm` or `vmctl-arm64` binary respectively and puts it into the `bin` folder.
#### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmctl-arm-prod` or `make vmctl-arm64-prod` from the root folder of the repository.
2. Run `make vmctl-arm-prod` or `make vmctl-arm64-prod` from the root folder of [the repository](https://github.com/VictoriaMetrics/VictoriaMetrics).
It builds `vmctl-arm-prod` or `vmctl-arm64-prod` binary respectively and puts it into the `bin` folder.
## Migrating data from OpenTSDB
`vmctl` supports the `opentsdb` mode to migrate data from OpenTSDB to VictoriaMetrics time-series database.
See `./vmctl opentsdb --help` for details and full list of flags.
*OpenTSDB migration is not possible without a functioning [meta](http://opentsdb.net/docs/build/html/user_guide/metadata.html) table to search for metrics/series.*
OpenTSDB migration works like so:
1. Find metrics based on selected filters (or the default filter set ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'])
* e.g. `curl -Ss "http://opentsdb:4242/api/suggest?type=metrics&q=sys"`
2. Find series associated with each returned metric
* e.g. `curl -Ss "http://opentsdb:4242/api/search/lookup?m=system.load5&limit=1000000"`
3. Download data for each series in chunks defined in the CLI switches
* e.g. `-retention=sum-1m-avg:1h:90d` ==
* `curl -Ss "http://opentsdb:4242/api/query?start=1h-ago&end=now&m=sum:1m-avg-none:system.load5\{host=host1\}"`
* `curl -Ss "http://opentsdb:4242/api/query?start=2h-ago&end=1h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
* `curl -Ss "http://opentsdb:4242/api/query?start=3h-ago&end=2h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
* ...
* `curl -Ss "http://opentsdb:4242/api/query?start=2160h-ago&end=2159h-ago&m=sum:1m-avg-none:system.load5\{host=host1\}"`
This means that we must stream data from OpenTSDB to VictoriaMetrics in chunks. This is where concurrency for OpenTSDB comes in. We can query multiple chunks at once, but we shouldn't perform too many chunks at a time to avoid overloading the OpenTSDB cluster.
```
$ bin/vmctl opentsdb --otsdb-addr http://opentsdb:4242/ --otsdb-retentions sum-1m-avg:1h:1d --otsdb-filters system --otsdb-normalize --vm-addr http://victoria/
OpenTSDB import mode
2021/04/09 11:52:50 Will collect data starting at TS 1617990770
2021/04/09 11:52:50 Loading all metrics from OpenTSDB for filters: [system]
Found 9 metrics to import. Continue? [Y/n]
2021/04/09 11:52:51 Starting work on system.load1
23 / 402200 [>____________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________] 0.01% 2 p/s
```
### Retention strings
Starting with a relatively simple retention string (`sum-1m-avg:1h:30d`), let's describe how this is converted into actual queries.
There are two essential parts of a retention string:
1. [aggregation](#aggregation)
2. [windows/time ranges](#windows)
#### Aggregation
Retention strings essentially define the two levels of aggregation for our collected series.
`sum-1m-avg` would become:
* First order: `sum`
* Second order: `1m-avg-none`
##### First Order Aggregations
First-order aggregation addresses how to aggregate any un-mentioned tags.
This is, conceptually, directly opposite to how PromQL deals with tags. In OpenTSDB, if a tag isn't explicitly mentioned, all values assocaited with that tag will be aggregated.
It is recommended to use `sum` for the first aggregation because it is relatively quick and should not cause any changes to the incoming data (because we collect each individual series).
##### Second Order Aggregations
Second-order aggregation (`1m-avg` in our example) defines any windowing that should occur before returning the data
It is recommended to match the stat collection interval so we again avoid transforming incoming data.
We do not allow for defining the "null value" portion of the rollup window (e.g. in the aggreagtion, `1m-avg-none`, the user cannot change `none`), as the goal of this tool is to avoid modifying incoming data.
#### Windows
There are two important windows we define in a retention string:
1. the "chunk" range of each query
2. The time range we will be querying on with that "chunk"
From our example, our windows are `1h:30d`.
##### Window "chunks"
The window `1h` means that each individual query to OpenTSDB should only span 1 hour of time (e.g. `start=2h-ago&end=1h-ago`).
It is important to ensure this window somewhat matches the row size in HBase to help improve query times.
For example, if the query is hitting a rollup table with a 4 hour row size, we should set a chunk size of a multiple of 4 hours (e.g. `4h`, `8h`, etc.) to avoid requesting data across row boundaries. Landing on row boundaries allows for more consistent request times to HBase.
The default table created in HBase for OpenTSDB has a 1 hour row size, so if you aren't sure on a correct row size to use, `1h` is a reasonable choice.
##### Time range
The time range `30d` simply means we are asking for the last 30 days of data. This time range can be written using `h`, `d`, `w`, or `y`. (We can't use `m` for month because it already means `minute` in time parsing).
#### Results of retention string
The resultant queries that will be created, based on our example retention string of `sum-1m-avg:1h:30d` look like this:
```
http://opentsdb:4242/api/query?start=1h-ago&end=now&m=sum:1m-avg-none:<series>
http://opentsdb:4242/api/query?start=2h-ago&end=1h-ago&m=sum:1m-avg-none:<series>
http://opentsdb:4242/api/query?start=3h-ago&end=2h-ago&m=sum:1m-avg-none:<series>
...
http://opentsdb:4242/api/query?start=721h-ago&end=720h-ago&m=sum:1m-avg-none:<series>
```
Chunking the data like this means each individual query returns faster, so we can start populating data into VictoriaMetrics quicker.
### Restarting OpenTSDB migrations
One important note for OpenTSDB migration: Queries/HBase scans can "get stuck" within OpenTSDB itself. This can cause instability and performance issues within an OpenTSDB cluster, so stopping the migrator to deal with it may be necessary. Because of this, we provide the timstamp we started collecting data from at thebeginning of the run. You can stop and restart the importer using this "hard timestamp" to ensure you collect data from the same time range over multiple runs.
## Migrating data from InfluxDB (1.x)
@@ -96,7 +175,7 @@ See `./vmctl influx --help` for details and full list of flags.
To use migration tool please specify the InfluxDB address `--influx-addr`, the database `--influx-database` and VictoriaMetrics address `--vm-addr`.
Flag `--vm-addr` for single-node VM is usually equal to `--httpListenAddr`, and for cluster version
is equal to `--httpListenAddr` flag of VMInsert component. Please note, that vmctl performs initial readiness check for the given address
is equal to `--httpListenAddr` flag of vminsert component. Please note, that vmctl performs initial readiness check for the given address
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
See more details for cluster version [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
@@ -191,14 +270,14 @@ You may find useful a 3rd party solution for this - https://github.com/jonppe/in
## Migrating data from Prometheus
`vmctl` supports the `prometheus` mode for migrating data from Prometheus to VictoriaMetrics time-series database.
Migration is based on reading Prometheus snapshot, which is basically a hard-link to Prometheus data files.
Migration is based on reading Prometheus snapshot, which is basically a hard-link to Prometheus data files.
See `./vmctl prometheus --help` for details and full list of flags.
See `./vmctl prometheus --help` for details and full list of flags. Also see Prometheus related articles [here](#articles).
To use migration tool please specify the path to Prometheus snapshot `--prom-snapshot` and VictoriaMetrics address `--vm-addr`.
More about Prometheus snapshots may be found [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data).
To use migration tool please specify the file path to Prometheus snapshot `--prom-snapshot` (see how to make a snapshot [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data)) and VictoriaMetrics address `--vm-addr`.
Please note, that `vmctl` *do not make a snapshot from Prometheus*, it uses an already prepared snapshot. More about Prometheus snapshots may be found [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data) and [here](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043).
Flag `--vm-addr` for single-node VM is usually equal to `--httpListenAddr`, and for cluster version
is equal to `--httpListenAddr` flag of VMInsert component. Please note, that vmctl performs initial readiness check for the given address
is equal to `--httpListenAddr` flag of vminsert component. Please note, that vmctl performs initial readiness check for the given address
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
See more details for cluster version [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
@@ -359,7 +438,7 @@ then import it into VM using `vmctl` in `prometheus` mode.
### Native protocol
The [native binary protocol](https://victoriametrics.github.io/#how-to-export-data-in-native-format)
The [native binary protocol](https://docs.victoriametrics.com/#how-to-export-data-in-native-format)
was introduced in [1.42.0 release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.42.0)
and provides the most efficient way to migrate data between VM instances: single to single, cluster to cluster,
single to cluster and vice versa. Please note that both instances (source and destination) should be of v1.42.0

View File

@@ -39,7 +39,7 @@ var (
Name: vmAddr,
Value: "http://localhost:8428",
Usage: "VictoriaMetrics address to perform import requests. \n" +
"Should be the same as --httpListenAddr value for single-node version or VMInsert component. \n" +
"Should be the same as --httpListenAddr value for single-node version or vminsert component. \n" +
"Please note, that `vmctl` performs initial readiness check for the given address by checking `/health` endpoint.",
},
&cli.StringFlag{
@@ -96,6 +96,78 @@ var (
}
)
const (
otsdbAddr = "otsdb-addr"
otsdbConcurrency = "otsdb-concurrency"
otsdbQueryLimit = "otsdb-query-limit"
otsdbOffsetDays = "otsdb-offset-days"
otsdbHardTSStart = "otsdb-hard-ts-start"
otsdbRetentions = "otsdb-retentions"
otsdbFilters = "otsdb-filters"
otsdbNormalize = "otsdb-normalize"
otsdbMsecsTime = "otsdb-msecstime"
)
var (
otsdbFlags = []cli.Flag{
&cli.StringFlag{
Name: otsdbAddr,
Value: "http://localhost:4242",
Required: true,
Usage: "OpenTSDB server addr",
},
&cli.IntFlag{
Name: otsdbConcurrency,
Usage: "Number of concurrently running fetch queries to OpenTSDB per metric",
Value: 1,
},
&cli.StringSliceFlag{
Name: otsdbRetentions,
Value: nil,
Required: true,
Usage: "Retentions patterns to collect on. Each pattern should describe the aggregation performed " +
"for the query, the row size (in HBase) that will define how long each individual query is, " +
"and the time range to query for. e.g. sum-1m-avg:1h:3d. " +
"The first time range defined should be a multiple of the row size in HBase. " +
"e.g. if the row size is 2 hours, 4h is good, 5h less so. We want each query to land on unique rows.",
},
&cli.StringSliceFlag{
Name: otsdbFilters,
Value: cli.NewStringSlice("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"),
Usage: "Filters to process for discovering metrics in OpenTSDB",
},
&cli.Int64Flag{
Name: otsdbOffsetDays,
Usage: "Days to offset our 'starting' point for collecting data from OpenTSDB",
Value: 0,
},
&cli.Int64Flag{
Name: otsdbHardTSStart,
Usage: "A specific timestamp to start from, will override using an offset",
Value: 0,
},
/*
because the defaults are set *extremely* low in OpenTSDB (10-25 results), we will
set a larger default limit, but still allow a user to increase/decrease it
*/
&cli.IntFlag{
Name: otsdbQueryLimit,
Usage: "Result limit on meta queries to OpenTSDB (affects both metric name and tag value queries, recommended to use a value exceeding your largest series)",
Value: 100e3,
},
&cli.BoolFlag{
Name: otsdbMsecsTime,
Value: false,
Usage: "Whether OpenTSDB is writing values in milliseconds or seconds",
},
&cli.BoolFlag{
Name: otsdbNormalize,
Value: false,
Usage: "Whether to normalize all data received to lower case before forwarding to VictoriaMetrics",
},
}
)
const (
influxAddr = "influx-addr"
influxUser = "influx-user"
@@ -243,8 +315,8 @@ var (
&cli.StringFlag{
Name: vmNativeSrcAddr,
Usage: "VictoriaMetrics address to perform export from. \n" +
" Should be the same as --httpListenAddr value for single-node version or VMSelect component." +
" If exporting from cluster version - include the tenet token in address.",
" Should be the same as --httpListenAddr value for single-node version or vmselect component." +
" If exporting from cluster version see https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format",
Required: true,
},
&cli.StringFlag{
@@ -260,8 +332,8 @@ var (
&cli.StringFlag{
Name: vmNativeDstAddr,
Usage: "VictoriaMetrics address to perform import to. \n" +
" Should be the same as --httpListenAddr value for single-node version or VMInsert component." +
" If importing into cluster version - include the tenet token in address.",
" Should be the same as --httpListenAddr value for single-node version or vminsert component." +
" If importing into cluster version see https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format",
Required: true,
},
&cli.StringFlag{

View File

@@ -61,7 +61,7 @@ func (s Series) fetchQuery(timeFilter string) string {
}
for i, pair := range s.LabelPairs {
pairV := valueEscaper.Replace(pair.Value)
fmt.Fprintf(f, " %q='%s'", pair.Name, pairV)
fmt.Fprintf(f, " %q::tag='%s'", pair.Name, pairV)
if i != len(s.LabelPairs)-1 {
f.WriteString(" and")
}

View File

@@ -19,7 +19,7 @@ func TestFetchQuery(t *testing.T) {
},
},
},
expected: `select "value" from "cpu" where "foo"='bar'`,
expected: `select "value" from "cpu" where "foo"::tag='bar'`,
},
{
s: Series{
@@ -36,7 +36,7 @@ func TestFetchQuery(t *testing.T) {
},
},
},
expected: `select "value" from "cpu" where "foo"='bar' and "baz"='qux'`,
expected: `select "value" from "cpu" where "foo"::tag='bar' and "baz"::tag='qux'`,
},
{
s: Series{
@@ -50,7 +50,7 @@ func TestFetchQuery(t *testing.T) {
},
},
timeFilter: "time >= now()",
expected: `select "value" from "cpu" where "foo"='b\'ar' and time >= now()`,
expected: `select "value" from "cpu" where "foo"::tag='b\'ar' and time >= now()`,
},
{
s: Series{
@@ -68,7 +68,7 @@ func TestFetchQuery(t *testing.T) {
},
},
timeFilter: "time >= now()",
expected: `select "value" from "cpu" where "name"='dev-mapper-centos\\x2dswap.swap' and "state"='dev-mapp\'er-c\'en\'tos' and time >= now()`,
expected: `select "value" from "cpu" where "name"::tag='dev-mapper-centos\\x2dswap.swap' and "state"::tag='dev-mapp\'er-c\'en\'tos' and time >= now()`,
},
{
s: Series{

View File

@@ -10,6 +10,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
@@ -20,9 +21,41 @@ func main() {
start := time.Now()
app := &cli.App{
Name: "vmctl",
Usage: "Victoria metrics command-line tool",
Usage: "VictoriaMetrics command-line tool",
Version: buildinfo.Version,
Commands: []*cli.Command{
{
Name: "opentsdb",
Usage: "Migrate timeseries from OpenTSDB",
Flags: mergeFlags(globalFlags, otsdbFlags, vmFlags),
Action: func(c *cli.Context) error {
fmt.Println("OpenTSDB import mode")
oCfg := opentsdb.Config{
Addr: c.String(otsdbAddr),
Limit: c.Int(otsdbQueryLimit),
Offset: c.Int64(otsdbOffsetDays),
HardTS: c.Int64(otsdbHardTSStart),
Retentions: c.StringSlice(otsdbRetentions),
Filters: c.StringSlice(otsdbFilters),
Normalize: c.Bool(otsdbNormalize),
MsecsTime: c.Bool(otsdbMsecsTime),
}
otsdbClient, err := opentsdb.NewClient(oCfg)
if err != nil {
return fmt.Errorf("failed to create opentsdb client: %s", err)
}
vmCfg := initConfigVM(c)
importer, err := vm.NewImporter(vmCfg)
if err != nil {
return fmt.Errorf("failed to create VM importer: %s", err)
}
otsdbProcessor := newOtsdbProcessor(otsdbClient, importer, c.Int(otsdbConcurrency))
return otsdbProcessor.run(c.Bool(globalSilent))
},
},
{
Name: "influx",
Usage: "Migrate timeseries from InfluxDB",

164
app/vmctl/opentsdb.go Normal file
View File

@@ -0,0 +1,164 @@
package main
import (
"fmt"
"log"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/cheggaaa/pb/v3"
)
type otsdbProcessor struct {
oc *opentsdb.Client
im *vm.Importer
otsdbcc int
}
type queryObj struct {
Series opentsdb.Meta
Rt opentsdb.RetentionMeta
Tr opentsdb.TimeRange
StartTime int64
}
func newOtsdbProcessor(oc *opentsdb.Client, im *vm.Importer, otsdbcc int) *otsdbProcessor {
if otsdbcc < 1 {
otsdbcc = 1
}
return &otsdbProcessor{
oc: oc,
im: im,
otsdbcc: otsdbcc,
}
}
func (op *otsdbProcessor) run(silent bool) error {
log.Println("Loading all metrics from OpenTSDB for filters: ", op.oc.Filters)
var metrics []string
for _, filter := range op.oc.Filters {
q := fmt.Sprintf("%s/api/suggest?type=metrics&q=%s&max=%d", op.oc.Addr, filter, op.oc.Limit)
m, err := op.oc.FindMetrics(q)
if err != nil {
return fmt.Errorf("metric discovery failed for %q: %s", q, err)
}
metrics = append(metrics, m...)
}
if len(metrics) < 1 {
return fmt.Errorf("found no timeseries to import with filters %q", op.oc.Filters)
}
question := fmt.Sprintf("Found %d metrics to import. Continue?", len(metrics))
if !silent && !prompt(question) {
return nil
}
op.im.ResetStats()
var startTime int64
if op.oc.HardTS != 0 {
startTime = op.oc.HardTS
} else {
startTime = time.Now().Unix()
}
queryRanges := 0
// pre-calculate the number of query ranges we'll be processing
for _, rt := range op.oc.Retentions {
queryRanges += len(rt.QueryRanges)
}
for _, metric := range metrics {
log.Println(fmt.Sprintf("Starting work on %s", metric))
serieslist, err := op.oc.FindSeries(metric)
if err != nil {
return fmt.Errorf("couldn't retrieve series list for %s : %s", metric, err)
}
/*
Create channels for collecting/processing series and errors
We'll create them per metric to reduce pressure against OpenTSDB
Limit the size of seriesCh so we can't get too far ahead of actual processing
*/
seriesCh := make(chan queryObj, op.otsdbcc)
errCh := make(chan error)
// we're going to make serieslist * queryRanges queries, so we should represent that in the progress bar
bar := pb.StartNew(len(serieslist) * queryRanges)
var wg sync.WaitGroup
wg.Add(op.otsdbcc)
for i := 0; i < op.otsdbcc; i++ {
go func() {
defer wg.Done()
for s := range seriesCh {
if err := op.do(s); err != nil {
errCh <- fmt.Errorf("couldn't retrieve series for %s : %s", metric, err)
return
}
bar.Increment()
}
}()
}
/*
Loop through all series for this metric, processing all retentions and time ranges
requested. This loop is our primary "collect data from OpenTSDB loop" and should
be async, sending data to VictoriaMetrics over time.
The idea with having the select at the inner-most loop is to ensure quick
short-circuiting on error.
*/
for _, series := range serieslist {
for _, rt := range op.oc.Retentions {
for _, tr := range rt.QueryRanges {
select {
case otsdbErr := <-errCh:
return fmt.Errorf("opentsdb error: %s", otsdbErr)
case vmErr := <-op.im.Errors():
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
case seriesCh <- queryObj{
Tr: tr, StartTime: startTime,
Series: series, Rt: opentsdb.RetentionMeta{
FirstOrder: rt.FirstOrder, SecondOrder: rt.SecondOrder, AggTime: rt.AggTime}}:
}
}
}
}
// Drain channels per metric
close(seriesCh)
wg.Wait()
close(errCh)
// check for any lingering errors on the query side
for otsdbErr := range errCh {
return fmt.Errorf("Import process failed: \n%s", otsdbErr)
}
bar.Finish()
log.Print(op.im.Stats())
}
op.im.Close()
for vmErr := range op.im.Errors() {
return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
}
log.Println("Import finished!")
log.Print(op.im.Stats())
return nil
}
func (op *otsdbProcessor) do(s queryObj) error {
start := s.StartTime - s.Tr.Start
end := s.StartTime - s.Tr.End
data, err := op.oc.GetData(s.Series, s.Rt, start, end)
if err != nil {
return fmt.Errorf("failed to collect data for %v in %v:%v :: %v", s.Series, s.Rt, s.Tr, err)
}
if len(data.Timestamps) < 1 || len(data.Values) < 1 {
return nil
}
labels := make([]vm.LabelPair, len(data.Tags))
for k, v := range data.Tags {
labels = append(labels, vm.LabelPair{Name: k, Value: v})
}
op.im.Input() <- &vm.TimeSeries{
Name: data.Metric,
LabelPairs: labels,
Timestamps: data.Timestamps,
Values: data.Values,
}
return nil
}

View File

@@ -0,0 +1,349 @@
package opentsdb
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"strings"
"time"
)
// Retention objects contain meta data about what to query for our run
type Retention struct {
/*
OpenTSDB has two levels of aggregation,
First, we aggregate any un-mentioned tags into the last result
Second, we aggregate into buckets over time
To simulate this with config, we have
FirstOrder (e.g. sum/avg/max/etc.)
SecondOrder (e.g. sum/avg/max/etc.)
AggTime (e.g. 1m/10m/1d/etc.)
This will build into m=<FirstOrder>:<AggTime>-<SecondOrder>-none:
Or an example: m=sum:1m-avg-none
*/
FirstOrder string
SecondOrder string
AggTime string
// The actual ranges will will attempt to query (as offsets from now)
QueryRanges []TimeRange
}
// RetentionMeta objects exist to pass smaller subsets (only one retention range) of a full Retention object around
type RetentionMeta struct {
FirstOrder string
SecondOrder string
AggTime string
}
// Client object holds general config about how queries should be performed
type Client struct {
Addr string
// The meta query limit for series returned
Limit int
Retentions []Retention
Filters []string
Normalize bool
HardTS int64
}
// Config contains fields required
// for Client configuration
type Config struct {
Addr string
Limit int
Offset int64
HardTS int64
Retentions []string
Filters []string
Normalize bool
MsecsTime bool
}
// TimeRange contains data about time ranges to query
type TimeRange struct {
Start int64
End int64
}
// MetaResults contains return data from search series lookup queries
type MetaResults struct {
Type string `json:"type"`
Results []Meta `json:"results"`
//metric string
//tags interface{}
//limit int
//time int
//startIndex int
//totalResults int
}
// Meta A meta object about a metric
// only contain the tags/etc. and no data
type Meta struct {
//tsuid string
Metric string `json:"metric"`
Tags map[string]string `json:"tags"`
}
// Metric holds the time series data
type Metric struct {
Metric string
Tags map[string]string
Timestamps []int64
Values []float64
}
// ExpressionOutput contains results from actual data queries
type ExpressionOutput struct {
Outputs []qoObj `json:"outputs"`
Query interface{} `json:"query"`
}
// QoObj contains actual timeseries data from the returned data query
type qoObj struct {
ID string `json:"id"`
Alias string `json:"alias"`
Dps [][]float64 `json:"dps"`
//dpsMeta interface{}
//meta interface{}
}
// Expression objects format our data queries
/*
All of the following structs are to build a OpenTSDB expression object
*/
type Expression struct {
Time timeObj `json:"time"`
Filters []filterObj `json:"filters"`
Metrics []metricObj `json:"metrics"`
// this just needs to be an empty object, so the value doesn't matter
Expressions []int `json:"expressions"`
Outputs []outputObj `json:"outputs"`
}
type timeObj struct {
Start int64 `json:"start"`
End int64 `json:"end"`
Aggregator string `json:"aggregator"`
Downsampler dSObj `json:"downsampler"`
}
type dSObj struct {
Interval string `json:"interval"`
Aggregator string `json:"aggregator"`
FillPolicy fillObj `json:"fillPolicy"`
}
type fillObj struct {
// we'll always hard-code to NaN here, so we don't need value
Policy string `json:"policy"`
}
type filterObj struct {
Tags []tagObj `json:"tags"`
ID string `json:"id"`
}
type tagObj struct {
Type string `json:"type"`
Tagk string `json:"tagk"`
Filter string `json:"filter"`
GroupBy bool `json:"groupBy"`
}
type metricObj struct {
ID string `json:"id"`
Metric string `json:"metric"`
Filter string `json:"filter"`
FillPolicy fillObj `json:"fillPolicy"`
}
type outputObj struct {
ID string `json:"id"`
Alias string `json:"alias"`
}
/* End expression object structs */
var (
exprOutput = outputObj{ID: "a", Alias: "query"}
exprFillPolicy = fillObj{Policy: "nan"}
)
// FindMetrics discovers all metrics that OpenTSDB knows about (given a filter)
// e.g. /api/suggest?type=metrics&q=system&max=100000
func (c Client) FindMetrics(q string) ([]string, error) {
resp, err := http.Get(q)
if err != nil {
return nil, fmt.Errorf("failed to send GET request to %q: %s", q, err)
}
if resp.StatusCode != 200 {
return nil, fmt.Errorf("Bad return from OpenTSDB: %q: %v", resp.StatusCode, resp)
}
defer func() { _ = resp.Body.Close() }()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("could not retrieve metric data from %q: %s", q, err)
}
var metriclist []string
err = json.Unmarshal(body, &metriclist)
if err != nil {
return nil, fmt.Errorf("failed to read response from %q: %s", q, err)
}
return metriclist, nil
}
// FindSeries discovers all series associated with a metric
// e.g. /api/search/lookup?m=system.load5&limit=1000000
func (c Client) FindSeries(metric string) ([]Meta, error) {
q := fmt.Sprintf("%s/api/search/lookup?m=%s&limit=%d", c.Addr, metric, c.Limit)
resp, err := http.Get(q)
if err != nil {
return nil, fmt.Errorf("failed to set GET request to %q: %s", q, err)
}
if resp.StatusCode != 200 {
return nil, fmt.Errorf("Bad return from OpenTSDB: %q: %v", resp.StatusCode, resp)
}
defer func() { _ = resp.Body.Close() }()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("could not retrieve series data from %q: %s", q, err)
}
var results MetaResults
err = json.Unmarshal(body, &results)
if err != nil {
return nil, fmt.Errorf("failed to read response from %q: %s", q, err)
}
return results.Results, nil
}
// GetData actually retrieves data for a series at a specified time range
func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64) (Metric, error) {
/*
Here we build the actual exp query we'll send to OpenTSDB
This is comprised of a number of different settings. We hard-code
a few to simplify the JSON object creation.
There are examples queries available, so not too much detail here...
*/
expr := Expression{}
expr.Outputs = []outputObj{exprOutput}
expr.Metrics = append(expr.Metrics, metricObj{ID: "a", Metric: series.Metric,
Filter: "f1", FillPolicy: exprFillPolicy})
expr.Time = timeObj{Start: start, End: end, Aggregator: rt.FirstOrder,
Downsampler: dSObj{Interval: rt.AggTime,
Aggregator: rt.SecondOrder,
FillPolicy: exprFillPolicy}}
var TagList []tagObj
for k, v := range series.Tags {
/*
every tag should be a literal_or because that's the closest to a full "==" that
this endpoint allows for
*/
TagList = append(TagList, tagObj{Type: "literal_or", Tagk: k,
Filter: v, GroupBy: true})
}
expr.Filters = append(expr.Filters, filterObj{ID: "f1", Tags: TagList})
// "expressions" is required in the query object or we get a 5xx, so force it to exist
expr.Expressions = make([]int, 0)
inputData, err := json.Marshal(expr)
if err != nil {
return Metric{}, fmt.Errorf("failed to marshal query JSON %s", err)
}
q := fmt.Sprintf("%s/api/query/exp", c.Addr)
resp, err := http.Post(q, "application/json", bytes.NewBuffer(inputData))
if err != nil {
return Metric{}, fmt.Errorf("failed to send GET request to %q: %s", q, err)
}
if resp.StatusCode != 200 {
return Metric{}, fmt.Errorf("Bad return from OpenTSDB: %q: %v", resp.StatusCode, resp)
}
defer func() { _ = resp.Body.Close() }()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return Metric{}, fmt.Errorf("could not retrieve series data from %q: %s", q, err)
}
var output ExpressionOutput
err = json.Unmarshal(body, &output)
if err != nil {
return Metric{}, fmt.Errorf("failed to unmarshal response from %q: %s", q, err)
}
if len(output.Outputs) < 1 {
// no results returned...return an empty object without error
return Metric{}, nil
}
data := Metric{}
data.Metric = series.Metric
data.Tags = series.Tags
/*
We evaluate data for correctness before formatting the actual values
to skip a little bit of time if the series has invalid formatting
First step is to enforce Prometheus' data model
*/
data, err = modifyData(data, c.Normalize)
if err != nil {
return Metric{}, fmt.Errorf("invalid series data from %q: %s", q, err)
}
/*
Convert data from OpenTSDB's output format ([[ts,val],[ts,val]...])
to VictoriaMetrics format: {"timestamps": [ts,ts,ts...], "values": [val,val,val...]}
The nasty part here is that because an object in each array
can be a float64, we have to initially cast _all_ objects that way
then convert the timestamp back to something reasonable.
*/
for _, tsobj := range output.Outputs[0].Dps {
data.Timestamps = append(data.Timestamps, int64(tsobj[0]))
data.Values = append(data.Values, tsobj[1])
}
return data, nil
}
// NewClient creates and returns OpenTSDB client
// configured with passed Config
func NewClient(cfg Config) (*Client, error) {
var retentions []Retention
offsetPrint := int64(time.Now().Unix())
if cfg.MsecsTime {
// 1000000 == Nanoseconds -> Milliseconds difference
offsetPrint = int64(time.Now().UnixNano() / 1000000)
}
if cfg.HardTS > 0 {
/*
HardTS is a specific timestamp we'll be starting at.
Just present that if it is defined
*/
offsetPrint = cfg.HardTS
} else if cfg.Offset > 0 {
/*
Our "offset" is the number of days we should step
back before starting to scan for data
*/
if cfg.MsecsTime {
offsetPrint = offsetPrint - (cfg.Offset * 24 * 60 * 60 * 1000)
} else {
offsetPrint = offsetPrint - (cfg.Offset * 24 * 60 * 60)
}
}
log.Println(fmt.Sprintf("Will collect data starting at TS %v", offsetPrint))
for _, r := range cfg.Retentions {
ret, err := convertRetention(r, cfg.Offset, cfg.MsecsTime)
if err != nil {
return &Client{}, fmt.Errorf("Couldn't parse retention %q :: %v", r, err)
}
retentions = append(retentions, ret)
}
client := &Client{
Addr: strings.Trim(cfg.Addr, "/"),
Retentions: retentions,
Limit: cfg.Limit,
Filters: cfg.Filters,
Normalize: cfg.Normalize,
HardTS: cfg.HardTS,
}
return client, nil
}

View File

@@ -0,0 +1,173 @@
package opentsdb
import (
"fmt"
"regexp"
"strconv"
"strings"
"time"
)
var (
allowedNames = regexp.MustCompile("^[a-zA-Z][a-zA-Z0-9_:]*$")
allowedFirstChar = regexp.MustCompile("^[a-zA-Z]")
replaceChars = regexp.MustCompile("[^a-zA-Z0-9_:]")
allowedTagKeys = regexp.MustCompile("^[a-zA-Z][a-zA-Z0-9_]*$")
)
func convertDuration(duration string) (time.Duration, error) {
/*
Golang's time library doesn't support many different
string formats (year, month, week, day) because they
aren't consistent ranges. But Java's library _does_.
Consequently, we'll need to handle all the custom
time ranges, and, to make the internal API call consistent,
we'll need to allow for durations that Go supports, too.
The nice thing is all the "broken" time ranges are > 1 hour,
so we can just make assumptions to convert them to a range in hours.
They aren't *good* assumptions, but they're reasonable
for this function.
*/
var actualDuration time.Duration
var err error
var timeValue int
if strings.HasSuffix(duration, "y") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "y"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
timeValue = timeValue * 365 * 24
actualDuration, err = time.ParseDuration(fmt.Sprintf("%vh", timeValue))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
} else if strings.HasSuffix(duration, "w") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "w"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
timeValue = timeValue * 7 * 24
actualDuration, err = time.ParseDuration(fmt.Sprintf("%vh", timeValue))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
} else if strings.HasSuffix(duration, "d") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "d"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
timeValue = timeValue * 24
actualDuration, err = time.ParseDuration(fmt.Sprintf("%vh", timeValue))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
} else if strings.HasSuffix(duration, "h") || strings.HasSuffix(duration, "m") || strings.HasSuffix(duration, "s") || strings.HasSuffix(duration, "ms") {
actualDuration, err = time.ParseDuration(duration)
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
} else {
return 0, fmt.Errorf("invalid time duration string: %q", duration)
}
return actualDuration, nil
}
// Convert an incoming retention "string" into the component parts
func convertRetention(retention string, offset int64, msecTime bool) (Retention, error) {
/*
A retention string coming in looks like
sum-1m-avg:1h:30d
So we:
1. split on the :
2. split on the - in slice 0
3. create the time ranges we actually need
*/
chunks := strings.Split(retention, ":")
if len(chunks) != 3 {
return Retention{}, fmt.Errorf("invalid retention string: %q", retention)
}
rowLengthDuration, err := convertDuration(chunks[1])
if err != nil {
return Retention{}, fmt.Errorf("invalid row length (first order) duration string: %q: %s", chunks[1], err)
}
// set length of each row in milliseconds, unless we aren't using millisecond time in OpenTSDB...then use seconds
rowLength := rowLengthDuration.Milliseconds()
if !msecTime {
rowLength = rowLength / 1000
}
ttlDuration, err := convertDuration(chunks[2])
if err != nil {
return Retention{}, fmt.Errorf("invalid ttl (second order) duration string: %q: %s", chunks[2], err)
}
// set ttl in milliseconds, unless we aren't using millisecond time in OpenTSDB...then use seconds
ttl := ttlDuration.Milliseconds()
if !msecTime {
ttl = ttl / 1000
}
// bump by the offset so we don't look at empty ranges any time offset > ttl
ttl += offset
var timeChunks []TimeRange
var i int64
for i = offset; i <= ttl; i = i + rowLength {
timeChunks = append(timeChunks, TimeRange{Start: i + rowLength, End: i})
}
// first/second order aggregations for queries defined in chunk 0...
aggregates := strings.Split(chunks[0], "-")
if len(aggregates) != 3 {
return Retention{}, fmt.Errorf("invalid aggregation string: %q", chunks[0])
}
ret := Retention{FirstOrder: aggregates[0],
SecondOrder: aggregates[2],
AggTime: aggregates[1],
QueryRanges: timeChunks}
return ret, nil
}
// This ensures any incoming data from OpenTSDB matches the Prometheus data model
// https://prometheus.io/docs/concepts/data_model
func modifyData(msg Metric, normalize bool) (Metric, error) {
finalMsg := Metric{
Metric: "", Tags: make(map[string]string),
Timestamps: msg.Timestamps, Values: msg.Values,
}
// if the metric name has invalid characters, the data model says to drop it
if !allowedFirstChar.MatchString(msg.Metric) {
return Metric{}, fmt.Errorf("%s has a bad first character", msg.Metric)
}
name := msg.Metric
// if normalization requested, lowercase the name
if normalize {
name = strings.ToLower(name)
}
/*
replace bad characters in metric name with _ per the data model
only replace if needed to reduce string processing time
*/
if !allowedNames.MatchString(name) {
finalMsg.Metric = replaceChars.ReplaceAllString(name, "_")
} else {
finalMsg.Metric = name
}
// replace bad characters in tag keys with _ per the data model
for key, value := range msg.Tags {
// if normalization requested, lowercase the key and value
if normalize {
key = strings.ToLower(key)
value = strings.ToLower(value)
}
/*
replace all explicitly bad characters with _
only replace if needed to reduce string processing time
*/
if !allowedTagKeys.MatchString(key) {
key = replaceChars.ReplaceAllString(key, "_")
}
// tags that start with __ are considered custom stats for internal prometheus stuff, we should drop them
if !strings.HasPrefix(key, "__") {
finalMsg.Tags[key] = value
}
}
return finalMsg, nil
}

View File

@@ -0,0 +1,217 @@
package opentsdb
import (
"testing"
)
func TestConvertRetention(t *testing.T) {
/*
2592000 seconds in 30 days
3600 in one hour
2592000 / 3600 = 720 individual query "ranges" should exist, plus one because time ranges can be weird
First order should == "sum"
Second order should == "avg"
AggTime should == "1m"
*/
res, err := convertRetention("sum-1m-avg:1h:30d", 0, false)
if err != nil {
t.Fatalf("Error parsing valid retention string: %v", err)
}
if len(res.QueryRanges) != 721 {
t.Fatalf("Found %v query ranges. Should have found 720", len(res.QueryRanges))
}
if res.FirstOrder != "sum" {
t.Fatalf("Incorrect first order aggregation %q. Should have been 'sum'", res.FirstOrder)
}
if res.SecondOrder != "avg" {
t.Fatalf("Incorrect second order aggregation %q. Should have been 'avg'", res.SecondOrder)
}
if res.AggTime != "1m" {
t.Fatalf("Incorrect aggregation time length %q. Should have been '1m'", res.AggTime)
}
/*
Invalid retention string
*/
res, err = convertRetention("sum-1m-avg:30d", 0, false)
if err == nil {
t.Fatalf("Bad retention string (sum-1m-avg:30d) didn't fail: %v", res)
}
/*
Invalid aggregation string
*/
res, err = convertRetention("sum-1m:1h:30d", 0, false)
if err == nil {
t.Fatalf("Bad aggregation string (sum-1m:1h:30d) didn't fail: %v", res)
}
}
func TestModifyData(t *testing.T) {
/*
Good metric metadata
*/
m := Metric{
Metric: "cpu",
Tags: map[string]string{
"core": "0",
},
Values: []float64{
0,
},
Timestamps: []int64{
0,
},
}
res, err := modifyData(m, false)
if err != nil {
t.Fatalf("Valid metric %v failed to parse: %v", m, err)
}
if res.Metric != "cpu" {
t.Fatalf("Valid metric name %q was converted: %q", m.Metric, res.Metric)
}
found := false
for k := range res.Tags {
if k == "core" {
found = true
break
}
}
if !found {
t.Fatalf("Valid metric tag name 'core' missing: %v", res.Tags)
}
/*
Bad first character in metric name
metric names cannot start with _, so this
metric should fail entirely
*/
m = Metric{
Metric: "_cpu",
Tags: map[string]string{
"core": "0",
},
Values: []float64{
0,
},
Timestamps: []int64{
0,
},
}
res, err = modifyData(m, false)
if err == nil {
t.Fatalf("Invalid metric %v parsed?", m)
}
/*
Bad character in metric name
metric names cannot have `.`, so this
should be converted to `_`
*/
m = Metric{
Metric: "cpu.name",
Tags: map[string]string{
"core": "0",
},
Values: []float64{
0,
},
Timestamps: []int64{
0,
},
}
res, err = modifyData(m, false)
if err != nil {
t.Fatalf("Valid metric failed to parse? %v", err)
}
if res.Metric != "cpu_name" {
t.Fatalf("Metric name not correctly converted from 'cpu.name' to 'cpu_name': %q", res.Metric)
}
/*
bad tag prefix (__)
Prometheus considers tags beginning with __
to be internal use only. They should not show up in incoming data.
this tag should be dropped from the result
*/
m = Metric{
Metric: "cpu",
Tags: map[string]string{
"__core": "0",
},
Values: []float64{
0,
},
Timestamps: []int64{
0,
},
}
res, err = modifyData(m, false)
if err != nil {
t.Fatalf("Valid metric failed to parse? %v", err)
}
found = false
for k := range res.Tags {
if k == "__core" {
found = true
break
}
}
if found {
t.Fatalf("Bad tag key prefix (__) found")
}
/*
bad tag key
tag keys cannot contain `.`, this should be
replaced with `_`
*/
m = Metric{
Metric: "cpu",
Tags: map[string]string{
"core.name": "0",
},
Values: []float64{
0,
},
Timestamps: []int64{
0,
},
}
res, err = modifyData(m, false)
if err != nil {
t.Fatalf("Valid metric failed to parse? %v", err)
}
found = false
for k := range res.Tags {
if k == "core.name" {
found = true
break
}
}
if found {
t.Fatalf("Bad tag key 'core.name' not converted")
}
/*
test normalize
All characters should be returned lowercase
*/
m = Metric{
Metric: "CPU",
Tags: map[string]string{
"core": "0",
},
Values: []float64{
0,
},
Timestamps: []int64{
0,
},
}
res, err = modifyData(m, true)
if err != nil {
t.Fatalf("Valid metric failed to parse? %v", err)
}
if res.Metric != "cpu" {
t.Fatalf("Normalization of metric name didn't happen!")
}
}

View File

@@ -0,0 +1,398 @@
{
"outputs": [
{
"id": "a",
"alias": "query",
"dps": [
[
1614099600000,
0.28
],
[
1614099660000,
0.22
],
[
1614099720000,
0.18
],
[
1614099780000,
0.14
],
[
1614099840000,
0.24
],
[
1614099900000,
0.19
],
[
1614099960000,
0.22
],
[
1614100020000,
0.2
],
[
1614100080000,
0.18
],
[
1614100140000,
0.22
],
[
1614100200000,
0.17
],
[
1614100260000,
0.16
],
[
1614100320000,
0.22
],
[
1614100380000,
0.3
],
[
1614100440000,
0.28
],
[
1614100500000,
0.27
],
[
1614100560000,
0.26
],
[
1614100620000,
0.23
],
[
1614100680000,
0.18
],
[
1614100740000,
0.3
],
[
1614100800000,
0.24
],
[
1614100860000,
0.19
],
[
1614100920000,
0.16
],
[
1614100980000,
0.19
],
[
1614101040000,
0.23
],
[
1614101100000,
0.18
],
[
1614101160000,
0.15
],
[
1614101220000,
0.12
],
[
1614101280000,
0.1
],
[
1614101340000,
0.24
],
[
1614101400000,
0.19
],
[
1614101460000,
0.16
],
[
1614101520000,
0.14
],
[
1614101580000,
0.12
],
[
1614101640000,
0.14
],
[
1614101700000,
0.12
],
[
1614101760000,
0.13
],
[
1614101820000,
0.12
],
[
1614101880000,
0.11
],
[
1614101940000,
0.36
],
[
1614102000000,
0.35
],
[
1614102060000,
0.3
],
[
1614102120000,
0.32
],
[
1614102180000,
0.27
],
[
1614102240000,
0.26
],
[
1614102300000,
0.21
],
[
1614102360000,
0.18
],
[
1614102420000,
0.15
],
[
1614102480000,
0.12
],
[
1614102540000,
0.24
],
[
1614102600000,
0.2
],
[
1614102660000,
0.17
],
[
1614102720000,
0.18
],
[
1614102780000,
0.14
],
[
1614102840000,
0.39
],
[
1614102900000,
0.31
],
[
1614102960000,
0.3
],
[
1614103020000,
0.24
],
[
1614103080000,
0.26
],
[
1614103140000,
0.21
],
[
1614103200000,
0.17
],
[
1614103260000,
0.15
],
[
1614103320000,
0.2
],
[
1614103380000,
0.2
],
[
1614103440000,
0.22
],
[
1614103500000,
0.19
],
[
1614103560000,
0.22
],
[
1614103620000,
0.29
],
[
1614103680000,
0.31
],
[
1614103740000,
0.28
],
[
1614103800000,
0.23
]
],
"dpsMeta": {
"firstTimestamp": 1614099600000,
"lastTimestamp": 1614103800000,
"setCount": 71,
"series": 1
},
"meta": [
{
"index": 0,
"metrics": [
"timestamp"
]
},
{
"index": 1,
"metrics": [
"system.load5"
],
"commonTags": {
"rack": "undef",
"host": "use1-mon-metrics-1",
"row": "undef",
"dc": "us-east-1",
"group": "monitoring"
},
"aggregatedTags": []
}
]
}
],
"query": {
"name": null,
"time": {
"start": "1h-ago",
"end": null,
"timezone": null,
"downsampler": {
"interval": "1m",
"aggregator": "avg",
"fillPolicy": {
"policy": "nan",
"value": "NaN"
}
},
"aggregator": "sum",
"rate": false
},
"filters": [
{
"id": "f1",
"tags": [
{
"tagk": "host",
"filter": "use1-mon-metrics-1",
"group_by": true,
"type": "literal_or"
},
{
"tagk": "group",
"filter": "monitoring",
"group_by": true,
"type": "literal_or"
},
{
"tagk": "dc",
"filter": "us-east-1",
"group_by": true,
"type": "literal_or"
},
{
"tagk": "rack",
"filter": "undef",
"group_by": true,
"type": "literal_or"
},
{
"tagk": "row",
"filter": "undef",
"group_by": true,
"type": "literal_or"
}
],
"explicitTags": false
}
],
"metrics": [
{
"metric": "system.load5",
"id": "a",
"filter": "f1",
"aggregator": null,
"timeOffset": null,
"fillPolicy": {
"policy": "nan",
"value": "NaN"
}
}
],
"expressions": [],
"outputs": [
{
"id": "a",
"alias": "query"
}
]
}
}

View File

@@ -0,0 +1,62 @@
{
"time": {
"start": "1h-ago",
"aggregator":"sum",
"downsampler": {
"interval": "1m",
"aggregator": "avg",
"fillPolicy": {
"policy": "nan"
}
}
},
"filters": [
{
"tags": [
{
"type": "literal_or",
"tagk": "host",
"filter": "use1-mon-metrics-1",
"groupBy": true
},
{
"type": "literal_or",
"tagk": "group",
"filter": "monitoring",
"groupBy": true
},
{
"type": "literal_or",
"tagk": "dc",
"filter": "us-east-1",
"groupBy": true
},
{
"type": "literal_or",
"tagk": "rack",
"filter": "undef",
"groupBy": true
},
{
"type": "literal_or",
"tagk": "row",
"filter": "undef",
"groupBy": true
}
],
"id": "f1"
}
],
"metrics": [
{
"id": "a",
"metric": "system.load5",
"filter": "f1",
"fillPolicy":{"policy":"nan"}
}
],
"expressions": [],
"outputs":[
{"id":"a", "alias":"query"}
]
}

View File

@@ -56,27 +56,37 @@ func (cw *cWriter) printf(format string, args ...interface{}) {
//"{"metric":{"__name__":"cpu_usage_guest","arch":"x64","hostname":"host_19",},"timestamps":[1567296000000,1567296010000],"values":[1567296000000,66]}
func (ts *TimeSeries) write(w io.Writer) (int, error) {
pointsCount := len(ts.Timestamps)
if pointsCount == 0 {
return 0, nil
}
timestamps := ts.Timestamps
values := ts.Values
cw := &cWriter{w: w}
cw.printf(`{"metric":{"__name__":%q`, ts.Name)
if len(ts.LabelPairs) > 0 {
for len(timestamps) > 0 {
// Split long lines with more than 10K samples into multiple JSON lines.
// This should limit memory usage at VictoriaMetrics during data ingestion,
// since it allocates memory for the whole JSON line and processes it in one go.
batchSize := 10000
if batchSize > len(timestamps) {
batchSize = len(timestamps)
}
timestampsBatch := timestamps[:batchSize]
valuesBatch := values[:batchSize]
timestamps = timestamps[batchSize:]
values = values[batchSize:]
cw.printf(`{"metric":{"__name__":%q`, ts.Name)
for _, lp := range ts.LabelPairs {
cw.printf(",%q:%q", lp.Name, lp.Value)
}
}
cw.printf(`},"timestamps":[`)
for i := 0; i < pointsCount-1; i++ {
cw.printf(`%d,`, ts.Timestamps[i])
pointsCount := len(timestampsBatch)
cw.printf(`},"timestamps":[`)
for i := 0; i < pointsCount-1; i++ {
cw.printf(`%d,`, timestampsBatch[i])
}
cw.printf(`%d],"values":[`, timestampsBatch[pointsCount-1])
for i := 0; i < pointsCount-1; i++ {
cw.printf(`%v,`, valuesBatch[i])
}
cw.printf("%v]}\n", valuesBatch[pointsCount-1])
}
cw.printf(`%d],"values":[`, ts.Timestamps[pointsCount-1])
for i := 0; i < pointsCount-1; i++ {
cw.printf(`%v,`, ts.Values[i])
}
cw.printf("%v]}\n", ts.Values[pointsCount-1])
return cw.n, cw.err
}

View File

@@ -21,7 +21,7 @@ import (
type Config struct {
// VictoriaMetrics address to perform import requests
// --httpListenAddr value for single node version
// --httpListenAddr value of VMSelect component for cluster version
// --httpListenAddr value of vmselect component for cluster version
Addr string
// Concurrency defines number of worker
// performing the import requests concurrently
@@ -51,7 +51,7 @@ type Config struct {
// Importer performs insertion of timeseries
// via VictoriaMetrics import protocol
// see https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master#how-to-import-time-series-data
// see https://docs.victoriametrics.com/#how-to-import-time-series-data
type Importer struct {
addr string
importPath string
@@ -105,11 +105,11 @@ func NewImporter(cfg Config) (*Importer, error) {
addr := strings.TrimRight(cfg.Addr, "/")
// if single version
// see https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master#how-to-import-time-series-data
// see https://docs.victoriametrics.com/#how-to-import-time-series-data
importPath := addr + "/api/v1/import"
if cfg.AccountID != "" {
// if cluster version
// see https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster#url-format
// see https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format
importPath = fmt.Sprintf("%s/insert/%s/prometheus/api/v1/import", addr, cfg.AccountID)
}
importPath, err := AddExtraLabelsToImportPath(importPath, cfg.ExtraLabels)

289
app/vmgateway/README.md Normal file
View File

@@ -0,0 +1,289 @@
# vmgateway
***vmgateway is a part of [enterprise package](https://victoriametrics.com/enterprise.html)***
<img alt="vmgateway" src="vmgateway-overview.jpeg">
`vmgateway` is a proxy for the VictoriaMetrics Time Series Database (TSDB). It provides the following features:
* Rate Limiter
* Based on cluster tenant's utilization, it supports multiple time interval limits for both the ingestion and retrieval of metrics
* Token Access Control
* Supports additional per-label access control for both the Single and Cluster versions of the VictoriaMetrics TSDB
* Provides access by tenantID in the Cluster version
* Allows for separate write/read/admin access to data
`vmgateway` is included in our [enterprise packages](https://victoriametrics.com/enterprise.html).
## Access Control
<img alt="vmgateway-ac" src="vmgateway-access-control.jpg">
`vmgateway` supports jwt based authentication. With jwt payload can be configured to give access to specific tenants and labels as well as to read/write.
jwt token must be in following format:
```json
{
"exp": 1617304574,
"vm_access": {
"tenant_id": {
"account_id": 1,
"project_id": 5
},
"extra_labels": {
"team": "dev",
"project": "mobile"
},
"mode": 1
}
}
```
Where:
- `exp` - required, expire time in unix_timestamp. If the token expires then `vmgateway` rejects the request.
- `vm_access` - required, dict with claim info, minimum form: `{"vm_access": {"tenand_id": {}}`
- `tenant_id` - optional, for cluster mode, routes requests to the corresponding tenant.
- `extra_labels` - optional, key-value pairs for label filters added to the ingested or selected metrics.
- `mode` - optional, access mode for api - read, write, or full. Supported values: 0 - full (default value), 1 - read, 2 - write.
## QuickStart
Start the single version of VictoriaMetrics
```bash
# single
# start node
./bin/victoria-metrics --selfScrapeInterval=10s
```
Start vmgateway
```bash
./bin/vmgateway -eula -enable.auth -read.url http://localhost:8428 --write.url http://localhost:8428
```
Retrieve data from the database
```bash
curl 'http://localhost:8431/api/v1/series/count' -H 'Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ2bV9hY2Nlc3MiOnsidGVuYW50X2lkIjp7fSwicm9sZSI6MX0sImV4cCI6MTkzOTM0NjIxMH0.5WUxEfdcV9hKo4CtQdtuZYOGpGXWwaqM9VuVivMMrVg'
```
A request with an incorrect token or without any token will be rejected:
```bash
curl 'http://localhost:8431/api/v1/series/count'
curl 'http://localhost:8431/api/v1/series/count' -H 'Authorization: Bearer incorrect-token'
```
## Rate Limiter
<img alt="vmgateway-rl" src="vmgateway-rate-limiting.jpg">
Limits incoming requests by given, pre-configured limits. It supports read and write limiting by tenant.
`vmgateway` needs a datasource for rate limit queries. It can be either single-node or cluster version of `victoria-metrics`.
The metrics that you want to rate limit must be scraped from the cluster.
List of supported limit types:
- `queries` - count of api requests made at tenant to read the api, such as `/api/v1/query`, `/api/v1/series` and others.
- `active_series` - count of current active series at any given tenant.
- `new_series` - count of created series; aka churn rate
- `rows_inserted` - count of inserted rows per tenant.
List of supported time windows:
- `minute`
- `hour`
Limits can be specified per tenant or at a global level if you omit `project_id` and `account_id`.
Example of configuration file:
```yaml
limits:
- type: queries
value: 1000
resolution: minute
- type: queries
value: 10000
resolution: hour
- type: queries
value: 10
resolution: minute
project_id: 5
account_id: 1
```
## QuickStart
cluster version of VictoriaMetrics is required for rate limiting.
```bash
# start datasource for cluster metrics
cat << EOF > cluster.yaml
scrape_configs:
- job_name: cluster
scrape_interval: 5s
static_configs:
- targets: ['127.0.0.1:8481','127.0.0.1:8482','127.0.0.1:8480']
EOF
./bin/victoria-metrics --promscrape.config cluster.yaml
# start cluster
# start vmstorage, vmselect and vminsert
./bin/vmstorage -eula
./bin/vmselect -eula -storageNode 127.0.0.1:8401
./bin/vminsert -eula -storageNode 127.0.0.1:8400
# create base rate limitng config:
cat << EOF > limit.yaml
limits:
- type: queries
value: 100
- type: rows_inserted
value: 100000
- type: new_series
value: 1000
- type: active_series
value: 100000
- type: queries
value: 1
account_id: 15
EOF
# start gateway with clusterMoe
./bin/vmgateway -eula -enable.rateLimit -ratelimit.config limit.yaml -datasource.url http://localhost:8428 -enable.auth -clusterMode -write.url=http://localhost:8480 --read.url=http://localhost:8481
# ingest simple metric to tenant 1:5
curl 'http://localhost:8431/api/v1/import/prometheus' -X POST -d 'foo{bar="baz1"} 123' -H 'Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2MjAxNjIwMDAwMDAsInZtX2FjY2VzcyI6eyJ0ZW5hbnRfaWQiOnsiYWNjb3VudF9pZCI6MTV9fX0.PB1_KXDKPUp-40pxOGk6lt_jt9Yq80PIMpWVJqSForQ'
# read metric from tenant 1:5
curl 'http://localhost:8431/api/v1/labels' -H 'Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2MjAxNjIwMDAwMDAsInZtX2FjY2VzcyI6eyJ0ZW5hbnRfaWQiOnsiYWNjb3VudF9pZCI6MTV9fX0.PB1_KXDKPUp-40pxOGk6lt_jt9Yq80PIMpWVJqSForQ'
# check rate limit
```
## Configuration
The shortlist of configuration flags include the following:
```console
-clusterMode
enable this for the cluster version
-datasource.appendTypePrefix
Whether to add type prefix to -datasource.url based on the query type. Set to true if sending different query types to the vmselect URL.
-datasource.basicAuth.password string
Optional basic auth password for -datasource.url
-datasource.basicAuth.username string
Optional basic auth username for -datasource.url
-datasource.lookback duration
Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
-datasource.maxIdleConnections int
Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
-datasource.queryStep duration
queryStep defines how far a value can fallback to when evaluating queries. For example, if datasource.queryStep=15s then param "step" with value "15s" will be added to every query.
-datasource.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default, system CA is used
-datasource.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
-datasource.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -datasource.url
-datasource.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
-datasource.tlsServerName string
Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used
-datasource.url string
VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428
-enable.auth
enables auth with jwt token
-enable.rateLimit
enables rate limiter
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-eula
By specifying this flag, you confirm that you have an enterprise license and accept the EULA https://victoriametrics.com/assets/VM_EULA.pdf
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches as they cannot read data files larger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
TCP address to listen for http connections (default ":8431")
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-loggerTimezone string
Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC")
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit
-memory.allowedBytes size
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-pprofAuthKey string
Auth key for /debug/pprof. It overrides httpAuth settings
-ratelimit.config string
path for configuration file
-ratelimit.extraLabels array
additional labels, that will be applied to fetchdata from datasource
Supports an array of values separated by comma or specified via multiple flags.
-ratelimit.refreshInterval duration
(default 5s)
-read.url string
read access url address, example: http://vmselect:8481
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
-version
Show VictoriaMetrics version
-write.url string
write access url address, example: http://vminsert:8480
```
## TroubleShooting
* Access control:
* incorrect `jwt` format, try https://jwt.io/#debugger-io with our tokens
* expired token, check `exp` field.
* Rate Limiting:
* `scrape_interval` at datasource, reduce it to apply limits faster.
## Limitations
* Access Control:
* `jwt` token must be validated by external system, currently `vmgateway` can't validate the signature.
* RateLimiting:
* limits applied based on queries to `datasource.url`
* only cluster version can be rate-limited.

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

BIN
app/vmgateway/vmgateway.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

View File

@@ -14,7 +14,7 @@ import (
// InsertCtx contains common bits for data points insertion.
type InsertCtx struct {
Labels []prompb.Label
Labels sortedLabels
mrs []storage.MetricRow
metricNamesBuf []byte

View File

@@ -0,0 +1,32 @@
package common
import (
"flag"
"sort"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
)
var sortLabels = flag.Bool("sortLabels", false, `Whether to sort labels for incoming samples before writing them to storage. `+
`This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. `+
`For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. `+
`Enabled sorting for labels can slow down ingestion performance a bit`)
// SortLabelsIfNeeded sorts labels if -sortLabels command-line flag is set
func (ctx *InsertCtx) SortLabelsIfNeeded() {
if *sortLabels {
sort.Sort(&ctx.Labels)
}
}
type sortedLabels []prompb.Label
func (sl *sortedLabels) Len() int { return len(*sl) }
func (sl *sortedLabels) Less(i, j int) bool {
a := *sl
return string(a[i].Name) < string(a[j].Name)
}
func (sl *sortedLabels) Swap(i, j int) {
a := *sl
a[i], a[j] = a[j], a[i]
}

View File

@@ -55,6 +55,7 @@ func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
// Skip metric without labels.
continue
}
ctx.SortLabelsIfNeeded()
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
}

View File

@@ -45,6 +45,7 @@ func insertRows(rows []parser.Row) error {
// Skip metric without labels.
continue
}
ctx.SortLabelsIfNeeded()
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
}

View File

@@ -96,7 +96,8 @@ func insertRows(db string, rows []parser.Row, extraLabels []prompbmarshal.Label)
if !*skipMeasurement {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, r.Measurement...)
}
skipFieldKey := len(r.Fields) == 1 && *skipSingleField
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1139
skipFieldKey := len(r.Measurement) > 0 && len(r.Fields) == 1 && *skipSingleField
if len(ctx.metricGroupBuf) > 0 && !skipFieldKey {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, *measurementFieldSeparator...)
}
@@ -116,11 +117,13 @@ func insertRows(db string, rows []parser.Row, extraLabels []prompbmarshal.Label)
// Skip metric without labels.
continue
}
ic.SortLabelsIfNeeded()
if err := ic.WriteDataPoint(nil, ic.Labels, r.Timestamp, f.Value); err != nil {
return err
}
}
} else {
ic.SortLabelsIfNeeded()
ctx.metricNameBuf = storage.MarshalMetricNameRaw(ctx.metricNameBuf[:0], ic.Labels)
labelsLen := len(ic.Labels)
for j := range r.Fields {

View File

@@ -6,6 +6,7 @@ import (
"net/http"
"strings"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite"
@@ -19,6 +20,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/relabel"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/influxutils"
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
@@ -34,7 +36,7 @@ import (
var (
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. "+
"This flag isn't needed when ingesting data over HTTP - just send it to `http://<victoriametrics>:8428/write`")
"This flag isn't needed when ingesting data over HTTP - just send it to http://<victoriametrics>:8428/write")
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpentTSDB metrics. "+
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
"Usually :4242 must be set. Doesn't work if empty")
@@ -43,8 +45,8 @@ var (
)
var (
influxServer *influxserver.Server
graphiteServer *graphiteserver.Server
influxServer *influxserver.Server
opentsdbServer *opentsdbserver.Server
opentsdbhttpServer *opentsdbhttpserver.Server
)
@@ -55,12 +57,12 @@ func Init() {
storage.SetMaxLabelsPerTimeseries(*maxLabelsPerTimeseries)
common.StartUnmarshalWorkers()
writeconcurrencylimiter.Init()
if len(*influxListenAddr) > 0 {
influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
}
if len(*graphiteListenAddr) > 0 {
graphiteServer = graphiteserver.MustStart(*graphiteListenAddr, graphite.InsertHandler)
}
if len(*influxListenAddr) > 0 {
influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
}
if len(*opentsdbListenAddr) > 0 {
opentsdbServer = opentsdbserver.MustStart(*opentsdbListenAddr, opentsdb.InsertHandler, opentsdbhttp.InsertHandler)
}
@@ -73,12 +75,12 @@ func Init() {
// Stop stops vminsert.
func Stop() {
promscrape.Stop()
if len(*influxListenAddr) > 0 {
influxServer.MustStop()
}
if len(*graphiteListenAddr) > 0 {
graphiteServer.MustStop()
}
if len(*influxListenAddr) > 0 {
influxServer.MustStop()
}
if len(*opentsdbListenAddr) > 0 {
opentsdbServer.MustStop()
}
@@ -90,13 +92,16 @@ func Stop() {
// RequestHandler is a handler for Prometheus remote storage write API
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
startTime := time.Now()
defer requestDuration.UpdateDuration(startTime)
path := strings.Replace(r.URL.Path, "//", "/", -1)
switch path {
case "/prometheus/api/v1/write", "/api/v1/write":
prometheusWriteRequests.Inc()
if err := promremotewrite.InsertHandler(r); err != nil {
prometheusWriteErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -105,7 +110,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
vmimportRequests.Inc()
if err := vmimport.InsertHandler(r); err != nil {
vmimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -114,7 +119,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
csvimportRequests.Inc()
if err := csvimport.InsertHandler(r); err != nil {
csvimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -123,7 +128,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
prometheusimportRequests.Inc()
if err := prometheusimport.InsertHandler(r); err != nil {
prometheusimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -132,7 +137,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
nativeimportRequests.Inc()
if err := native.InsertHandler(r); err != nil {
nativeimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -141,16 +146,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
influxWriteRequests.Inc()
if err := influx.InsertHandlerForHTTP(r); err != nil {
influxWriteErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "%s", err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/influx/query", "/query":
// Emulate fake response for influx query.
// This is required for TSBS benchmark.
influxQueryRequests.Inc()
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
influxutils.WriteDatabaseNames(w)
return true
case "/prometheus/targets", "/targets":
promscrapeTargetsRequests.Inc()
@@ -184,6 +187,8 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
var (
requestDuration = metrics.NewHistogram(`vminsert_request_duration_seconds`)
prometheusWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/write", protocol="promremotewrite"}`)
prometheusWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/write", protocol="promremotewrite"}`)

View File

@@ -65,6 +65,7 @@ func insertRows(block *parser.Block, extraLabels []prompbmarshal.Label) error {
// Skip metric without labels.
return nil
}
ic.SortLabelsIfNeeded()
ctx.metricNameBuf = storage.MarshalMetricNameRaw(ctx.metricNameBuf[:0], ic.Labels)
values := block.Values
timestamps := block.Timestamps

View File

@@ -45,6 +45,7 @@ func insertRows(rows []parser.Row) error {
// Skip metric without labels.
continue
}
ctx.SortLabelsIfNeeded()
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
}

View File

@@ -63,6 +63,7 @@ func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
// Skip metric without labels.
continue
}
ctx.SortLabelsIfNeeded()
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
}

View File

@@ -60,6 +60,7 @@ func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
// Skip metric without labels.
continue
}
ctx.SortLabelsIfNeeded()
if err := ctx.WriteDataPoint(nil, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
}

Some files were not shown because too many files have changed in this diff Show More