Compare commits

...

302 Commits

Author SHA1 Message Date
Aliaksandr Valialkin
efc8e3c523 Makefile: suppress false positives for golangci-lint on nil pointer dereference 2020-05-07 19:41:42 +03:00
Aliaksandr Valialkin
51291015a5 app/vmagent: return 200 from /-/reload endpoint as Prometheus does 2020-05-07 19:30:30 +03:00
Aliaksandr Valialkin
099e44005b lib/httpserver: add -http.shutdownDelay flag for a grace period before http server shutdown
The http server returns 503 non-OK error at `/health` page during grace period,
so load balancers in front of the http server could re-route incoming requests
to other servers.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/463
2020-05-07 15:30:35 +03:00
Aliaksandr Valialkin
787fcfba0c lib/httpserver: reduce typical duration for http server graceful shutdown
Previously the duration for graceful shutdown for http server could take more than a minute
because of improperly set timeouts in setNetworkTimeout.
Now typical duration for graceful shutdown should be reduced to less than 5 seconds.
2020-05-07 14:12:39 +03:00
Aliaksandr Valialkin
6afb25fd08 docs/{vmagent,vmauth}: small clarifications in the docs 2020-05-07 12:55:20 +03:00
Aliaksandr Valialkin
653d51694a app/vmauth: prevent from attacks with .. in path for accessing resources outside the configured url_prefix 2020-05-07 12:55:18 +03:00
Aliaksandr Valialkin
91a49eecea lib/flagutil: make errcheck happy by explicitly ignoring Array.Set result in tests 2020-05-06 22:37:39 +03:00
Aliaksandr Valialkin
c4c447507d lib/flagutil: properly parse quoted flag values for flagutil.Array 2020-05-06 22:27:21 +03:00
Aliaksandr Valialkin
8a00807f60 app/vmagent: allow setting independent auth configs per each configured -remoteWrite.url 2020-05-06 16:51:41 +03:00
Aliaksandr Valialkin
b69eb7bf38 app/vmagent: properly set client-side TLS certificates for -remoteWrite.url. Previously they were mistakenly set as server-side 2020-05-06 16:50:30 +03:00
Aliaksandr Valialkin
68928bf3df lib/promscrape/discovery/gce: discover per-zone instances for gce_sd_config in parallel. This should reduce discovery latency 2020-05-06 15:00:09 +03:00
Aliaksandr Valialkin
e8936c9cb3 docs/vmagent.md: small fixes 2020-05-06 14:49:18 +03:00
Aliaksandr Valialkin
3f52a97f9b lib/promscrape: add Prometheus-compatible DNS-based service discovery aka dns_sd_configs 2020-05-06 00:01:58 +03:00
Aliaksandr Valialkin
364789c24c lib/promscrape: properly connect to TCP6 addresses if -enableTCP6 is set 2020-05-06 00:01:57 +03:00
Aliaksandr Valialkin
08320cfcf4 docs/{vmauth,vmagent}: fix ports for profiling 2020-05-05 20:15:47 +03:00
Aliaksandr Valialkin
f65930b34d docs/vmauth.md: mention that we can help creating customized proxy 2020-05-05 12:34:42 +03:00
Aliaksandr Valialkin
266327642b docs/{vmagent,vmauth}: add Profiling section 2020-05-05 11:45:13 +03:00
Aliaksandr Valialkin
0c7cddfca6 docs: add vmauth.md 2020-05-05 11:17:23 +03:00
Aliaksandr Valialkin
e767aedd17 app/vmauth: add initial version of vmauth. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md for details 2020-05-05 10:54:17 +03:00
Aliaksandr Valialkin
b5a780930d docs/vmagent.md: /targets page doesn't expose information about improperly configured scrape configs now. It is written in error log instead 2020-05-05 10:54:14 +03:00
Aliaksandr Valialkin
7b5ef63384 lib/procutil: add NewSighupChan function, which returns a channel, which is triggered on every SIGHUP 2020-05-05 10:54:09 +03:00
Aliaksandr Valialkin
1aea001532 docs/vmalert.md: sync with app/vmalert/README.md 2020-05-05 07:50:57 +03:00
Aliaksandr Valialkin
4fa817be10 lib/promscrape: allow explicitly setting empty token via token: "" in consul_sd_config 2020-05-05 07:50:15 +03:00
Aliaksandr Valialkin
8c77faec96 make vendor update 2020-05-05 00:54:38 +03:00
Roman Khavronenko
0ba1b5c71b app/vmalert: restore alerts state from datasource metrics (#461)
* app/vmalert: restore alerts state from datasource metrics

Vmalert will restore alerts state for rules that have `rule.For` > 0 from previously written timeseries via `remotewrite.url` flag.

* app/vmalert: mention remotewrite and remoteread configuration in README
2020-05-05 00:51:22 +03:00
Aliaksandr Valialkin
40c3ffb359 lib/promscrape: add Prometheus-compatible service discovery for Consul aka consul_sd_configs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/330
2020-05-04 20:51:17 +03:00
Aliaksandr Valialkin
83f0e35b7b lib/promauth: properly set up client certificate in tls.Config
Previously the client certificate has been mistakenly set up as a server certificate
2020-05-04 20:51:08 +03:00
Aliaksandr Valialkin
218e566647 lib/promscrape: move common code for discovery api config map handling into discoveryutils 2020-05-04 20:51:01 +03:00
Aliaksandr Valialkin
6310b20e72 lib/promscrape/discovery/kubernetes/: unify apiConfig creation 2020-05-04 20:50:49 +03:00
Aliaksandr Valialkin
d17381037e vendor: update github.com/valyala/quicktemplate from v1.4.1 to v1.5.0 2020-05-04 01:36:41 +03:00
Aliaksandr Valialkin
6c68b8aa81 docs/Single-server-VictoriaMetrics.md: mention that it is recommended upgrading to the latest release before reporting issues 2020-05-04 00:41:47 +03:00
Aliaksandr Valialkin
23010e6321 docs/Cluster-VictoriaMetrics.md: add Multitenancy chapter 2020-05-03 18:01:26 +03:00
Aliaksandr Valialkin
66b0ae79a5 lib/promscrape: remove debug line left after the commit e4aac6ea40 2020-05-03 17:15:32 +03:00
Aliaksandr Valialkin
69004a5f67 lib/promscrape: fix tests after the commit 658a8742ac
The original commit copies `__address__` label to `instance` label when generating per-target labels as Prometheus does.

See https://www.robustperception.io/life-of-a-label for details.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/453
2020-05-03 16:56:15 +03:00
DexterZhang
658a8742ac fix(vmagent): different behavior as how prometheus deal with labels. [Issue#453] (#454) 2020-05-03 16:51:03 +03:00
Aliaksandr Valialkin
e4aac6ea40 lib/promscrape: make consistent scrape time offsets across reloads for the same ScrapeURL and Labels
This should make consistent intervals between data points for scrape targets across reloads.
Previously these intervals were random.
2020-05-03 14:30:21 +03:00
Aliaksandr Valialkin
d9f1b4d6a3 lib/promscrape: fix TestGetFileSDScrapeWorkSuccess after 3b234d82e5 2020-05-03 14:28:31 +03:00
Aliaksandr Valialkin
3b234d82e5 lib/promscrape: reload only modified scrapers on config changes
This should improve scrape stability when big number of targets are scraped and these targets are frequently changed.

Thanks to @xbsura for the idea and initial implementation attempts at the following pull requests:

- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/449
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/458
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/459
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/460
2020-05-03 12:45:40 +03:00
Aliaksandr Valialkin
d2af2c8c3e docs/MetricsQL.md: document first_over_time and last_over_time functions 2020-05-02 22:43:29 +03:00
Aliaksandr Valialkin
ee810e5f3a lib/httpserver: rename http.externalURL to http.pathPrefix and improve help message for this flag
The `http.externalURL` flag name was slightly misleading, so it has been renamed to `http.pathPrefix`.
2020-05-02 13:07:34 +03:00
DexterZhang
34743974d5 feat(httpserver): add http.externalUrl config to http server, it adds prefix to http path automatically (#452) 2020-05-02 12:42:53 +03:00
Aliaksandr Valialkin
09f5d0056f docs/Single-server-VictoriaMetrics.md: hint that \n is a single newline char 2020-05-01 13:41:55 +03:00
Aliaksandr Valialkin
432187ac3b app/vminsert: add /-/reload handler in the same way as for vmagent 2020-04-30 02:15:39 +03:00
Aliaksandr Valialkin
825a2dd554 lib/procutil: prevent from app termination on SIGHUP signal, since this signal is frequently used for config reload 2020-04-30 02:09:27 +03:00
DexterZhang
67511d4165 feat(vmagent): add promscrape config reload support via http (#450)
* feat(vmagent): add promscrape config reload support via http endpoint `/-/reload`

* fix: typo fix
2020-04-30 02:00:32 +03:00
Aliaksandr Valialkin
01c17092e1 lib/httpserver: mention that -http.maxGracefulShutdownDuration command-line flag value can be increased on shutdown timeout 2020-04-30 01:38:06 +03:00
Aliaksandr Valialkin
7d36616b93 docs/Single-server-VictoriaMetrics.md: mention that it is better to increase CPU and RAM per vmselect node in order to achieve higher query performance 2020-04-30 00:53:53 +03:00
Aliaksandr Valialkin
d0ebbb166e docs: add vmalert.md 2020-04-29 17:42:06 +03:00
Aliaksandr Valialkin
8b2f54d7cd docs/Single-server-VictoriaMetrics.md: update Alerting section 2020-04-29 17:39:21 +03:00
Aliaksandr Valialkin
5ec036439d lib/promscrape: set 30 seconds timeout for discovery api requests
Previously such requests could hang for long time. This could make debugging harder.
2020-04-29 17:33:34 +03:00
Aliaksandr Valialkin
43c39dc36c vendor: use github.com/VictoriaMetrics/fasthttp instead of github.com/fasthttp/fasthttp
The upstream fasthttp may contain issues like 996610f021 ,
plus a code that isn't used by VictoriaMetrics. So let's use a private copy under our control instead.
2020-04-29 17:33:34 +03:00
Artem Navoiev
cc1878607a fix link to vmalert 2020-04-29 17:17:08 +03:00
Artem Navoiev
d8cd69895c update README.md change alerting section 2020-04-29 17:16:13 +03:00
Artem Navoiev
4487b454a8 Update README.md 2020-04-29 12:39:15 +03:00
Aliaksandr Valialkin
e3cc329d85 vendor: downgrade github.com/valyala/fasthttp from v1.12.0 to v0.1.0
The v0.1.0 points to the last verified changes made by me.
I'm afraid that releases after v0.1.0 may contain completely broken changes like
996610f021
2020-04-29 01:09:02 +03:00
Aliaksandr Valialkin
57407cca83 app/vmselect/promql: remove -search.maxPointsPerTimeseries command-line flag
Limit the estimated time series count after aggregation with grouping by the number of source time series.
2020-04-29 00:20:04 +03:00
Aliaksandr Valialkin
4470308d5b docs/Single-server-VictoriaMetrics.md: mention that basic downsampling could be made with the help of de-duplication 2020-04-28 16:38:32 +03:00
Aliaksandr Valialkin
4e4f57b121 lib/metricsql: move it to a separate repository - github.com/VictoriaMetrics/metrics 2020-04-28 15:28:22 +03:00
Aliaksandr Valialkin
17d96e4503 app/vmselect: add -search.estimatedSeriesCountAfterAggregation command-line flag for tuning the probability of OOMs vs false-positive not enough memory errors 2020-04-28 12:52:37 +03:00
Aliaksandr Valialkin
83aca79137 lib/storage: recover when metricID->metricName entry is missing in the inverted index after unclean shutdown
Newly added index entries can be missing after unclean shutdown, since they didn't flush to persistent storage yet.
Log about this and delete the corresponding metricID, so it could be re-created next time.
2020-04-28 12:00:33 +03:00
Aliaksandr Valialkin
1397612117 app/vmalert: added missing comments for public entities 2020-04-28 11:21:07 +03:00
Aliaksandr Valialkin
20b71acf19 docs/Articles.md: add https://zerodha.tech/blog/infra-monitoring-at-zerodha/ 2020-04-28 02:24:16 +03:00
Aliaksandr Valialkin
521df0e2fc lib/promscrape: handle connection reset when target responds with http redirect 2020-04-28 02:13:02 +03:00
肖贝贝
2b16c188e8 fix: vmagent not follow 301/302 redirect bug (#445)
Co-authored-by: xiaobeibei <xiaobeibei@bigo.sg>
2020-04-28 01:29:37 +03:00
Roman Khavronenko
3bfa41a95c app/vmalert: initial remote-write support for alerts state persistence. (#442)
* app/vmalert: initial remote-write support for alerts state persistence.

If `remotewrite.url` flag is set, vmalert will send alerts state  via remote-write protocol to remote storage. The sending is asynchronous to avoid blocking calls in rules evaluation loop.

* app/vmalert: merge with master

* app/vmalert: write both `instant` and `for` alerts timeseries states in remote storage.
2020-04-28 00:18:02 +03:00
Aliaksandr Valialkin
90670cb55e app/vmalert: include it into the next release 2020-04-28 00:10:12 +03:00
Aliaksandr Valialkin
303905cd84 lib/{encoding,decimal}: typo fixes in tests: epxecting->expecting 2020-04-28 00:01:55 +03:00
Aliaksandr Valialkin
36fa3078c2 lib/encoding: reduce possibility of failure in TestMarshalInt64ArraySize 2020-04-28 00:01:54 +03:00
Aliaksandr Valialkin
95942f1ac6 lib/promscrape/discovery/gce: make golangci-lint happy 2020-04-27 19:28:10 +03:00
Aliaksandr Valialkin
b768bc9a6a lib/promscrape: add initial support for Prometheus-compatible service discovery for Amazon EC2 aka ec2_sd_configs 2020-04-27 19:25:53 +03:00
Aliaksandr Valialkin
de59703a16 lib/promscrape/discovery/gce: properly set filter query arg in api url 2020-04-27 16:01:17 +03:00
Aliaksandr Valialkin
b4afe562c1 lib/storage: postpone reading data from blocks during search
This eliminates the need for storing block data into temporary files on a single-node VictoriaMetrics
during heavy queries, which touch big number of time series over long time ranges.

This improves single-node VM performance on heavy queries by up to 2x.
2020-04-27 11:45:24 +03:00
Aliaksandr Valialkin
0224071ebe lib/promscrape/discovery/gce: allow empty project and zone for gce_sd_config 2020-04-27 11:45:02 +03:00
Aliaksandr Valialkin
fcf57f9883 app/vmselect/netstorage: substitute sorting packedTimeseries with the natural order of the fetched blocks
This should minimize the number of disk seeks when reading data from temporary file.
2020-04-26 16:26:23 +03:00
Aliaksandr Valialkin
6954d0edb7 lib/promscrape/discovery/gce: allow empty zone arg in gce_sd_config - in this case zones for the given project are automatically discovered 2020-04-26 14:34:11 +03:00
kreedom
fb967ae6c8 happy fmt 2020-04-26 14:16:32 +03:00
kreedom
2c18548e08 alert - rename validate function and flags (#440)
* alert - rename validate function and flags
2020-04-26 14:15:04 +03:00
kreedom
5f61d43db9 vmalert - validate template in labels (#439) 2020-04-26 13:53:57 +03:00
肖贝贝
eeadfccdc5 fix: fix vmalert template label not complete bug (#435)
Co-authored-by: xiaobeibei <xiaobeibei@bigo.sg>
2020-04-26 13:30:10 +03:00
Aliaksandr Valialkin
d7c1ff8b0c lib/storage: improve deduplication algorithm
Now it leaves only the first data point on each `-dedup.minScrapeInterval` interval.

Previously it may leave two data points on the interval. This could lead to unexpected results
for `histogram_quantile(phi, sum(rate(buckets)) by (le))` query.
2020-04-26 13:10:02 +03:00
Aliaksandr Valialkin
1f3fd93b58 docs/{vmbackup,vmrestore}.md: update -help output 2020-04-24 22:44:21 +03:00
Jason Gardner
66af7e40f3 app/vmbackup: added ability to create and delete snapshots during backup (#428)
* app/vmbackup: added ability to create and delete snapshots during backup

Resolves: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/422

* Add snapshot create and delete url flags

* Fixed errcheck warnings in build
2020-04-24 22:35:03 +03:00
Aliaksandr Valialkin
491b31b369 lib/storage: postpone label filters matching too many time series instead of giving up with error
This should reduce the frequency of the following errors:

    cannot find tag filter matching less than N time series; either increase -search.maxUniqueTimeseries or use more specific tag filters

    more than N time series found on the time range [...]; either increase -search.maxUniqueTimeseries or shrink the time range
2020-04-24 21:13:50 +03:00
Aliaksandr Valialkin
4b84c592e9 docs/Single-server-VictoriaMetrics.md: document -search.resetCacheAuthKey 2020-04-24 19:47:52 +03:00
Aliaksandr Valialkin
a596aec82c app/vmselect: fix description for -search.resetCacheAuthKey 2020-04-24 19:45:50 +03:00
Aliaksandr Valialkin
7b8008e0bd lib/promscrape/discovery/gce: make golint happy by ignoring resp.Body.Close() result 2020-04-24 18:13:09 +03:00
Aliaksandr Valialkin
6d3567d65c .github/workflows: install dependencies before code checkout
Otherwise dependencies' install mangles go.mod
2020-04-24 17:55:17 +03:00
Aliaksandr Valialkin
9ef5935552 lib/promscrape: initial implementation for gce_sd_configs aka Prometheus-compatible service discovery for Google Compute Engine 2020-04-24 17:51:22 +03:00
Aliaksandr Valialkin
b80e6b4d56 .github/workflows: enable Go modules when installing dependencies
Disabled Go modules broke golangci-lint build
2020-04-24 17:39:58 +03:00
Aliaksandr Valialkin
5f9c23226a docs/Single-server-VictoriaMetrics.md: mention that -search.maxStalenessInterval can be useful for InfluxDB and TimescaleDB users 2020-04-24 16:22:50 +03:00
Aliaksandr Valialkin
ac43075cc9 .github/workflows: install golangci-lint at Dependencies step 2020-04-24 15:37:35 +03:00
Aliaksandr Valialkin
3157fb0186 .github/workflows: update Go version in actions/setup-go from v1.13 to v1.14 2020-04-24 15:31:16 +03:00
Aliaksandr Valialkin
e48822942d vendor: make vendor-update 2020-04-24 15:27:45 +03:00
Aliaksandr Valialkin
77bea69fab .github/workflows: use master branch for 'actions/setup-go' and 'actions/checkout' 2020-04-24 14:41:21 +03:00
Aliaksandr Valialkin
24461153bf lib/promscrape: query /api/v1/namespaces/* for the configured namespaces in kubernetes_sd_config
This should fix authorization issues described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/432
2020-04-24 14:33:50 +03:00
Aliaksandr Valialkin
00e897119f lib/promscrape: add -promscrape.configCheckInterval command-line flag for automating config checking
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/431
2020-04-23 23:41:08 +03:00
Aliaksandr Valialkin
a9a7a7175e lib/promscrape: access Config entries by reference, so they can be compared by addresses 2020-04-23 14:38:20 +03:00
Aliaksandr Valialkin
a9b83bf512 vendor: update google.golang.org/api from v0.21.0 to v0.22.0 2020-04-23 14:30:46 +03:00
Aliaksandr Valialkin
a87ca3bdf0 vendor: update github.com/aws/aws-sdk-go from v1.30.8 to v1.30.12 2020-04-23 12:36:03 +03:00
Aliaksandr Valialkin
1c5d14a2eb lib/promscrape: move KubernetesSDConfig to lib/promscrape/discovery/kubernetes 2020-04-23 11:34:22 +03:00
Aliaksandr Valialkin
a714568374 lib/promscrape/discovery/kubernetes: hide role switch logic behind GetLabels function 2020-04-22 22:16:11 +03:00
Aliaksandr Valialkin
364db13c9c app/vmselect: add /api/v1/status/tsdb page with useful stats for locating root cause for high cardinality issues
See https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/268
2020-04-22 22:03:43 +03:00
Aliaksandr Valialkin
01e33be34a vendor: update github.com/valyala/fastjson from v1.5.0 to v1.5.1 2020-04-21 00:03:56 +03:00
Aliaksandr Valialkin
78ff5f2aa5 vendor: update github.com/valyala/gozstd from v1.6.4 to v1.7.0 2020-04-20 23:03:40 +03:00
Aliaksandr Valialkin
2dc5593b75 lib/writeconcurrencylimiter: improve docs for -maxConcurrentInserts command-line flag 2020-04-20 21:03:00 +03:00
Aliaksandr Valialkin
9ebc937685 app/vmselect: add -search.minStalenessInterval command-line flag for removing gaps on graphs built from time series with irregular duration between samples
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/426
2020-04-20 19:42:15 +03:00
Aliaksandr Valialkin
fe57d46687 app/vmselect: merge -search.maxLookback and -search.maxStalenessInterval flags, since it turned out they have identical purpose :(
Leave both flags for backwards compatibility reasons.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/209
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/426
2020-04-20 19:26:31 +03:00
Aliaksandr Valialkin
6cc6ec6d2e deployment/docker/docker-compose.yml: bump Prometheus from v2.17.1 to v2.17.2 and Grafana from v6.7.1 to v6.7.2 2020-04-20 17:29:20 +03:00
Aliaksandr Valialkin
5454b518a6 lib/promscrape/discovery/kubernetes: reuse a client for empty api_server inside different jobs 2020-04-20 17:07:11 +03:00
Aliaksandr Valialkin
5ecb50d7c2 docs/Single-server-VictoriaMetrics.md: mention about vmagent in the end of Prometheus setup section 2020-04-20 16:41:36 +03:00
Aliaksandr Valialkin
851946af1e deployment/docker: allow building docker images on top of any base image set via ROOT_IMAGE environment var
For example, the following command will build VictoriaMetrics docker image on top of alpine image:

    ROOT_IMAGE=alpine make package-victoria-metrics
2020-04-20 01:16:57 +03:00
Aliaksandr Valialkin
2de76bca96 deployment/docker/base: remove unused group and passwd files 2020-04-19 23:31:31 +03:00
Aliaksandr Valialkin
94ad531bfe Makefile: increase the timeout for make golangci-lint from 1 minute to 2 minutes
This should fix timeout errors on GitHub actions
2020-04-17 19:14:04 +03:00
Aliaksandr Valialkin
936fb0eac3 app/vmagent/remotewrite: retry sending data if the server closes keep-alive connection
This should fix the following error when sending data to remote storage:

couldn't send a block with size XX bytes to "YYY": the server closed connection before returning the first response byte. Make sure the server returns 'Connection: close' response header before closing the connection
2020-04-17 15:52:42 +03:00
Aliaksandr Valialkin
43375df923 lib/promscrape/discovery/kubernetes: update stale comments 2020-04-17 14:06:20 +03:00
Aliaksandr Valialkin
43bbffebb3 vendor: make vendor-update 2020-04-17 13:24:08 +03:00
Aliaksandr Valialkin
79fb595732 docs/vmagent.md: typo fix: unvailable -> unavailable 2020-04-17 13:11:31 +03:00
Aliaksandr Valialkin
546d26523c app/vmagent/README.md: mention about promscrape.suppressScrapeErrors 2020-04-17 13:08:21 +03:00
Aliaksandr Valialkin
f41e6a7bd9 app/vmselect: properly apply -search.maxLookback to queries sent to /api/v1/query 2020-04-17 12:30:11 +03:00
Dmitry Shihovtsev
830538e290 Fix misspelled Cortex name in the FAQ (#421) 2020-04-17 08:36:12 +01:00
Aliaksandr Valialkin
5d1537a395 lib/promscrape: suppress scrape errors if -promscrape.suppressScrapeErrors flag is set 2020-04-16 23:41:30 +03:00
Aliaksandr Valialkin
600490131f lib/promscrape: print all the labels for the target on error message for failed scrape
This should improve debuggability.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/420
2020-04-16 23:35:05 +03:00
Aliaksandr Valialkin
bd4c6d21dd lib/promscrape: retry target scraping when the target closes previously established keep-alive connection to it
This should fix the following error:

the server closed connection before returning the first response byte. Make sure the server returns 'Connection: close' response header before closing the connection
2020-04-16 23:25:29 +03:00
Aliaksandr Valialkin
95da8d410c docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics supports Kubernetes service discovery 2020-04-16 18:40:11 +03:00
Aliaksandr Valialkin
bcec5c5429 docs/Single-server-VictoriaMetrics.md: typo fix: unneded -> unneeded 2020-04-16 17:35:08 +03:00
Aliaksandr Valialkin
467279acd2 docs/Single-server-VictoriaMetrics.md: improve docs about metrics deletion 2020-04-16 17:32:09 +03:00
Aliaksandr Valialkin
e0d213f82b docs/Single-server-VictoriaMetrics.md: mention that the delete API can be protected by authKey 2020-04-16 17:19:10 +03:00
Aliaksandr Valialkin
2fd2dec5eb lib/logger: typo fix 2020-04-16 00:19:10 +03:00
Aliaksandr Valialkin
071fdf5518 lib/logger: add WARN level for logging expected errors such as invalid user queries 2020-04-15 20:50:26 +03:00
Aliaksandr Valialkin
30b401ebbf docs/Single-server-VictoriaMetrics.md: typo fix 2020-04-15 15:21:58 +03:00
Aliaksandr Valialkin
a59a7bcc5e vendor: make vendor-update 2020-04-15 14:52:24 +03:00
Aliaksandr Valialkin
ccb887c0f6 docs/Single-server-VictoriaMetrics.md: clarify how to use -influxListenAddr command-line option 2020-04-15 12:33:42 +03:00
Aliaksandr Valialkin
6f7f64f757 app/vmselect: handle timestamp(metric offset X) the same way as Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/415
2020-04-15 12:01:00 +03:00
Aliaksandr Valialkin
426a0567c4 lib/promscrape: code cleanup in runScraper func 2020-04-15 11:36:24 +03:00
Aliaksandr Valialkin
6e2f6574b8 docs/Single-server-VictoriaMetrics.md: mention that backfilling can be done via any supported ingestion method 2020-04-15 10:56:53 +03:00
Aliaksandr Valialkin
c1de3f67b4 lib/storage: skip metricID if the corresponding metricID->metricName is missing in inverted index during search
This case is possible when the corresponding metricID->metricName entry didn't propagate to inverted index yet.

This should fix the following error:

error when searching tsids for tfss [...]: cannot find metricName by metricID 1582417212213420669: EOF
2020-04-15 00:06:43 +03:00
Aliaksandr Valialkin
8a25c1ed71 docs/Single-server-VictoriaMetrics.md: add https://github.com/Slapper/ansible-victoriametrics-cluster-role to integrations chapter 2020-04-14 16:27:20 +03:00
Aliaksandr Valialkin
067c7afebc lib/promscrape: show information on improperly configured scrape targets at the bottom of /targets page
This is a common error with improperly configured target autodiscovery and/or relabeling.
This error leads to duplicate scraping of the same targets with the same set of labels, which leads
to duplicate samples in time series.
2020-04-14 14:55:05 +03:00
Aliaksandr Valialkin
ac35635b71 lib/promscrape/discovery/kubernetes: remove only unused client for API server during cleaning 2020-04-14 14:19:21 +03:00
Aliaksandr Valialkin
78863d7066 lib/promscrape: add promrelabel.GetLabelValueByName helper function 2020-04-14 14:12:01 +03:00
Aliaksandr Valialkin
c64f003cfb lib/promscrape: mention job name in error messages when target cannot be scraped
This should improve debuggability
2020-04-14 13:33:13 +03:00
Aliaksandr Valialkin
4718a5d951 lib/promscrape: reset ScrapeWork.ID in tests 2020-04-14 13:31:31 +03:00
Aliaksandr Valialkin
257521a634 lib/promscrape: properly expose statuses for targets with duplicate scrape urls at /targets page
Previously targets with duplicate scrape urls were merged into a single line on the page.
Now each target with duplicate scrape url is displayed on a separate line.
2020-04-14 13:10:01 +03:00
Aliaksandr Valialkin
6a75c95194 lib/promscrape: remove labels starting with __meta_ after applying relabel_configs as Prometheus does
This should reduce CPU load during scraping when target discovery generates
big number of `__meta_*` labels (for instance, k8s discovery).

See https://www.robustperception.io/life-of-a-label for details.
2020-04-14 12:23:22 +03:00
Aliaksandr Valialkin
01d7d799dc lib/promscrape: rename 'scrape_config->scrape_limit' to 'scrape_config->sample_limit'
`scrape_config` block from Prometheus config contains `sample_limit` field,
while in `vmagent` this field was mistakenly named as `scrape_limit`.
2020-04-14 11:59:57 +03:00
Aliaksandr Valialkin
0b76c27fa1 docs/vmagent.md: mention that vmagent supports kubernetes_sd_configs now 2020-04-13 21:06:36 +03:00
Aliaksandr Valialkin
2e4e202c2b lib/promscrape: add initial support for kubernetes_sd_config
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/334
2020-04-13 21:03:28 +03:00
Aliaksandr Valialkin
2814b1490f lib/promscrape: add -promscrape.config.strictParse flag for detecting errors in -promscrape.config file 2020-04-13 13:15:44 +03:00
Aliaksandr Valialkin
90b4a6dd12 lib/promscrape: extract common auth code to lib/promauth 2020-04-13 12:59:10 +03:00
hagen1778
2eed6c393f vmalert: prepare package for external usage
* update README according to changes
* add Makefile with basic commands
2020-04-12 15:32:42 +03:00
kreedom
948f8b6b5f [vmalert] fix linter issues 2020-04-12 15:08:11 +03:00
kreedom
8fca5f2819 [vmalert] add tests to webserver (#413) 2020-04-12 14:51:03 +03:00
Roman Khavronenko
7c9405f53d Vmalert metrics (#412)
vmalert: add basic list of metrics
2020-04-11 20:42:01 +01:00
Roman Khavronenko
9f8cc8ae1b Extend web responses for alerts: (#411)
vmalert: Extend web responses for alerts

* populate apiAlert object with additional fields
* return all active alerts, not only firing
* sort list of API alerts for deterministic output
* add helper for available path list
2020-04-11 16:49:23 +01:00
kreedom
90de3086b3 [vmalert] add webserver (#410)
* [vmalert] add webserver
2020-04-11 12:40:24 +03:00
Aliaksandr Valialkin
830d5fb1e0 vendor: make vendor-update 2020-04-10 18:40:21 +03:00
Aliaksandr Valialkin
66d8086a5e vendor: update github.com/klauspost/compress from v1.10.3 to v1.10.4 2020-04-10 18:39:19 +03:00
Aliaksandr Valialkin
a30c98c0bc deployment/docker: update Go builder image from go1.14.1 to go1.14.2 2020-04-10 18:19:34 +03:00
Aliaksandr Valialkin
4de6c6bbf0 lib/storage: disable deduplication after dedup tests are complete
The rest of tests expect that the de-duplication is disabled.
2020-04-10 17:28:31 +03:00
Aliaksandr Valialkin
ded0c0d3c7 lib/storage: correctly handle -dedup.minScrapeInterval values smaller than 8ms
Such small values may be used for removing samples with duplicate timestamps.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/409 for details.
2020-04-10 16:36:41 +03:00
Aliaksandr Valialkin
7d73623c69 lib/{storage,mergeset}: make sure that requests and misses cache counters never go down 2020-04-10 14:45:01 +03:00
Aliaksandr Valialkin
e62afc7366 lib/protoparser: add -*TrimTimestamp command-line flags for Influx, Graphite, OpenTSDB and CSV data
These flags can be used for reducing disk space usage for timestamps data ingested over the given protocols
2020-04-10 12:44:39 +03:00
Aliaksandr Valialkin
0681b4c27a lib/workingsetcache: accumulate stat counters on cache rotation
This should prevent from cache stats counters going down after cache rotation,
which may corrupt `cache hit ratio` graph on the official Grafana dashboards
when using the following query:

    1 - (sum(rate(vm_cache_misses_total[5m])) by (type) / sum(rate(vm_cache_requests_total[5m])) by (type))
2020-04-10 11:51:40 +03:00
Aliaksandr Valialkin
f86947d55c lib/memory: add more details to -memory.allowedPercent help message 2020-04-09 15:28:53 +03:00
Aliaksandr Valialkin
f94a090020 docs: update minimum supported Go version from 1.12 to 1.13 2020-04-07 13:38:37 +03:00
Aliaksandr Valialkin
8064775c02 docs/CaseStudies.md: updated ARNES numbers 2020-04-06 16:20:11 +03:00
Aliaksandr Valialkin
520a704606 docs/CaseStudies.md: prettifying of the formatting 2020-04-06 15:24:37 +03:00
Aliaksandr Valialkin
105f0c78d9 docs/CaseStudies.md: add ARNES case study 2020-04-06 15:17:33 +03:00
Roman Khavronenko
b099d84271 Vmalert/rules eval (#400)
* Initial rules evaluation support.

Rules now store alerts state in private field `alerts`. Every evaluation updates
the alerts and state. Every unique metric received from datastore represents a unique alert,
uniqueness is guaranteed by hashing ordered labelset.

* merge with master

* cleanup

* support endAt parameter as 3*evaluationInterval for active alerts

* make golint happy
2020-04-06 14:44:03 +03:00
Aliaksandr Valialkin
407bdbf2b9 docs/Single-server-VictoriaMetrics.md: cosmetic fixes in Importing CSV data chapter 2020-04-06 12:29:28 +03:00
Aliaksandr Valialkin
69962a7001 docs/FAQ.md: small fixes 2020-04-05 13:53:08 +03:00
Aliaksandr Valialkin
9f03548e55 docs/FAQ.md: add more articles about VictoriaMetrics performance 2020-04-05 13:48:03 +03:00
Aliaksandr Valialkin
022310f35b docs/Articles.md: added a link to https://www.iunera.com/kraken/fabric/time-series-database/ 2020-04-04 16:40:00 +03:00
Aliaksandr Valialkin
895cadfae7 app/vmagent/remotewrite: add "X-Prometheus-Remote-Write-Version: 0.1.0" http header to remote_write request
This header is required by Cortex (and, probably, other remote storage systems).
See 9c1f44d090/docs/apis.md (remote-api) .

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/399
2020-04-04 16:24:56 +03:00
Aliaksandr Valialkin
57704aa584 app/victoria-metrics: add -selfScrapeInstance and -selfScrapeJob flags for tuning labels for self-scraped metrics 2020-04-04 14:57:22 +03:00
Aliaksandr Valialkin
f9b24d4899 app/vmselect/promql: keep metric name after applying first_over_time and last_over_time functions 2020-04-04 14:54:13 +03:00
Aliaksandr Valialkin
fa0554b771 docs/Articles.md: move Percona article to third-party 2020-04-02 15:43:02 +03:00
Aliaksandr Valialkin
35b133bff4 docs/Articles.md: add a link to https://blog.cloudera.com/benchmarking-time-series-workloads-on-apache-kudu-using-tsbs/ 2020-04-02 15:41:09 +03:00
Aliaksandr Valialkin
a884803377 docs/CaseStudies.md: add Adsterra case 2020-04-02 00:49:16 +03:00
Aliaksandr Valialkin
b38d048dd9 app/vmstorage: add vm_free_disk_space_bytes metric for monitoring the remaining disk space at -storageDataPath 2020-04-01 23:08:58 +03:00
Aliaksandr Valialkin
de2cd4231b docs/Single-server-VictoriaMetrics.md: re-organize chapters 2020-04-01 22:38:56 +03:00
kreedom
298eb0a0f8 [vmalert] improve external url handling 2020-04-01 22:29:11 +03:00
kreedom
12fe915b48 [vmalert] add prometheus template function (#396)
* [vmalert] add prometheus template function

* make linter be happy

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-04-01 18:17:53 +03:00
Aliaksandr Valialkin
cdf0a4cf8f lib/httpserver: remove unnecessary http.HandlerFunc wrapper in gzipHandler 2020-04-01 18:14:17 +03:00
Aliaksandr Valialkin
1c9c57db1c docs/Cluster-VictoriaMetrics.md: small fixes and updates 2020-04-01 18:10:12 +03:00
Aliaksandr Valialkin
8edc72201d docs/Single-server-VictoriaMetrics.md: small fixes and updates 2020-04-01 18:09:07 +03:00
Aliaksandr Valialkin
b024ecd10c docs/Cluster-VictoriaMetrics.md: swap production build and development build chapters 2020-04-01 17:49:51 +03:00
Aliaksandr Valialkin
e0d0348f36 lib/storage: add missing reset for tagFilter.matchesEmptyValue on tagFilter.Init 2020-04-01 17:42:44 +03:00
Aliaksandr Valialkin
3e55c7e069 lib/promscrape: reduce timestamp jitter when scraping targets
This should improve compression for timestamps
2020-04-01 16:11:35 +03:00
Aliaksandr Valialkin
c4acd20d2a lib/storage: remove duplicate data points on 7/8*minScrapeInterval interval instead of 1/2*minScrapeInterval
This should reduce storage usage and should improve deduplication accuracy
2020-04-01 15:48:48 +03:00
Aliaksandr Valialkin
8661dc4624 docs/Single-server-VictoriaMetrics.md: mention that environment vars may be prefixed with -envflag.prefix 2020-03-31 22:37:44 +03:00
Aliaksandr Valialkin
16572c8722 README.md: mention that response cache must be reset after importing historical data 2020-03-31 19:33:20 +03:00
Aliaksandr Valialkin
b699c46046 lib/storage: handle errors returned from TagFilters.Add when cloning TagFilters with negative filter 2020-03-31 16:18:02 +03:00
Aliaksandr Valialkin
e71519b8b2 app/victoria-metrics/testdata: add a test for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/395 2020-03-31 12:51:25 +03:00
Aliaksandr Valialkin
972713bd79 lib/storage: add fast path for the previous indexdb search if it doesn't contain per-day inverted index yet 2020-03-31 12:51:21 +03:00
Aliaksandr Valialkin
5d99ca6cfc lib/storage: optimize per-day inverted index search for tag filters matching big number of time series
- Sort tag filters in the ascending number of matching time series
  in order to apply the most specific filters first.
- Fall back to metricName search for filters matching big number of time series
  (usually these are negative filters or regexp filters).
2020-03-31 00:48:35 +03:00
Aliaksandr Valialkin
318326c309 lib/storage: properly handle {label=~"foo|"} filters as Prometheus does
Such filters must match all the time series with `label="foo"` plus all the time series without `label`

Previously only time series with `label="foo"` were matched.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/395
2020-03-31 00:48:18 +03:00
Aliaksandr Valialkin
a1e4c6a2be .github/workflows/wiki.yml: fix copying files from docs to wiki 2020-03-30 15:59:12 +03:00
Aliaksandr Valialkin
ac3ee44fa7 docs/robots.txt: trigger github actions 2020-03-30 15:54:39 +03:00
Aliaksandr Valialkin
b98ca56d94 lib/envflag: add -envflag.prefix for setting optional prefix for environment vars 2020-03-30 15:51:19 +03:00
Aliaksandr Valialkin
b41ee5f27d vendor: make vendor-update 2020-03-30 15:06:35 +03:00
Aliaksandr Valialkin
8d35af6fdb .github/workflows: copy all the files from docs folder to wiki and github pages 2020-03-30 15:05:37 +03:00
Aliaksandr Valialkin
0f2dd77a76 go.mod: update the minimum required Go version from go1.12 to go1.13 2020-03-30 14:56:57 +03:00
Aliaksandr Valialkin
0c485f14d1 app/vmselect/prometheus: allow passing relative time to start, end and time args of /api/v1/* queries 2020-03-29 21:57:14 +03:00
Aliaksandr Valialkin
2ebf7d86ff app/vmselect/prometheus: code simplification: (d.Seconds()/1e3) -> d.Milliseconds() 2020-03-29 21:50:28 +03:00
kreedom
bf6c24d0f4 [vmalert] config parser (#393)
* [vmalert] config parser

* make linter be happy

* fix test

* fix sprintf add test for rule validation
2020-03-29 01:48:30 +02:00
Aliaksandr Valialkin
1f7292675a docs: add robots.txt 2020-03-28 23:22:46 +02:00
Aliaksandr Valialkin
bd156cd088 docs/vmagent.md: add prometheus remote_write proxy use case 2020-03-28 23:16:38 +02:00
Aliaksandr Valialkin
b695087119 docs/CaseStudies.md: add Brandwatch case study 2020-03-28 20:57:54 +02:00
Aliaksandr Valialkin
80f53e5396 deployment/docker: run docker apps under default user (0, root) in order to preserve backwards compatibility
If a docker app is upgraded from root to non-root, then the data pointed to by `-storageDataPath` or similar flags
becomes inaccessible to the non-root user after the upgrade. This breaks upgrade path. So revert back to default root user
for docker apps.

Users may explicitly execute `docker run --user <non_root_user>` for running docker apps under non-root user.
2020-03-28 19:23:26 +02:00
Roman Khavronenko
7acb797595 Update dashboard according to new Grafana version. (#390)
The way the regex for column style in the Table panel is applied has changed in Grafana version 6.7. This change is supposed to fix the Flags panel column styles accordingly.
2020-03-28 01:24:39 +02:00
Roman Khavronenko
3a8bbfd6b9 bump Prometheus and Grafana images (#389) 2020-03-28 01:15:07 +02:00
Dmitry Naumov
27373807c1 Rootless docker images by default (#358)
* Rootless docker images by default

* Migrate to rootless base image

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-03-27 21:23:50 +02:00
Aliaksandr Valialkin
8d7f0aa632 vendor: make vendor-update 2020-03-27 21:23:30 +02:00
Aliaksandr Valialkin
149f365f74 lib/httpserver: add -http.maxGracefulShutdownDuration command-line flag for tuning the maximum duration required for graceful shutdown of http server 2020-03-27 21:23:30 +02:00
kreedom
b22da547a2 [vmalert] - parse template annotations (#387)
* [vmalert] - parse template annotations
2020-03-27 18:31:16 +02:00
Aliaksandr Valialkin
047849e855 lib/uint64set: remove zero buckets after Set.Intersect 2020-03-27 01:15:58 +02:00
Aliaksandr Valialkin
f3ec424e7d lib/uint64set: small code cleanup and perf tuning
* Remember the last accessed bucket on Has() call.
* Inline fast paths inside Add() and Has() calls.
* Remove fragile code with maxUnsortedBuckets inside bucket32.
2020-03-25 15:30:25 +02:00
Aliaksandr Valialkin
ef8aee8a2d deployment/docker: update Go builder from Go1.14.0 to Go1.14.1 2020-03-24 22:35:26 +02:00
Aliaksandr Valialkin
dde4a97534 lib/uint64set: go fmt 2020-03-24 22:28:43 +02:00
Aliaksandr Valialkin
f3e0c55ea1 lib/storage: serialize snapshot creation process with mutex
This guarantees that the snapshot contains all the recently added data
from inmemory buffers when multiple concurrent calls to Storage.CreateSnapshot are performed.
2020-03-24 22:27:05 +02:00
Aliaksandr Valialkin
97fb0edd07 lib/uint64set: added more tests 2020-03-24 22:27:04 +02:00
Aliaksandr Valialkin
25f585ecf2 docs/CaseStudies.md: added a case study from MHI Vestas Offshore Wind 2020-03-14 13:22:12 +02:00
Aliaksandr Valialkin
df91d2d91f lib/storage: remove obsolete code 2020-03-13 22:48:17 +02:00
Aliaksandr Valialkin
3c7c71a49c app/vmselect: adjust label_map() handling for corner cases
The following corner cases now supported:
* label_map(q, "label", "", "foo") - adds `label="foo"` to series with missing `label`
* label_map(q, "label", "foo", "") - removes `label="foo"` from series

All the unmatched labels are kept unchanged.
2020-03-13 18:45:03 +02:00
Aliaksandr Valialkin
69f1470692 vendor: update github.com/VictoriaMetrics/metrics from v1.11.0 to v1.11.2
This fixes data race in Histogram
2020-03-13 12:39:57 +02:00
Aliaksandr Valialkin
4fc4912f0c app/vmalert/datasource: typo fix in docs: Labels -> Label 2020-03-13 12:22:33 +02:00
kreedom
a746cb62b6 vmalert add vm datasource, change alertmanager (#364)
* vmalert add vm datasource, change alertmanager

* make linter be happy

* make linter be happy.2

* PR comments

* PR comments.1
2020-03-13 12:19:31 +02:00
Aliaksandr Valialkin
499594f421 lib/promscrape: allow overriding external_labels as Prometheus does
Prometheus docs at https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config say:

> In communication with external systems, they are always applied only
> when a time series does not have a given label yet and are ignored otherwise.

Though this may result in consistency chaos when scrape targets override `external_labels`,
let's stick with Prometheus behavior for the sake of backwards compatibility.

There is a last resort in vmagent with `-remoteWrite.label`, which consistently
sets the configured labels to all the metrics before sending them to remote storage.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/366
2020-03-12 20:24:42 +02:00
Aliaksandr Valialkin
fdc2a9d1d7 app/vmselect: add label_map(q, label, srcValue1, dstValue1, ... srcValueN, dstValueN) function to MetricsQL
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/369
2020-03-12 19:13:47 +02:00
Aliaksandr Valialkin
92d67e2592 vendor: update google.golang.org/genproto from fc8f55426688 to da6875a35672 2020-03-12 18:11:33 +02:00
Aliaksandr Valialkin
8a853778d7 vendor: update golang.org/x/tools from 26f6a1b6802d to 5e2df02acb1e 2020-03-12 18:07:52 +02:00
Aliaksandr Valialkin
8d75a5dbd0 vendor: update github.com/aws/aws-sdk-go from v1.29.10 to v1.29.22 2020-03-12 17:54:58 +02:00
Aliaksandr Valialkin
cdd6171af1 vendor: update google.golang.org/api from v0.19.0 to v0.20.0 2020-03-12 17:51:49 +02:00
Aliaksandr Valialkin
cc183bc899 vendor: update golang.org/x/sys from d5e6a3e2c0ae to 5c8b2ff67527 2020-03-12 17:46:24 +02:00
Aliaksandr Valialkin
3935038e20 vendor: update github.com/klauspost/compress from v1.10.1 to v1.10.3 2020-03-12 17:32:24 +02:00
Aliaksandr Valialkin
c8dc1cd218 lib/protoparser/csvimport: add missing metric vm_rows_invalid_total{type="csvimport"} 2020-03-12 15:27:45 +02:00
Aliaksandr Valialkin
c1551a3269 README.md: mention about alternative dashboard for cluster version - https://grafana.com/grafana/dashboards/11831 2020-03-12 15:10:14 +02:00
Aliaksandr Valialkin
8023ad7dbd app/vmselect: add -search.maxStalenessInterval for tuning Prometheus data model closer to Influx-style data model 2020-03-11 16:43:34 +02:00
Aliaksandr Valialkin
d4beb17ebe lib/promscrape: remove possible races when registering and de-registering scrape workers for /targets page 2020-03-11 16:30:21 +02:00
Aliaksandr Valialkin
fcd91795d5 app/vmagent: mention that vmagent can filter data 2020-03-11 16:22:39 +02:00
Aliaksandr Valialkin
650830db79 docs/Articles.md: add a link to https://stas.starikevich.com/posts/disk-usage-for-vm-versus-prometheus/ 2020-03-11 04:56:16 +02:00
Aliaksandr Valialkin
cdf70b7944 lib/promscrape: consistently update /targets page after SIGHUP 2020-03-11 03:20:03 +02:00
Aliaksandr Valialkin
301c2acd61 app/vmstorage: return 500 status code instead of 200 status code on internal errors inside /snapshot/* handlers 2020-03-10 23:51:55 +02:00
Aliaksandr Valialkin
61d0ee857c docs/vmagent.md: sync with app/vmagent/README.md 2020-03-10 21:54:04 +02:00
Aliaksandr Valialkin
e17702fada app/vmselect: add optional max_rows_per_line query arg to /api/v1/export
This arg allows limiting the number of data points that may be exported on a single line.
2020-03-10 21:45:56 +02:00
Aliaksandr Valialkin
1fe66fb3cc app/{vmagent,vminsert}: add support for importing csv data via /api/v1/import/csv 2020-03-10 21:15:35 +02:00
Aliaksandr Valialkin
49d7cb1a3f all: fix golangci-lint issues 2020-03-10 19:41:46 +02:00
Aliaksandr Valialkin
8d3869cd99 docs/FAQ.md: actualize answer about deduplication 2020-03-09 13:37:12 +02:00
Aliaksandr Valialkin
9d89b08cb5 docs: add missing vmagent.png, which is used in vmagent.md 2020-03-09 13:35:49 +02:00
Aliaksandr Valialkin
5fe38a84eb app/vmagent: properly apply -remoteWrite.sendTimeout to fasthttp.HostClient 2020-03-09 13:31:55 +02:00
Aliaksandr Valialkin
7c432da788 lib/promscrape: do not retry idempotent requests when scraping targets
This should prevent the following unexpected side effects of idempotent request retries:
- increased actual timeout when scraping the target compared to the configured scrape_timeout
- increased load on the target

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/357
2020-03-09 13:31:52 +02:00
Aliaksandr Valialkin
986dba5ab3 app/vmagent: do not allow non-supported fields in -remoteWrite.relabelConfig and file_sd_configs
This should reduce possible confusion like in https://github.com/VictoriaMetrics/VictoriaMetrics/issues/363
2020-03-06 20:19:13 +02:00
Aliaksandr Valialkin
c386c5de57 app/vmagent: properly add labels set via -remoteWrite.label to metrics before sending them to -remoteWrite.url 2020-03-06 19:26:58 +02:00
Artem Navoiev
58a3e59d59 bump version of codecov-action to v1.0.6 2020-03-05 23:25:13 +02:00
Aliaksandr Valialkin
c5f894b361 Makefile: add build and test rules with enabled race detector. These rules have -race suffix
Fix also `unsafe pointer conversion` errors detected by Go1.14. See https://golang.org/doc/go1.14#compiler .
2020-03-05 12:03:38 +02:00
Aliaksandr Valialkin
9be64e34b4 docs/Articles.md: add a link to https://www.percona.com/blog/2020/02/28/better-prometheus-rate-function-with-victoriametrics/ 2020-03-04 20:05:26 +02:00
Aliaksandr Valialkin
e51a0a56f4 README.md: add a link to https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Articles 2020-03-04 20:05:18 +02:00
Aliaksandr Valialkin
754db0d22e app/vmagent/README.md: small fixes 2020-03-04 18:14:47 +02:00
Aliaksandr Valialkin
772312bf7b app/vmagent/README.md: typo fix 2020-03-04 18:05:09 +02:00
Aliaksandr Valialkin
871abfab7a app/vmagent/README.md: clarification 2020-03-04 18:03:48 +02:00
Aliaksandr Valialkin
007c591de8 app/vmagent/README.md: add iot and edge monitoring use case 2020-03-04 18:01:34 +02:00
Aliaksandr Valialkin
474a09c0f1 app/vmagent/README.md: add use cases section 2020-03-04 17:42:27 +02:00
Aliaksandr Valialkin
d58aa80e9b README.md: add a link to Synthesio case study 2020-03-04 14:18:19 +02:00
Aliaksandr Valialkin
ad927575b7 docs/CaseStudies: add Synthesio 2020-03-04 14:14:39 +02:00
Aliaksandr Valialkin
0b1e877a7d docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-03-03 21:39:05 +02:00
Aliaksandr Valialkin
0ba8ee6022 README.md: mention -search.cacheTimestampOffset in Backfilling section 2020-03-03 21:38:39 +02:00
Aliaksandr Valialkin
9a944fd169 lib/promscrape: consistency renaming: stopCh -> globalStopCh 2020-03-03 20:08:08 +02:00
Aliaksandr Valialkin
032c88561b app/vminsert/prompush: limit memory usage by pushing promscrape data in smaller blocks 2020-03-03 19:58:54 +02:00
Aliaksandr Valialkin
76036c1897 app/vmagent: add -remoteWrite.maxDiskUsagePerURL for limiting the maximum disk usage for each -remoteWrite.url buffer
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/352
2020-03-03 19:49:07 +02:00
Aliaksandr Valialkin
c31d640eb9 app/vmagent/remotewrite: do not reset empty relabelCtx 2020-03-03 15:01:03 +02:00
Aliaksandr Valialkin
02b55c72dc app/vmagent: add -remoteWrite.urlRelabelConfig for applying individual relabeling for each -remoteWrite.url
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/320
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/308
2020-03-03 13:12:16 +02:00
Aliaksandr Valialkin
1d7ab78b55 lib/protoparser/prometheus: allow trailing comma in tags list
The trailing comma is generated by cloudwatch exporter.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/350
2020-03-02 22:22:09 +02:00
Aliaksandr Valialkin
7d178a40bd app/vmselect/prometheus: do not add __name__!= filter when searching for all the matching metric names via /api/v1/label/__name__/values with non-empty label filter
This should reduce query time.
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/343
2020-02-28 23:35:55 +02:00
Aliaksandr Valialkin
43754ff420 README.md: put https://gitlab.com/optima_public/prometheus_oauth_proxy in third-party contributions section 2020-02-28 21:23:34 +02:00
Aliaksandr Valialkin
b785429ddb lib/protoparser: metrics renaming: vm_protoparser_<type>_* -> vm_protoparser_*{type="<type>"}
This should improve composability of these metrics in PromQL queries
2020-02-28 20:20:10 +02:00
Aliaksandr Valialkin
f9a584b5c1 app/vmagent/remotewrite: yet another typo fix 2020-02-28 20:05:55 +02:00
Aliaksandr Valialkin
e22fdc1073 lib/persistentqueue: reset chunk file when the persistent queue is empty 2020-02-28 20:05:53 +02:00
Aliaksandr Valialkin
b9b46cb8dc app/vmagent/remotewrite: typo fix 2020-02-28 19:03:16 +02:00
Aliaksandr Valialkin
db6f4e4af1 app/vmagent/remotewrite: limit memory usage when big scrape blocks are pushed to remote storage 2020-02-28 18:58:01 +02:00
Aliaksandr Valialkin
8cc88db38d docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-02-28 12:58:32 +02:00
Aliaksandr Valialkin
f3c28d2ae4 README.md: typo fix 2020-02-28 12:58:31 +02:00
Aliaksandr Valialkin
57528ca31c docs: add a doc for vmagent 2020-02-28 12:23:56 +02:00
Aliaksandr Valialkin
5701b2f7bb app/vmselect/prometheus: properly pass filter for labelName=__name__ in labelValuesWithMatches
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/343
2020-02-28 12:18:14 +02:00
Aliaksandr Valialkin
18af31a4c2 all: properly split vm_deduplicated_samples_total among cluster components
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/345
2020-02-27 23:48:07 +02:00
Aliaksandr Valialkin
6819db5686 lib/envflag: typo fix in docs to -envflag.enable: envoronment->environment 2020-02-27 21:47:58 +02:00
Aliaksandr Valialkin
63a88a619b deployment/docker: update Go builder from Go1.13.8 to Go1.14.0 2020-02-26 22:15:44 +02:00
Aliaksandr Valialkin
c458b521a2 app/vmagent: allow setting -httpListenAddr to empty string in order to disable listening for http requests
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/340
2020-02-26 20:58:11 +02:00
Aliaksandr Valialkin
b459919250 make vendor-update 2020-02-26 20:45:27 +02:00
Aliaksandr Valialkin
cc5fe0b315 vendor: update github.com/VictoriaMetrics/metrics from v1.10.1 to v1.11.0 2020-02-26 20:41:02 +02:00
Aliaksandr Valialkin
117c76311c app/vmagent/README.md: list service discovery mechanisms, which will be added soon
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/334
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/330
2020-02-26 19:27:08 +02:00
Aliaksandr Valialkin
b63e4464f4 lib/promscrape: properly reload new configs on SIGHUP
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/335
2020-02-26 13:54:00 +02:00
Edouard Hur
3ad36134f6 Readme markdown linting (#338)
* fixed MD009/no-trailing-spaces

* fixed MD033/no-inline-html: Inline HTML

* fixed MD012/no-multiple-blanks

* fixed MD007/ul-indent

* fixed MD004/ul-style

* fixed MD031/blanks-around-fences

* fixed MD040/fenced-code-language

* fixed MD032/blanks-around-lists

* fixed MD026/no-trailing-punctuation
2020-02-26 13:21:19 +02:00
Edouard Hur
1f0007d0b1 Readme envvars (#332)
* add details about env vars config

* add env var to table of contents

* remove unnecessary words
2020-02-25 22:41:34 +02:00
Aliaksandr Valialkin
6739c2749d lib/promscrape: go fmt 2020-02-25 20:56:44 +02:00
Aliaksandr Valialkin
7a33da8fea lib/promscrape: do not add missing port to __address__ label in order to be consistent with Prometheus behavior
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/331
2020-02-25 20:49:50 +02:00
Aliaksandr Valialkin
be37d762cd app/vmagent: add -remoteWrite.maxBlockSize command-line flag for limiting the maximum size of unpacked block to send to remote storage 2020-02-25 19:57:47 +02:00
Aliaksandr Valialkin
4e24839a2c app/vmagent: do not allow sending unpacked requests with sizes exceeding -maxInsertRequestSize 2020-02-25 19:34:41 +02:00
Aliaksandr Valialkin
6386aeb1e0 app/vmagent: add ability to accept Influx line protocol data via TCP and UDP
Just set `-influxListenAddr` command-line flag

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/333
2020-02-25 19:12:49 +02:00
Aliaksandr Valialkin
e453880084 app/vmagent/README.md: mention that vmagent exposes target statuses at /targets page 2020-02-25 18:15:58 +02:00
Aliaksandr Valialkin
4c4448b66e app/vminsert: add /targets handler, which exposes Prometheus targets defined in -promscrape.config file 2020-02-25 18:13:11 +02:00
Aliaksandr Valialkin
7ef7c9368e lib/fs: typo fix: read blocks bigger than 8KB via pread() call instead of using mmap 2020-02-25 18:05:06 +02:00
871 changed files with 96543 additions and 132818 deletions

View File

@@ -2,7 +2,7 @@ name: github-pages
on:
push:
paths:
- 'docs/*.md'
- 'docs/*'
- 'README.md'
branches:
- master
@@ -17,14 +17,14 @@ jobs:
TOKEN: ${{secrets.CI_TOKEN}}
run: |
git clone https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.github.io.git gpages
cp docs/*.md gpages
cp docs/* gpages
cp README.md gpages
cd gpages
git config --local user.email "info@victoriametrics.com"
git config --local user.name "Vika"
git add "*.md"
git add .
git commit -m "update github pages"
remote_repo="https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.github.io.git"
git push "${remote_repo}"
cd ..
rm -rf gpages
rm -rf gpages

View File

@@ -14,18 +14,19 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@v1
uses: actions/setup-go@master
with:
go-version: 1.13
go-version: 1.14
id: go
- name: Code checkout
uses: actions/checkout@v1
- name: Dependencies
env:
GO111MODULE: off
GO111MODULE: on
run: |
go get -v golang.org/x/lint/golint
go get -u golang.org/x/lint/golint
go get -u github.com/kisielk/errcheck
go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
- name: Code checkout
uses: actions/checkout@master
- name: Build
env:
GO111MODULE: on
@@ -44,7 +45,7 @@ jobs:
GOOS=freebsd go build -mod=vendor ./app/victoria-metrics
GOOS=darwin go build -mod=vendor ./app/victoria-metrics
- name: Publish coverage
uses: codecov/codecov-action@v1.0.4
uses: codecov/codecov-action@v1.0.6
with:
token: ${{secrets.CODECOV_TOKEN}}
file: ./coverage.txt

View File

@@ -2,7 +2,7 @@ name: wiki
on:
push:
paths:
- 'docs/*.md'
- 'docs/*'
branches:
- master
jobs:
@@ -15,15 +15,14 @@ jobs:
env:
TOKEN: ${{secrets.CI_TOKEN}}
run: |
cd docs
git clone https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.wiki.git wiki
find ./ -name '*.md' -exec cp -prv '{}' 'wiki' ';'
cp docs/* wiki
cd wiki
git config --local user.email "info@victoriametrics.com"
git config --local user.name "Vika"
git add "*.md"
git add .
git commit -m "update wiki pages"
remote_repo="https://vika:${TOKEN}@github.com/VictoriaMetrics/VictoriaMetrics.wiki.git"
git push "${remote_repo}"
cd ..
rm -rf wiki
rm -rf wiki

View File

@@ -13,6 +13,8 @@ GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(shell date
all: \
victoria-metrics-prod \
vmagent-prod \
vmalert-prod \
vmauth-prod \
vmbackup-prod \
vmrestore-prod
@@ -25,17 +27,23 @@ clean:
publish: \
publish-victoria-metrics \
publish-vmagent \
publish-vmalert \
publish-vmauth \
publish-vmbackup \
publish-vmrestore
package: \
package-victoria-metrics \
package-vmagent \
package-vmalert \
package-vmauth \
package-vmbackup \
package-vmrestore
vmutils: \
vmagent \
vmalert \
vmauth \
vmbackup \
vmrestore
@@ -49,9 +57,11 @@ release-victoria-metrics: victoria-metrics-prod
release-vmutils: \
vmagent-prod \
vmalert-prod \
vmauth-prod \
vmbackup-prod \
vmrestore-prod
cd bin && tar czf vmutils-$(PKG_TAG).tar.gz vmagent-prod vmbackup-prod vmrestore-prod && \
cd bin && tar czf vmutils-$(PKG_TAG).tar.gz vmagent-prod vmalert-prod vmauth-prod vmbackup-prod vmrestore-prod && \
sha256sum vmutils-$(PKG_TAG).tar.gz > vmutils-$(PKG_TAG)_checksums.txt
pprof-cpu:
@@ -78,9 +88,10 @@ errcheck: install-errcheck
errcheck -exclude=errcheck_excludes.txt ./app/vmselect/...
errcheck -exclude=errcheck_excludes.txt ./app/vmstorage/...
errcheck -exclude=errcheck_excludes.txt ./app/vmagent/...
errcheck -exclude=errcheck_excludes.txt ./app/vmalert/...
errcheck -exclude=errcheck_excludes.txt ./app/vmauth/...
errcheck -exclude=errcheck_excludes.txt ./app/vmbackup/...
errcheck -exclude=errcheck_excludes.txt ./app/vmrestore/...
errcheck -exclude=errcheck_excludes.txt ./app/vmalert/...
install-errcheck:
which errcheck || GO111MODULE=off go get -u github.com/kisielk/errcheck
@@ -90,6 +101,9 @@ check-all: fmt vet lint errcheck golangci-lint
test:
GO111MODULE=on go test -mod=vendor ./lib/... ./app/...
test-race:
GO111MODULE=on go test -mod=vendor -race ./lib/... ./app/...
test-pure:
GO111MODULE=on CGO_ENABLED=0 go test -mod=vendor ./lib/... ./app/...
@@ -127,7 +141,7 @@ install-qtc:
golangci-lint: install-golangci-lint
golangci-lint run --exclude '(SA4003|SA1019):' -D errcheck -D structcheck
golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m
install-golangci-lint:
which golangci-lint || GO111MODULE=off go get -u github.com/golangci/golangci-lint/cmd/golangci-lint

521
README.md

File diff suppressed because it is too large Load Diff

View File

@@ -3,6 +3,9 @@
victoria-metrics:
APP_NAME=victoria-metrics $(MAKE) app-local
victoria-metrics-race:
APP_NAME=victoria-metrics RACE=-race $(MAKE) app-local
victoria-metrics-prod:
APP_NAME=victoria-metrics $(MAKE) app-via-docker

View File

@@ -1,8 +1,8 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG base_image
FROM $base_image
EXPOSE 8428
ENTRYPOINT ["/victoria-metrics-prod"]
ARG src_binary
COPY $src_binary ./victoria-metrics-prod
EXPOSE 8428
ENTRYPOINT ["/victoria-metrics-prod"]

View File

@@ -14,7 +14,11 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)
var selfScrapeInterval = flag.Duration("selfScrapeInterval", 0, "Interval for self-scraping own metrics at /metrics page")
var (
selfScrapeInterval = flag.Duration("selfScrapeInterval", 0, "Interval for self-scraping own metrics at /metrics page")
selfScrapeInstance = flag.String("selfScrapeInstance", "self", "Value for 'instance' label, which is added to self-scraped metrics")
selfScrapeJob = flag.String("selfScrapeJob", "victoria-metrics", "Value for 'job' label, which is added to self-scraped metrics")
)
var selfScraperStopCh chan struct{}
var selfScraperWG sync.WaitGroup
@@ -65,8 +69,8 @@ func selfScraper(scrapeInterval time.Duration) {
r := &rows.Rows[i]
labels = labels[:0]
labels = addLabel(labels, "", r.Metric)
labels = addLabel(labels, "job", "victoria-metrics")
labels = addLabel(labels, "instance", "self")
labels = addLabel(labels, "job", *selfScrapeJob)
labels = addLabel(labels, "instance", *selfScrapeInstance)
for j := range r.Tags {
t := &r.Tags[j]
labels = addLabel(labels, t.Key, t.Value)

View File

@@ -0,0 +1,16 @@
{
"name": "empty-label-match",
"issue": "https://github.com/VictoriaMetrics/VictoriaMetrics/issues/395",
"data": [
"empty_label_match 1 {TIME_S-1m}",
"empty_label_match;foo=bar 2 {TIME_S-1m}",
"empty_label_match;foo=baz 3 {TIME_S-1m}"],
"query": ["/api/v1/query_range?query=empty_label_match{foo=~'bar|'}&start={TIME_S}&end={TIME_S}&step=60"],
"result_query_range": {
"status":"success",
"data":{"resultType":"matrix",
"result":[
{"metric":{"__name__":"empty_label_match"},"values":[["{TIME_S}","1"]]},
{"metric":{"__name__":"empty_label_match","foo":"bar"},"values":[["{TIME_S}","2"]]}
]}}
}

View File

@@ -3,6 +3,9 @@
vmagent:
APP_NAME=vmagent $(MAKE) app-local
vmagent-race:
APP_NAME=vmagent RACE=-race $(MAKE) app-local
vmagent-prod:
APP_NAME=vmagent $(MAKE) app-via-docker

View File

@@ -1,7 +1,8 @@
## vmagent
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other Prometheus-compatible storage system that supports `remote_write` protocol.
<img alt="vmagent" src="vmagent.png">
@@ -18,17 +19,18 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
* Can be used as drop-in replacement for Prometheus for scraping targets such as [node_exporter](https://github.com/prometheus/node_exporter).
See [Quick Start](#quick-start) for details.
* Can add, remove and modify labels via Prometheus relabeling. See [these docs](#relabeling) for details.
* Can add, remove and modify labels (aka tags) via Prometheus relabeling. Can filter data before sending it to remote storage. See [these docs](#relabeling) for details.
* Accepts data via all the ingestion protocols supported by VictoriaMetrics:
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data).
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-opentsdb-compatible-agents).
* Prometheus remote write protocol via `http://<vmagent>:8429/api/v1/write`.
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data).
* Arbitrary CSV data via `http://<vmagent>:8429/api/v1/import/csv`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-csv-data).
* Can replicate collected metrics simultaneously to multiple remote storage systems.
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
to remote storage is recovered.
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
@@ -53,14 +55,67 @@ If you need collecting only Influx data, then the following command line would b
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
Then send Influx data to `http://vmagent-host:8429/write`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/).
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags).
Pass `-help` to `vmagent` in order to see the full list of supported command-line flags with their descriptions.
### How to collect metrics in Prometheus format?
### Use cases
#### IoT and Edge monitoring
`vmagent` can run and collect metrics in IoT and industrial networks with unreliable or scheduled connections to the remote storage.
It buffers the collected data in local files until the connection to remote storage becomes available and then sends the buffered
data to the remote storage. It re-tries sending the data to remote storage on any errors.
The maximum buffer size can be limited with `-remoteWrite.maxDiskUsagePerURL`.
`vmagent` works on various architectures from IoT world - 32-bit arm, 64-bit arm, ppc64, 386, amd64.
See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/Makefile) for details.
#### Drop-in replacement for Prometheus
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth compared to Prometheus for such setup.
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
#### Replication and high availability
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
If a single remote storage instance temporarily goes out of service, then the collected data remains available in the other remote storage instances.
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
#### Relabeling and filtering
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
it can remove unneeded samples via Prometheus-like relabeling before sending the collected data to remote storage.
See [these docs](#relabeling) for details.
#### Splitting data streams among multiple systems
`vmagent` supports splitting of the collected data among multiple destinations with the help of `-remoteWrite.urlRelabelConfig`,
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
#### Prometheus remote_write proxy
`vmagent` may be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via `remote_write` API
at `/api/v1/write` endpoint, apply relabeling and filtering and then proxy it to other `remote_write` systems.
The `vmagent` can be configured to encrypt the incoming `remote_write` requests with `-tls*` command-line flags.
Additionally, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
### How to collect metrics in Prometheus format
Pass the path to `prometheus.yml` to `-promscrape.config` command-line flag. `vmagent` takes into account the following
sections from [Prometheus config file](https://prometheus.io/docs/prometheus/latest/configuration/configuration/):
@@ -68,18 +123,31 @@ sections from [Prometheus config file](https://prometheus.io/docs/prometheus/lat
* `global`
* `scrape_configs`
All the other section are ignored, including [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section.
Use `-remoteWrite.*` command-line flags instead for configuring remote write settings:
* `-remoteWrite.url` for pointing to remote storage. Data to remote storage can be sent either via HTTP or HTTPS. See `-remoteWrite.tls*` flags for details.
* `-remoteWrite.label` for adding labels to metrics before sending them to remote storage.
* `-remoteWrite.relabelConfig` for applying relabeling to metrics before sending them to remote storage.
All the other sections are ignored, including [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section.
Use `-remoteWrite.*` command-line flags instead for configuring remote write settings.
The following scrape types in [scrape_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) section are supported:
* `static_configs` - for scraping statically defined targets. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config) for details.
* `file_sd_configs` - for scraping targets defined in external files aka file-based service discovery.
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details.
* `kubernetes_sd_configs` - for scraping targets in Kubernetes (k8s).
See [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config) for details.
* `ec2_sd_configs` - for scraping targets in Amazon EC2.
See [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) for details.
`vmagent` doesn't support `role_arn` config param yet.
* `gce_sd_configs` - for scraping targets in Google Compute Engine (GCE).
See [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) for details.
`vmagent` provides the following additional functionality for `gce_sd_config`:
* if `project` arg is missing, then `vmagent` uses the project for the instance where it runs;
* if `zone` arg is missing, then `vmagent` uses the zone for the instance where it runs;
* if `zone` arg equals to `"*"`, then `vmagent` discovers all the zones for the given project;
* `zone` may contain arbitrary number of zones, i.e. `zone: [us-east1-a, us-east1-b]`.
* `consul_sd_configs` - for scraping targets registered in Consul.
See [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config) for details.
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
@@ -97,15 +165,15 @@ Labels can be added to metrics via the following mechanisms:
`vmagent` supports [Prometheus relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config).
Additionally it provides the following extra actions:
* `replace_all`: replaces all the occurrences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`
* `labelmap_all`: replaces all the occurrences of `regex` in all the label names with the `replacement`
* `replace_all`: replaces all the occurrences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
* `labelmap_all`: replaces all the occurrences of `regex` in all the label names with the `replacement`.
The relabeling can be defined in the following places:
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to targets when parsing the file during `vmagent` startup
or during config reload after sending `SIGHUP` signal to `vmagent` via `kill -HUP`.
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to metrics after each scrape for the configured targets.
* At `-remoteWrite.relabelConfig` file. This relabeling is applied to all the collected metrics before sending them to `-remoteWrite.url`.
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels.
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to all the scraped metrics in the given `scrape_config`.
* At `-remoteWrite.relabelConfig` file. This relabeling is applied to all the collected metrics before sending them to remote storage.
* At `-remoteWrite.urlRelabelConfig` files. This relabeling is applied to metrics before sending them to the corresponding `-remoteWrite.url`.
Read more about relabeling in the following articles:
@@ -118,21 +186,26 @@ Read more about relabeling in the following articles:
### Monitoring
`vmagent` exports various metrics in Prometheus exposition format at `/metrics` page. It is recommended setting up regular scraping of this page
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. It is recommended setting up regular scraping of this page
either via `vmagent` itself or via Prometheus, so the exported metrics could be analyzed later.
`vmagent` also exports target statuses at `http://vmagent-host:8429/targets` page in plaintext format.
### Troubleshooting
* It is recommended increasing the maximum number of open files in the system (`ulimit -n`) when scraping big number of targets,
since `vmagent` establishes at least a single TCP connection per each target.
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`.
* It is recommended increasing `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
and `vmagent_remotewrite_pending_data_bytes` metric exported by `vmagent` at `/metrics` page constantly grows.
and `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
The directory can grow big when remote storage is unavailable during extended periods of time. If you don't want
sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
The directory can grow big when remote storage is unavailable during extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
### How to build from sources
@@ -142,7 +215,7 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmagent` from the root folder of the repository.
It builds `vmagent` binary and puts it into the `bin` folder.
@@ -157,3 +230,31 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmagent
```
### Profiling
`vmagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
* Memory profile. It can be collected with the following command:
```bash
curl -s http://<vmagent-host>:8429/debug/pprof/heap > mem.pprof
```
* CPU profile. It can be collected with the following command:
```bash
curl -s http://<vmagent-host>:8429/debug/pprof/profile > cpu.pprof
```
The command for collecting CPU profile waits for 30 seconds before returning.
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).

View File

@@ -0,0 +1,63 @@
package csvimport
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="csvimport"}`)
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="csvimport"}`)
)
// InsertHandler processes csv data from req.
func InsertHandler(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
})
}
// insertRows converts the parsed csv rows into a write request and hands it to remotewrite.Push.
//
// Every row is converted into a separate time series with a single sample.
// The metric name is stored in the `__name__` label, which is followed by the row tags.
// It always returns nil.
func insertRows(rows []parser.Row) error {
	ctx := common.GetPushCtx()
	defer common.PutPushCtx(ctx)

	// Re-use the buffers stored in ctx by truncating them to zero length.
	wr := &ctx.WriteRequest
	tss := wr.Timeseries[:0]
	lbls := ctx.Labels[:0]
	smpls := ctx.Samples[:0]
	for i := range rows {
		row := &rows[i]
		// Remember where labels for the current row start in the shared labels buffer.
		start := len(lbls)
		lbls = append(lbls, prompbmarshal.Label{
			Name:  "__name__",
			Value: row.Metric,
		})
		for j := range row.Tags {
			t := &row.Tags[j]
			lbls = append(lbls, prompbmarshal.Label{
				Name:  t.Key,
				Value: t.Value,
			})
		}
		smpls = append(smpls, prompbmarshal.Sample{
			Value:     row.Value,
			Timestamp: row.Timestamp,
		})
		// The time series references the labels and the sample that have just been appended.
		tss = append(tss, prompbmarshal.TimeSeries{
			Labels:  lbls[start:],
			Samples: smpls[len(smpls)-1:],
		})
	}

	// Store the buffers back into ctx, since append could re-allocate them.
	wr.Timeseries = tss
	ctx.Labels = lbls
	ctx.Samples = smpls
	remotewrite.Push(wr)

	n := len(rows)
	rowsInserted.Add(n)
	rowsPerInsert.Update(float64(n))
	return nil
}

View File

@@ -1,8 +1,8 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG base_image
FROM $base_image
EXPOSE 8429
ENTRYPOINT ["/vmagent-prod"]
ARG src_binary
COPY $src_binary ./vmagent-prod
EXPOSE 8429
ENTRYPOINT ["/vmagent-prod"]

View File

@@ -2,6 +2,7 @@ package influx
import (
"flag"
"io"
"net/http"
"runtime"
"sync"
@@ -25,12 +26,26 @@ var (
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="influx"}`)
)
// InsertHandler processes remote write for influx line protocol.
// InsertHandlerForReader processes remote write for influx line protocol.
//
// See https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener/
func InsertHandlerForReader(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, false, "", "", insertRows)
})
}
// InsertHandlerForHTTP processes remote write for influx line protocol.
//
// See https://github.com/influxdata/influxdb/blob/4cbdc197b8117fee648d62e2e5be75c6575352f0/tsdb/README.md
func InsertHandler(req *http.Request) error {
func InsertHandlerForHTTP(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
q := req.URL.Query()
precision := q.Get("precision")
// Read db tag from https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint
db := q.Get("db")
return parser.ParseStream(req.Body, isGzipped, precision, db, insertRows)
})
}

View File

@@ -7,6 +7,7 @@ import (
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdb"
@@ -18,6 +19,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
opentsdbhttpserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@@ -28,7 +30,10 @@ import (
)
var (
httpListenAddr = flag.String("httpListenAddr", ":8429", "TCP address to listen for http connections")
httpListenAddr = flag.String("httpListenAddr", ":8429", "TCP address to listen for http connections. "+
"Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. "+
"Note that /targets and /metrics pages aren't available if -httpListenAddr=''")
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty")
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpentTSDB metrics. "+
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
@@ -37,6 +42,7 @@ var (
)
var (
influxServer *influxserver.Server
graphiteServer *graphiteserver.Server
opentsdbServer *opentsdbserver.Server
opentsdbhttpServer *opentsdbhttpserver.Server
@@ -50,6 +56,9 @@ func main() {
startTime := time.Now()
remotewrite.Init()
writeconcurrencylimiter.Init()
if len(*influxListenAddr) > 0 {
influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
}
if len(*graphiteListenAddr) > 0 {
graphiteServer = graphiteserver.MustStart(*graphiteListenAddr, graphite.InsertHandler)
}
@@ -62,21 +71,28 @@ func main() {
promscrape.Init(remotewrite.Push)
go httpserver.Serve(*httpListenAddr, requestHandler)
if len(*httpListenAddr) > 0 {
go httpserver.Serve(*httpListenAddr, requestHandler)
}
logger.Infof("started vmagent in %.3f seconds", time.Since(startTime).Seconds())
sig := procutil.WaitForSigterm()
logger.Infof("received signal %s", sig)
logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
startTime = time.Now()
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop the webservice: %s", err)
if len(*httpListenAddr) > 0 {
logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop the webservice: %s", err)
}
logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds())
}
logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds())
promscrape.Stop()
if len(*influxListenAddr) > 0 {
influxServer.MustStop()
}
if len(*graphiteListenAddr) > 0 {
graphiteServer.MustStop()
}
@@ -103,10 +119,6 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
}
w.WriteHeader(http.StatusNoContent)
return true
case "/targets":
w.Header().Set("Content-Type", "text/plain")
promscrape.WriteHumanReadableTargetsStatus(w)
return true
case "/api/v1/import":
vmimportRequests.Inc()
if err := vmimport.InsertHandler(r); err != nil {
@@ -116,9 +128,18 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import/csv":
csvimportRequests.Inc()
if err := csvimport.InsertHandler(r); err != nil {
csvimportErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/write", "/api/v2/write":
influxWriteRequests.Inc()
if err := influx.InsertHandler(r); err != nil {
if err := influx.InsertHandlerForHTTP(r); err != nil {
influxWriteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
@@ -131,19 +152,36 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
influxQueryRequests.Inc()
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
return true
case "/targets":
promscrapeTargetsRequests.Inc()
w.Header().Set("Content-Type", "text/plain")
promscrape.WriteHumanReadableTargetsStatus(w)
return true
case "/-/reload":
promscrapeConfigReloadRequests.Inc()
procutil.SelfSIGHUP()
w.WriteHeader(http.StatusOK)
return true
}
return false
}
var (
prometheusWriteRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/write", protocol="prometheus"}`)
prometheusWriteErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/write", protocol="prometheus"}`)
prometheusWriteRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/write", protocol="promremotewrite"}`)
prometheusWriteErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/write", protocol="promremotewrite"}`)
vmimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import", protocol="vm"}`)
vmimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import", protocol="vm"}`)
vmimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import", protocol="vmimport"}`)
vmimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import", protocol="vmimport"}`)
csvimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import/csv", protocol="csvimport"}`)
csvimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import/csv", protocol="csvimport"}`)
influxWriteRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/write", protocol="influx"}`)
influxWriteErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/write", protocol="influx"}`)
influxQueryRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/query", protocol="influx"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
)

View File

@@ -11,25 +11,31 @@ import (
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fasthttp"
)
var (
sendTimeout = flag.Duration("remoteWrite.sendTimeout", time.Minute, "Timeout for sending a single block of data to -remoteWrite.url")
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
"By default system CA is used")
tlsCertFile = flagutil.NewArray("remoteWrite.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
tlsKeyFile = flagutil.NewArray("remoteWrite.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
tlsCAFile = flagutil.NewArray("remoteWrite.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
"By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username to use for -remoteWrite.url")
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password to use for -remoteWrite.url")
bearerToken = flag.String("remoteWrite.bearerToken", "", "Optional bearer auth token to use for -remoteWrite.url")
basicAuthUsername = flagutil.NewArray("remoteWrite.basicAuth.username", "Optional basic auth username to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
basicAuthPassword = flagutil.NewArray("remoteWrite.basicAuth.password", "Optional basic auth password to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
bearerToken = flagutil.NewArray("remoteWrite.bearerToken", "Optional bearer auth token to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
)
type client struct {
@@ -50,25 +56,29 @@ type client struct {
stopCh chan struct{}
}
func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQueue) *client {
func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQueue, concurrency int) *client {
authHeader := ""
if len(*basicAuthUsername) > 0 || len(*basicAuthPassword) > 0 {
username := basicAuthUsername.GetOptionalArg(argIdx)
password := basicAuthPassword.GetOptionalArg(argIdx)
if len(username) > 0 || len(password) > 0 {
// See https://en.wikipedia.org/wiki/Basic_access_authentication
token := *basicAuthUsername + ":" + *basicAuthPassword
token := username + ":" + password
token64 := base64.StdEncoding.EncodeToString([]byte(token))
authHeader = "Basic " + token64
}
if len(*bearerToken) > 0 {
token := bearerToken.GetOptionalArg(argIdx)
if len(token) > 0 {
if authHeader != "" {
logger.Panicf("FATAL: `-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", *bearerToken)
logger.Panicf("FATAL: `-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", token)
}
authHeader = "Bearer " + *bearerToken
authHeader = "Bearer " + token
}
readTimeout := *sendTimeout
if readTimeout <= 0 {
readTimeout = time.Minute
}
writeTimeout := readTimeout
var u fasthttp.URI
u.Update(remoteWriteURL)
scheme := string(u.Scheme())
@@ -86,7 +96,7 @@ func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQue
var tlsCfg *tls.Config
if isTLS {
var err error
tlsCfg, err = getTLSConfig()
tlsCfg, err = getTLSConfig(argIdx)
if err != nil {
logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
}
@@ -98,7 +108,7 @@ func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQue
host += ":80"
}
}
maxConns := 2 * *queues
maxConns := 2 * concurrency
hc := &fasthttp.HostClient{
Addr: host,
Name: "vmagent",
@@ -109,7 +119,7 @@ func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQue
MaxConns: maxConns,
MaxIdleConnDuration: 10 * readTimeout,
ReadTimeout: readTimeout,
WriteTimeout: 10 * time.Second,
WriteTimeout: writeTimeout,
MaxResponseBodySize: 1024 * 1024,
}
c := &client{
@@ -122,11 +132,11 @@ func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQue
hc: hc,
stopCh: make(chan struct{}),
}
c.requestDuration = metrics.NewHistogram(fmt.Sprintf(`vmagent_remotewrite_duration_seconds{url=%q}`, c.urlLabelValue))
c.requestsOKCount = metrics.NewCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.urlLabelValue))
c.errorsCount = metrics.NewCounter(fmt.Sprintf(`vmagent_remotewrite_errors_total{url=%q}`, c.urlLabelValue))
c.retriesCount = metrics.NewCounter(fmt.Sprintf(`vmagent_remotewrite_retries_count_total{url=%q}`, c.urlLabelValue))
for i := 0; i < *queues; i++ {
c.requestDuration = metrics.GetOrCreateHistogram(fmt.Sprintf(`vmagent_remotewrite_duration_seconds{url=%q}`, c.urlLabelValue))
c.requestsOKCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.urlLabelValue))
c.errorsCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_errors_total{url=%q}`, c.urlLabelValue))
c.retriesCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_retries_count_total{url=%q}`, c.urlLabelValue))
for i := 0; i < concurrency; i++ {
c.wg.Add(1)
go func() {
defer c.wg.Done()
@@ -143,24 +153,26 @@ func (c *client) MustStop() {
logger.Infof("stopped client for -remoteWrite.url=%q", c.remoteWriteURL)
}
func getTLSConfig() (*tls.Config, error) {
func getTLSConfig(argIdx int) (*tls.Config, error) {
var tlsRootCA *x509.CertPool
var tlsCertificate *tls.Certificate
if *tlsCertFile != "" || *tlsKeyFile != "" {
cert, err := tls.LoadX509KeyPair(*tlsCertFile, *tlsKeyFile)
certFile := tlsCertFile.GetOptionalArg(argIdx)
keyFile := tlsKeyFile.GetOptionalArg(argIdx)
if certFile != "" || keyFile != "" {
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
if err != nil {
return nil, fmt.Errorf("cannot load TLS certificate for -remoteWrite.tlsCertFile=%q and -remoteWrite.tlsKeyFile=%q: %s", *tlsCertFile, *tlsKeyFile, err)
return nil, fmt.Errorf("cannot load TLS certificate for -remoteWrite.tlsCertFile=%q and -remoteWrite.tlsKeyFile=%q: %s", certFile, keyFile, err)
}
tlsCertificate = &cert
}
if *tlsCAFile != "" {
data, err := ioutil.ReadFile(*tlsCAFile)
if caFile := tlsCAFile.GetOptionalArg(argIdx); caFile != "" {
data, err := ioutil.ReadFile(caFile)
if err != nil {
return nil, fmt.Errorf("cannot read -remoteWrite.tlsCAFile=%q: %s", *tlsCAFile, err)
return nil, fmt.Errorf("cannot read -remoteWrite.tlsCAFile=%q: %s", caFile, err)
}
tlsRootCA = x509.NewCertPool()
if !tlsRootCA.AppendCertsFromPEM(data) {
return nil, fmt.Errorf("cannot parse data -remoteWrite.tlsCAFile=%q", *tlsCAFile)
return nil, fmt.Errorf("cannot parse data -remoteWrite.tlsCAFile=%q", caFile)
}
}
tlsCfg := &tls.Config{
@@ -168,7 +180,9 @@ func getTLSConfig() (*tls.Config, error) {
ClientSessionCache: tls.NewLRUClientSessionCache(0),
}
if tlsCertificate != nil {
tlsCfg.Certificates = []tls.Certificate{*tlsCertificate}
tlsCfg.GetClientCertificate = func(*tls.CertificateRequestInfo) (*tls.Certificate, error) {
return tlsCertificate, nil
}
}
tlsCfg.InsecureSkipVerify = *tlsInsecureSkipVerify
return tlsCfg, nil
@@ -213,6 +227,7 @@ func (c *client) sendBlock(block []byte) {
req.Header.SetMethod("POST")
req.Header.Add("Content-Type", "application/x-protobuf")
req.Header.Add("Content-Encoding", "snappy")
req.Header.Add("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authHeader != "" {
req.Header.Set("Authorization", c.authHeader)
}
@@ -231,8 +246,7 @@ again:
}
startTime := time.Now()
// There is no need in calling DoTimeout, since the timeout is set in c.hc.ReadTimeout.
err := c.hc.Do(req, resp)
err := doRequestWithPossibleRetry(c.hc, req, resp)
c.requestDuration.UpdateDuration(startTime)
if err != nil {
c.errorsCount.Inc()
@@ -265,3 +279,16 @@ again:
fasthttp.ReleaseResponse(resp)
fasthttp.ReleaseRequest(req)
}
// doRequestWithPossibleRetry sends req via hc and stores the response in resp.
//
// The request is re-sent exactly once when the first attempt fails with
// fasthttp.ErrConnectionClosed, i.e. when the server closed the keep-alive
// connection during the first attempt. Any other outcome of the first attempt
// (success or a different error) is returned as is.
func doRequestWithPossibleRetry(hc *fasthttp.HostClient, req *fasthttp.Request, resp *fasthttp.Response) error {
	// Plain Do is used instead of DoTimeout, since the timeout must be already set in hc.ReadTimeout.
	firstErr := hc.Do(req, resp)
	if firstErr != fasthttp.ErrConnectionClosed {
		// Covers both the successful attempt (nil) and non-retriable errors.
		return firstErr
	}
	return hc.Do(req, resp)
}

View File

@@ -13,8 +13,12 @@ import (
"github.com/golang/snappy"
)
var flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage")
var (
flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage")
maxUnpackedBlockSize = flag.Int("remoteWrite.maxBlockSize", 32*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
"It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics")
)
// the maximum number of rows to send per each block.
const maxRowsPerBlock = 10000
@@ -160,17 +164,21 @@ func pushWriteRequest(wr *prompbmarshal.WriteRequest, pushBlock func(block []byt
}
bb := writeRequestBufPool.Get()
bb.B = prompbmarshal.MarshalWriteRequest(bb.B[:0], wr)
zb := snappyBufPool.Get()
zb.B = snappy.Encode(zb.B[:cap(zb.B)], bb.B)
writeRequestBufPool.Put(bb)
if len(zb.B) <= persistentqueue.MaxBlockSize {
pushBlock(zb.B)
blockSizeRows.Update(float64(len(wr.Timeseries)))
blockSizeBytes.Update(float64(len(zb.B)))
if len(bb.B) <= *maxUnpackedBlockSize {
zb := snappyBufPool.Get()
zb.B = snappy.Encode(zb.B[:cap(zb.B)], bb.B)
writeRequestBufPool.Put(bb)
if len(zb.B) <= persistentqueue.MaxBlockSize {
pushBlock(zb.B)
blockSizeRows.Update(float64(len(wr.Timeseries)))
blockSizeBytes.Update(float64(len(zb.B)))
snappyBufPool.Put(zb)
return
}
snappyBufPool.Put(zb)
return
} else {
writeRequestBufPool.Put(bb)
}
snappyBufPool.Put(zb)
// Too big block. Recursively split it into smaller parts.
timeseries := wr.Timeseries

View File

@@ -12,50 +12,46 @@ import (
)
var (
extraLabelsUnparsed = flagutil.NewArray("remoteWrite.label", "Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. "+
unparsedLabelsGlobal = flagutil.NewArray("remoteWrite.label", "Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. "+
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
relabelConfigPath = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
relabelConfigPathGlobal = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
"before sending them to -remoteWrite.url. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config for details")
)
var extraLabels []prompbmarshal.Label
var prcs []promrelabel.ParsedRelabelConfig
var labelsGlobal []prompbmarshal.Label
var prcsGlobal []promrelabel.ParsedRelabelConfig
// initRelabel must be called after parsing command-line flags.
func initRelabel() {
// Init extraLabels
for _, s := range *extraLabelsUnparsed {
// initRelabelGlobal must be called after parsing command-line flags.
func initRelabelGlobal() {
// Init labelsGlobal
labelsGlobal = nil
for _, s := range *unparsedLabelsGlobal {
n := strings.IndexByte(s, '=')
if n < 0 {
logger.Panicf("FATAL: missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
}
extraLabels = append(extraLabels, prompbmarshal.Label{
labelsGlobal = append(labelsGlobal, prompbmarshal.Label{
Name: s[:n],
Value: s[n+1:],
})
}
// Init prcs
if len(*relabelConfigPath) > 0 {
// Init prcsGlobal
prcsGlobal = nil
if len(*relabelConfigPathGlobal) > 0 {
var err error
prcs, err = promrelabel.LoadRelabelConfigs(*relabelConfigPath)
prcsGlobal, err = promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
if err != nil {
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.relabelConfig=%q: %s", *relabelConfigPath, err)
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
}
}
}
func resetRelabel() {
extraLabels = nil
prcs = nil
}
func (rctx *relabelCtx) applyRelabeling(wr *prompbmarshal.WriteRequest) {
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label, prcs []promrelabel.ParsedRelabelConfig) []prompbmarshal.TimeSeries {
if len(extraLabels) == 0 && len(prcs) == 0 {
// Nothing to change.
return
return tss
}
tss := wr.Timeseries
tssDst := tss[:0]
labels := rctx.labels[:0]
for i := range tss {
@@ -83,7 +79,7 @@ func (rctx *relabelCtx) applyRelabeling(wr *prompbmarshal.WriteRequest) {
})
}
rctx.labels = labels
wr.Timeseries = tssDst
return tssDst
}
type relabelCtx struct {
@@ -106,3 +102,12 @@ var relabelCtxPool = &sync.Pool{
return &relabelCtx{}
},
}
// getRelabelCtx obtains a relabelCtx from relabelCtxPool.
//
// Hand the context back with putRelabelCtx when it is no longer needed.
func getRelabelCtx() *relabelCtx {
	v := relabelCtxPool.Get()
	return v.(*relabelCtx)
}
// putRelabelCtx truncates rctx.labels to zero length and returns rctx to relabelCtxPool.
//
// rctx must not be used by the caller after the call.
func putRelabelCtx(rctx *relabelCtx) {
	labels := rctx.labels
	rctx.labels = labels[:0]
	relabelCtxPool.Put(rctx)
}

View File

@@ -11,6 +11,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
"github.com/VictoriaMetrics/metrics"
xxhash "github.com/cespare/xxhash/v2"
)
@@ -19,13 +20,20 @@ var (
remoteWriteURLs = flagutil.NewArray("remoteWrite.url", "Remote storage URL to write data to. It must support Prometheus remote_write API. "+
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
"isn't enough for sending high volume of collected data to remote storage")
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
"It is hidden by default, since it can contain sensistive auth info")
maxPendingBytesPerURL = flag.Int("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
"for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. "+
"Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500000000. "+
"Disk usage is unlimited if the value is set to 0")
)
var rwctxs []*remoteWriteCtx
// Init initializes remotewrite.
//
// It must be called after flag.Parse().
@@ -40,7 +48,7 @@ func Init() {
// remoteWrite.url can contain authentication codes, so hide it at `/metrics` output.
httpserver.RegisterSecretFlag("remoteWrite.url")
}
initRelabel()
initRelabelGlobal()
maxInmemoryBlocks := memory.Allowed() / len(*remoteWriteURLs) / maxRowsPerBlock / 100
if maxInmemoryBlocks > 200 {
@@ -53,27 +61,16 @@ func Init() {
maxInmemoryBlocks = 2
}
for i, remoteWriteURL := range *remoteWriteURLs {
h := xxhash.Sum64([]byte(remoteWriteURL))
path := fmt.Sprintf("%s/persistent-queue/%016X", *tmpDataPath, h)
fq := persistentqueue.MustOpenFastQueue(path, remoteWriteURL, maxInmemoryBlocks)
relabelConfigPath := ""
if i < len(*relabelConfigPaths) {
relabelConfigPath = (*relabelConfigPaths)[i]
}
urlLabelValue := fmt.Sprintf("secret-url-%d", i+1)
if *showRemoteWriteURL {
urlLabelValue = remoteWriteURL
}
_ = metrics.NewGauge(fmt.Sprintf(`vmagent_remotewrite_pending_data_bytes{url=%q, hash="%016X"}`, urlLabelValue, h), func() float64 {
return float64(fq.GetPendingBytes())
})
_ = metrics.NewGauge(fmt.Sprintf(`vmagent_remotewrite_pending_inmemory_blocks{url=%q}`, urlLabelValue), func() float64 {
return float64(fq.GetInmemoryQueueLen())
})
c := newClient(remoteWriteURL, urlLabelValue, fq)
fqs = append(fqs, fq)
cs = append(cs, c)
}
pss = make([]*pendingSeries, *queues)
for i := range pss {
pss[i] = newPendingSeries(pushBlockToPersistentQueues)
rwctx := newRemoteWriteCtx(i, remoteWriteURL, relabelConfigPath, maxInmemoryBlocks, urlLabelValue)
rwctxs = append(rwctxs, rwctx)
}
}
@@ -81,47 +78,118 @@ func Init() {
//
// It is expected that nobody calls Push during and after the call to this func.
func Stop() {
for _, ps := range pss {
ps.MustStop()
for _, rwctx := range rwctxs {
rwctx.MustStop()
}
// Close all the persistent queues. This should unblock clients waiting in MustReadBlock.
for _, fq := range fqs {
fq.MustClose()
}
fqs = nil
// Stop all the clients
for _, c := range cs {
c.MustStop()
}
cs = nil
resetRelabel()
rwctxs = nil
}
// Push sends wr to remote storage systems set via `-remoteWrite.url`.
//
// Each timeseries in wr.Timeseries must contain one sample.
func Push(wr *prompbmarshal.WriteRequest) {
rctx := relabelCtxPool.Get().(*relabelCtx)
rctx.applyRelabeling(wr)
idx := atomic.AddUint64(&pssNextIdx, 1) % uint64(len(pss))
pss[idx].Push(wr.Timeseries)
rctx.reset()
relabelCtxPool.Put(rctx)
}
func pushBlockToPersistentQueues(block []byte) {
for _, fq := range fqs {
fq.MustWriteBlock(block)
var rctx *relabelCtx
if len(prcsGlobal) > 0 || len(labelsGlobal) > 0 {
rctx = getRelabelCtx()
}
tss := wr.Timeseries
for len(tss) > 0 {
// Process big tss in smaller blocks in order to reduce maxmimum memory usage
tssBlock := tss
if len(tssBlock) > maxRowsPerBlock {
tssBlock = tss[:maxRowsPerBlock]
tss = tss[maxRowsPerBlock:]
} else {
tss = nil
}
if rctx != nil {
tssBlockLen := len(tssBlock)
tssBlock = rctx.applyRelabeling(tssBlock, labelsGlobal, prcsGlobal)
globalRelabelMetricsDropped.Add(tssBlockLen - len(tssBlock))
}
for _, rwctx := range rwctxs {
rwctx.Push(tssBlock)
}
if rctx != nil {
rctx.reset()
}
}
if rctx != nil {
putRelabelCtx(rctx)
}
}
var fqs []*persistentqueue.FastQueue
var cs []*client
var globalRelabelMetricsDropped = metrics.NewCounter("vmagent_remotewrite_global_relabel_metrics_dropped_total")
var pssNextIdx uint64
var pss []*pendingSeries
// remoteWriteCtx bundles the state created by newRemoteWriteCtx
// for a single -remoteWrite.url.
type remoteWriteCtx struct {
// fq is the persistent queue the pending series write their blocks to.
fq *persistentqueue.FastQueue
// c is the client created for the remote write url; it receives fq on creation.
c *client
// prcs holds relabel configs loaded from -remoteWrite.urlRelabelConfig; it is empty if the path isn't set.
prcs []promrelabel.ParsedRelabelConfig
// pss is the list of pending series buffers; Push spreads incoming data across them.
pss []*pendingSeries
// pssNextIdx is incremented atomically by Push for choosing the next entry in pss.
pssNextIdx uint64
// relabelMetricsDropped counts the time series removed by the relabeling applied in Push.
relabelMetricsDropped *metrics.Counter
}
// newRemoteWriteCtx creates the state needed for pushing data to remoteWriteURL.
//
// It opens the persistent queue for the url, registers gauges exposing the queue state,
// creates the client, loads the optional relabel configs from relabelConfigPath
// and creates -remoteWrite.queues pending series buffers writing to the queue.
//
// argIdx is passed to newClient as is; Init passes the position of the url
// among the -remoteWrite.url flags there.
// urlLabelValue is used as the `url` label value in the registered metrics.
//
// The function panics via logger.Panicf if the relabel configs cannot be loaded.
func newRemoteWriteCtx(argIdx int, remoteWriteURL, relabelConfigPath string, maxInmemoryBlocks int, urlLabelValue string) *remoteWriteCtx {
// The queue directory name is derived from the hash of the url.
h := xxhash.Sum64([]byte(remoteWriteURL))
path := fmt.Sprintf("%s/persistent-queue/%016X", *tmpDataPath, h)
fq := persistentqueue.MustOpenFastQueue(path, remoteWriteURL, maxInmemoryBlocks, *maxPendingBytesPerURL)
// The returned gauges aren't needed here; their values are obtained from fq via the callbacks.
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_data_bytes{path=%q, url=%q}`, path, urlLabelValue), func() float64 {
return float64(fq.GetPendingBytes())
})
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_inmemory_blocks{path=%q, url=%q}`, path, urlLabelValue), func() float64 {
return float64(fq.GetInmemoryQueueLen())
})
c := newClient(argIdx, remoteWriteURL, urlLabelValue, fq, *queues)
// Load per-url relabel configs only if the path is set; prcs stays nil otherwise.
var prcs []promrelabel.ParsedRelabelConfig
if len(relabelConfigPath) > 0 {
var err error
prcs, err = promrelabel.LoadRelabelConfigs(relabelConfigPath)
if err != nil {
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", relabelConfigPath, err)
}
}
// Every pending series buffer writes its blocks to the same queue.
pss := make([]*pendingSeries, *queues)
for i := range pss {
pss[i] = newPendingSeries(fq.MustWriteBlock)
}
return &remoteWriteCtx{
fq: fq,
c: c,
prcs: prcs,
pss: pss,
relabelMetricsDropped: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q, url=%q}`, path, urlLabelValue)),
}
}
// MustStop stops everything owned by rwctx and clears its fields.
//
// The components are stopped in a fixed order: pending series first,
// then the persistent queue, then the client.
// rwctx must not be used after the call, since all of its fields are set to nil.
func (rwctx *remoteWriteCtx) MustStop() {
for _, ps := range rwctx.pss {
ps.MustStop()
}
rwctx.pss = nil
// The queue is closed before the client is stopped.
// NOTE(review): presumably closing the queue unblocks the client waiting for new blocks - confirm.
rwctx.fq.MustClose()
rwctx.fq = nil
rwctx.prcs = nil
rwctx.c.MustStop()
rwctx.c = nil
rwctx.relabelMetricsDropped = nil
}
// Push applies the per-url relabeling (if it is configured) to tss and passes
// the result to one of rwctx.pss chosen in round-robin fashion.
//
// The elements of the caller's tss slice are never overwritten by the relabeling:
// the package-level Push passes the same slice to every remoteWriteCtx,
// so relabeling it in place would alter the data pushed to the remaining urls.
func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
	var rctx *relabelCtx
	if len(rwctx.prcs) > 0 {
		// applyRelabeling filters the passed slice in place (it re-uses its backing array
		// for the result), so work on a private copy of tss.
		tss = append([]prompbmarshal.TimeSeries(nil), tss...)
		rctx = getRelabelCtx()
		tssLen := len(tss)
		tss = rctx.applyRelabeling(tss, nil, rwctx.prcs)
		rwctx.relabelMetricsDropped.Add(tssLen - len(tss))
	}
	pss := rwctx.pss
	idx := atomic.AddUint64(&rwctx.pssNextIdx, 1) % uint64(len(pss))
	pss[idx].Push(tss)
	if rctx != nil {
		// rctx may be released only after the relabeled series are pushed.
		putRelabelCtx(rctx)
	}
}

View File

@@ -4,8 +4,8 @@ import (
"net"
"sync/atomic"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fasthttp"
)
func statDial(addr string) (net.Conn, error) {

81
app/vmalert/Makefile Normal file
View File

@@ -0,0 +1,81 @@
# All these commands must run from repository root.
vmalert:
APP_NAME=vmalert $(MAKE) app-local
vmalert-race:
APP_NAME=vmalert RACE=-race $(MAKE) app-local
vmalert-prod:
APP_NAME=vmalert $(MAKE) app-via-docker
vmalert-pure-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-pure
vmalert-amd64-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-amd64
vmalert-arm-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-arm
vmalert-arm64-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-arm64
vmalert-ppc64le-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-ppc64le
vmalert-386-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-386
package-vmalert:
APP_NAME=vmalert $(MAKE) package-via-docker
package-vmalert-pure:
APP_NAME=vmalert $(MAKE) package-via-docker-pure
package-vmalert-amd64:
APP_NAME=vmalert $(MAKE) package-via-docker-amd64
package-vmalert-arm:
APP_NAME=vmalert $(MAKE) package-via-docker-arm
package-vmalert-arm64:
APP_NAME=vmalert $(MAKE) package-via-docker-arm64
package-vmalert-ppc64le:
APP_NAME=vmalert $(MAKE) package-via-docker-ppc64le
package-vmalert-386:
APP_NAME=vmalert $(MAKE) package-via-docker-386
publish-vmalert:
APP_NAME=vmalert $(MAKE) publish-via-docker
test-vmalert:
go test -race -cover ./app/vmalert
run-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/testdata/rules0-good.rules \
-datasource.url=http://localhost:8428 \
-notifier.url=http://localhost:9093 \
-remotewrite.url=http://localhost:8428 \
-remoteread.url=http://localhost:8428 \
-evaluationInterval=3s
vmalert-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-amd64 ./app/vmalert
vmalert-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-arm ./app/vmalert
vmalert-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-arm64 ./app/vmalert
vmalert-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-ppc64le ./app/vmalert
vmalert-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-386 ./app/vmalert
vmalert-pure:
APP_NAME=vmalert $(MAKE) app-local-pure

View File

@@ -1,41 +1,116 @@
## VM Alert
#### Abstract
The application which accepts the alert rules, executes them on given source, sends(fires) an alert to(in) alert management system
`vmalert` executes a list of given MetricsQL expressions (rules) and
sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
### Components
### Features:
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
expressions validation;
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
support;
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
* Lightweight without extra dependencies.
#### Alert Config Reader
It accepts yaml config as input parameter in Prometheus format, parses it into Go struct.
### TODO:
* Configuration hot reload.
#### Source Caller
Create own watchdog for every alert group (goroutines), which executes alert query on given source and issues an alert if source returns non-empty result.
Source can be any service which supports PromQL (MetricsQL).
### QuickStart
#### Alert Management System Provider
Send positive alert to alert management system, provides interface for every concrete implementation.
Should be integrated with Prometheus alertmanager.
To build `vmalert` from sources:
```
git clone https://github.com/VictoriaMetrics/VictoriaMetrics
cd VictoriaMetrics
make vmalert
```
The built binary will be placed in the `VictoriaMetrics/bin` folder.
open questions:
- do we really need alert group or can just run every alert in own goroutine?
To start using `vmalert` you will need the following things:
* list of alert rules - PromQL/MetricsQL expressions to execute;
* datasource address - reachable VictoriaMetrics instance for rules execution;
* notifier address - reachable Alertmanager instance for processing,
aggregating alerts and sending notifications.
#### Web Server
Expose metrics
Then configure `vmalert` accordingly:
```
./bin/vmalert -rule=alert.rules \
-datasource.url=http://localhost:8428 \
-notifier.url=http://localhost:9093
```
open questions:
- should the tool provide API or UI for managing alerting rules? Where to store config updated via the API or UI?
- should the tool provide “alerting rules validation mode” for validating and debugging alerting rules? This mode is useful when creating and debugging alerting rules.
Example for `.rules` file may be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/testdata/rules0-good.rules)
#### Requirements:
- Stateless
- Avoid external dependencies if possible
- Reuse existing code from VictoriaMetrics repo
- Makefile rules for common tasks see Makefiles for other apps in the app/ dir
- Every package should be covered by tests
- Dockerfile
- Graceful shutdown
- Helm template
- Application uses command line flags for configuration
`vmalert` runs evaluation for every group in a separate goroutine.
Rules in a group are evaluated sequentially, one by one.
`vmalert` also runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status` - get alert status by ID.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/metrics` - application metrics.
`vmalert` may be configured with `-remotewrite` flag to write alerts state in form of timeseries
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
may be used to recover alerts state on `vmalert` restarts if `-remoteread` is configured.
<img alt="VM Alert" src="vmalert.png">
### Configuration
The shortlist of configuration flags is the following:
```
Usage of vmalert:
-datasource.basicAuth.password string
Optional basic auth password for -datasource.url
-datasource.basicAuth.username string
Optional basic auth username for -datasource.url
-datasource.url string
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
-evaluationInterval duration
How often to evaluate the rules. Default 1m (default 1m0s)
-external.url string
External URL is used as alert's source for sent alerts to the notifier
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
Address to listen for http connections (default ":8880")
-notifier.url string
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
-remoteread.basicAuth.password string
Optional basic auth password for -remoteread.url
-remoteread.basicAuth.username string
Optional basic auth username for -remoteread.url
-remoteread.lookback duration
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteread.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remotewrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remotewrite.basicAuth.password string
Optional basic auth password for -remotewrite.url
-remotewrite.basicAuth.username string
Optional basic auth username for -remotewrite.url
-remotewrite.url string
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
-rule value
Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule /path/to/file. Path to a single file with alerting rules
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
-rule.validateTemplates
Indicates to validate annotation and label templates (default true)
```
Pass `-help` to `vmalert` in order to see the full list of supported
command-line flags with their descriptions.
### Contributing
`vmalert` is mostly designed and built by VictoriaMetrics community.
Feel free to share your experience and ideas for improving this
software. Please keep simplicity as the main priority.

70
app/vmalert/config.go Normal file
View File

@@ -0,0 +1,70 @@
package main
import (
"fmt"
"gopkg.in/yaml.v2"
"io/ioutil"
"path/filepath"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// Parse parses rule configs from the files matching the given path patterns.
//
// Every pattern is expanded with filepath.Glob and every matching file is parsed.
// The following checks are performed per file:
//   - group names must be unique inside the file;
//   - every rule must pass Validate;
//   - if validateAnnotations is set, then annotation and label templates
//     of every rule must pass notifier.ValidateTemplates.
//
// An error is returned if any file cannot be parsed or validated,
// or if no groups are found in all the matching files.
func Parse(pathPatterns []string, validateAnnotations bool) ([]Group, error) {
	var fp []string
	for _, pattern := range pathPatterns {
		matches, err := filepath.Glob(pattern)
		if err != nil {
			return nil, fmt.Errorf("error reading file pattern %s: %w", pattern, err)
		}
		fp = append(fp, matches...)
	}
	var groups []Group
	for _, file := range fp {
		gr, err := parseFile(file)
		if err != nil {
			return nil, fmt.Errorf("file %s: %w", file, err)
		}
		// Group names must be unique only in the scope of a single file.
		groupsNames := make(map[string]struct{}, len(gr))
		for _, group := range gr {
			if _, ok := groupsNames[group.Name]; ok {
				return nil, fmt.Errorf("one file can not contain groups with the same name %s, filepath:%s", group.Name, file)
			}
			groupsNames[group.Name] = struct{}{}
			for _, rule := range group.Rules {
				if err = rule.Validate(); err != nil {
					return nil, fmt.Errorf("invalid rule filepath:%s, group %s:%w", file, group.Name, err)
				}
				// TODO: this init looks weird here
				rule.alerts = make(map[uint64]*notifier.Alert)
				if validateAnnotations {
					if err = notifier.ValidateTemplates(rule.Annotations); err != nil {
						return nil, fmt.Errorf("invalid annotations filepath:%s, group %s:%w", file, group.Name, err)
					}
					if err = notifier.ValidateTemplates(rule.Labels); err != nil {
						return nil, fmt.Errorf("invalid labels filepath:%s, group %s:%w", file, group.Name, err)
					}
				}
				rule.group = group
			}
		}
		groups = append(groups, gr...)
	}
	if len(groups) < 1 {
		return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
	}
	return groups, nil
}
// parseFile reads the file at the given path and decodes the `groups` section
// of its yaml contents.
//
// The groups decoded so far are returned together with the error if yaml decoding fails.
func parseFile(path string) ([]Group, error) {
	contents, readErr := ioutil.ReadFile(path)
	if readErr != nil {
		return nil, fmt.Errorf("error reading alert rule file: %w", readErr)
	}
	var parsed struct {
		Groups []Group `yaml:"groups"`
	}
	if err := yaml.Unmarshal(contents, &parsed); err != nil {
		return parsed.Groups, err
	}
	return parsed.Groups, nil
}

View File

@@ -1,32 +0,0 @@
package config
import "time"
// Annotations holds the basic annotations of an alert rule:
// a summary and a description, both plain strings.
type Annotations struct {
Summary string
Description string
}
// Alert is the basic alert rule entity.
//
// NOTE(review): nothing in this file reads the fields, so their meaning below
// is inferred from the names - confirm against the callers.
type Alert struct {
// Name is presumably the name of the alert rule.
Name string
// Expr is presumably the query expression of the rule.
Expr string
// For is presumably the duration the condition must hold before the alert fires.
For time.Duration
Labels map[string]string
Annotations Annotations
Start time.Time
End time.Time
}
// Group is a named list of alert rules.
type Group struct {
Name string
Rules []Alert
}
// Parse is expected to parse the config from the file at the given path.
//
// It is a stub: filepath is ignored and an empty, non-nil list of groups
// is always returned together with a nil error.
func Parse(filepath string) ([]Group, error) {
return []Group{}, nil
}

View File

@@ -0,0 +1,39 @@
package main
import (
"net/url"
"os"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// TestMain initializes the notifier template functions with a fixed external url
// before running the tests of the package.
func TestMain(m *testing.M) {
	// The url is a constant known to be valid, so the parse error is ignored.
	externalURL, _ := url.Parse("https://victoriametrics.com/path")
	notifier.InitTemplateFunc(externalURL)
	exitCode := m.Run()
	os.Exit(exitCode)
}
// TestParseGood verifies that all the valid rule files from testdata are parsed without errors.
func TestParseGood(t *testing.T) {
	patterns := []string{"testdata/*good.rules", "testdata/dir/*good.*"}
	_, err := Parse(patterns, true)
	if err != nil {
		t.Errorf("error parsing files %s", err)
	}
}
// TestParseBad verifies that Parse returns an error for every invalid input.
func TestParseBad(t *testing.T) {
	// Every case is parsed on its own; failMsg is reported if Parse unexpectedly succeeds.
	cases := []struct {
		pattern string
		failMsg string
	}{
		{"testdata/rules0-bad.rules", "expected syntaxt error"},
		{"testdata/dir/rules0-bad.rules", "expected template annotation error"},
		{"testdata/dir/rules1-bad.rules", "expected same group error"},
		{"testdata/dir/rules2-bad.rules", "expected template label error"},
		{"testdata/*.yaml", "expected empty group"},
	}
	for _, tc := range cases {
		_, err := Parse([]string{tc.pattern}, true)
		if err == nil {
			t.Error(tc.failMsg)
		}
	}
}

View File

@@ -2,13 +2,23 @@ package datasource
import "context"
// Metrics the data returns from storage
type Metrics struct{}
// VMStorage represents vmstorage entity with ability to read and write metrics
type VMStorage struct{}
//Query basic query to the datasource
func (s *VMStorage) Query(ctx context.Context, query string) ([]Metrics, error) {
return nil, nil
// Querier wraps the Query method, which executes
// the given query and returns the list of resulting metrics.
type Querier interface {
// Query executes query and returns the resulting metrics or an error.
Query(ctx context.Context, query string) ([]Metric, error)
}
// Metric is the basic entity returned by a datasource.
// It represents a single data point with the full list of its labels.
type Metric struct {
Labels []Label
Timestamp int64
Value float64
}
// Label represents a single name-value label of a metric.
type Label struct {
Name string
Value string
}

View File

@@ -0,0 +1,103 @@
package datasource
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strconv"
"strings"
)
// response is the json structure the query response of the datasource is decoded into.
type response struct {
// Status is compared by Query against "success" and "error".
Status string `json:"status"`
Data struct {
// ResultType must be "vector" for the response to be accepted by Query.
ResultType string `json:"resultType"`
Result []struct {
Labels map[string]string `json:"metric"`
// TV is the [timestamp, value] pair: the metrics method reads
// the timestamp as a number and the value as a string.
TV [2]interface{} `json:"value"`
} `json:"result"`
} `json:"data"`
// ErrorType and Error are included into the error returned by Query when Status is "error".
ErrorType string `json:"errorType"`
Error string `json:"error"`
}
// metrics converts the decoded query result into a list of Metric.
//
// Every result entry must carry a [timestamp, value] pair, where the timestamp
// is a json number and the value is a string holding a float64.
// An error is returned instead of panicking if the datasource returns
// a pair of any other shape, e.g. a null or a non-string value.
//
// A nil list is returned for an empty result.
func (r response) metrics() ([]Metric, error) {
	if len(r.Data.Result) == 0 {
		return nil, nil
	}
	ms := make([]Metric, 0, len(r.Data.Result))
	for _, res := range r.Data.Result {
		// The pair is decoded into interface{} values, so their dynamic types
		// depend on the response contents and must be verified.
		ts, ok := res.TV[0].(float64)
		if !ok {
			return nil, fmt.Errorf("metric %v, unexpected timestamp type %T; want a number", res, res.TV[0])
		}
		s, ok := res.TV[1].(string)
		if !ok {
			return nil, fmt.Errorf("metric %v, unexpected value type %T; want a string", res, res.TV[1])
		}
		f, err := strconv.ParseFloat(s, 64)
		if err != nil {
			return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %s", res, s, err)
		}
		m := Metric{
			Timestamp: int64(ts),
			Value:     f,
		}
		// Labels stay nil for a metric without labels.
		if len(res.Labels) > 0 {
			m.Labels = make([]Label, 0, len(res.Labels))
			for k, v := range res.Labels {
				m.Labels = append(m.Labels, Label{Name: k, Value: v})
			}
		}
		ms = append(ms, m)
	}
	return ms, nil
}
const queryPath = "/api/v1/query?query="
// VMStorage is a datasource client, which reads metrics
// from the configured url via the Query method.
type VMStorage struct {
// c is the http client used for sending the queries.
c *http.Client
// queryURL is the base url with queryPath appended; the escaped query is appended to it by Query.
queryURL string
// basicAuthUser and basicAuthPass are the optional basic auth credentials for the queries.
basicAuthUser, basicAuthPass string
}
// NewVMStorage returns a VMStorage, which sends queries to baseURL via c.
//
// A single trailing slash is removed from baseURL before the query path is appended to it.
// basicAuthUser and basicAuthPass are stored as is for use in the queries.
func NewVMStorage(baseURL, basicAuthUser, basicAuthPass string, c *http.Client) *VMStorage {
	s := &VMStorage{c: c}
	s.queryURL = strings.TrimSuffix(baseURL, "/") + queryPath
	s.basicAuthUser = basicAuthUser
	s.basicAuthPass = basicAuthPass
	return s
}
// Query executes the given query at the datasource and returns the resulting metrics.
//
// The query is sent in a POST request with the escaped query appended to s.queryURL.
// ctx is attached to the request.
//
// An error is returned if the request fails, if the response code isn't 200,
// if the response body cannot be decoded, if the response status isn't "success"
// or if the result type isn't "vector".
func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
	const (
		statusSuccess, statusError, rtVector = "success", "error", "vector"
	)
	req, err := http.NewRequest("POST", s.queryURL+url.QueryEscape(query), nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	// NOTE(review): the credentials are sent only if the password is non-empty,
	// so a username with an empty password is skipped - confirm this is intended.
	if s.basicAuthPass != "" {
		req.SetBasicAuth(s.basicAuthUser, s.basicAuthPass)
	}
	resp, err := s.c.Do(req.WithContext(ctx))
	if err != nil {
		return nil, fmt.Errorf("error getting response from %s:%s", req.URL, err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		// The body is read on a best-effort basis; it is used only for enriching the error message.
		body, _ := ioutil.ReadAll(resp.Body)
		return nil, fmt.Errorf("datasource returns unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
	}
	r := &response{}
	if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
		return nil, fmt.Errorf("error parsing metrics for %s:%s", req.URL, err)
	}
	if r.Status == statusError {
		return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
	}
	if r.Status != statusSuccess {
		return nil, fmt.Errorf("unknown status:%s, Expected success or error", r.Status)
	}
	if r.Data.ResultType != rtVector {
		return nil, fmt.Errorf("unknown result type:%s. Expected vector", r.Data.ResultType)
	}
	return r.metrics()
}

View File

@@ -0,0 +1,93 @@
package datasource
import (
"context"
"net/http"
"net/http/httptest"
"testing"
)
var (
ctx = context.Background()
basicAuthName = "foo"
basicAuthPass = "bar"
query = "vm_rows"
)
// TestVMSelectQuery drives VMStorage.Query against a test server that answers
// each consecutive request with a different scenario: a dropped connection, a
// 500 status, a malformed body, an "error" status, an unknown status, a
// non-vector result type and finally a valid vector response.
func TestVMSelectQuery(t *testing.T) {
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
		t.Errorf("should not be called")
	})
	// c counts handled requests and selects the scenario to serve.
	c := -1
	mux.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
		c++
		if r.Method != http.MethodPost {
			t.Errorf("expected POST method got %s", r.Method)
		}
		if name, pass, _ := r.BasicAuth(); name != basicAuthName || pass != basicAuthPass {
			t.Errorf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass)
		}
		if r.URL.Query().Get("query") != query {
			t.Errorf("exptected %s in query param, got %s", query, r.URL.Query().Get("query"))
		}
		switch c {
		case 0:
			// Drop the connection without sending a response.
			conn, _, _ := w.(http.Hijacker).Hijack()
			_ = conn.Close()
		case 1:
			w.WriteHeader(500)
		case 2:
			// Valid JSON, but not the expected object.
			w.Write([]byte("[]"))
		case 3:
			w.Write([]byte(`{"status":"error", "errorType":"type:", "error":"some error msg"}`))
		case 4:
			w.Write([]byte(`{"status":"unknown"}`))
		case 5:
			w.Write([]byte(`{"status":"success","data":{"resultType":"matrix"}}`))
		case 6:
			w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]}]}}`))
		}
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()
	am := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, srv.Client())
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected connection error got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected invalid response status error got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected response body error got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected error status got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected unkown status got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected non-vector resultType error got nil")
	}
	m, err := am.Query(ctx, query)
	if err != nil {
		t.Fatalf("unexpected %s", err)
	}
	if len(m) != 1 {
		t.Fatalf("exptected 1 metric got %d in %+v", len(m), m)
	}
	expected := Metric{
		Labels:    []Label{{Value: "vm_rows", Name: "__name__"}},
		Timestamp: 1583786142,
		Value:     13763,
	}
	// Guard the index access below so a wrong label count fails cleanly.
	if len(m[0].Labels) != len(expected.Labels) {
		t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
	}
	// Any single mismatching field must fail the test, hence "||".
	if m[0].Timestamp != expected.Timestamp ||
		m[0].Value != expected.Value ||
		m[0].Labels[0].Value != expected.Labels[0].Value ||
		m[0].Labels[0].Name != expected.Labels[0].Name {
		t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
	}
}

View File

@@ -0,0 +1,8 @@
# base_image is a build argument naming the image this one is built on.
ARG base_image
FROM $base_image
# 8880 is the port documented for the service in this image.
EXPOSE 8880
ENTRYPOINT ["/vmalert-prod"]
# src_binary is a build argument with the path of the binary to copy in.
# NOTE(review): the destination is relative while the entrypoint is the
# absolute /vmalert-prod; this assumes the working directory is "/" - confirm.
ARG src_binary
COPY $src_binary ./vmalert-prod

View File

@@ -1,60 +1,239 @@
package main
import (
"context"
"flag"
"fmt"
"net/http"
"net/url"
"os"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/metrics"
)
var (
configPath = flag.String("config", "config.yaml", "Path to alert configuration file")
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule /path/to/file. Path to a single file with alerting rules
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.`)
validateTemplates = flag.Bool("rule.validateTemplates", true, "Indicates to validate annotation and label templates")
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
datasourceURL = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
" E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
remoteWriteURL = flag.String("remotewrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
" in form of timeseries. E.g. http://127.0.0.1:8428")
remoteWriteUsername = flag.String("remotewrite.basicAuth.username", "", "Optional basic auth username for -remotewrite.url")
remoteWritePassword = flag.String("remotewrite.basicAuth.password", "", "Optional basic auth password for -remotewrite.url")
remoteReadURL = flag.String("remoteread.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
" state. This configuration makes sense only if `vmalert` was configured with `remotewrite.url` before and has been successfully persisted its state."+
" E.g. http://127.0.0.1:8428")
remoteReadUsername = flag.String("remoteread.basicAuth.username", "", "Optional basic auth username for -remoteread.url")
remoteReadPassword = flag.String("remoteread.basicAuth.password", "", "Optional basic auth password for -remoteread.url")
remoteReadLookBack = flag.Duration("remoteread.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules. Default 1m")
notifierURL = flag.String("notifier.url", "", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
)
// TODO: hot configuration reload
func main() {
envflag.Parse()
buildinfo.Init()
logger.Init()
logger.Infof("reading alert rules configuration file from %s", *configPath)
alertGroups, err := config.Parse(*configPath)
checkFlags()
ctx, cancel := context.WithCancel(context.Background())
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
if err != nil {
logger.Fatalf("Cannot parse configuration file %s", err)
logger.Fatalf("can not get external url:%s ", err)
}
w := &watchdog{storage: &datasource.VMStorage{}}
for id := range alertGroups {
go func(group config.Group) {
w.run(group)
}(alertGroups[id])
notifier.InitTemplateFunc(eu)
logger.Infof("reading alert rules configuration file from %s", strings.Join(*rulePath, ";"))
groups, err := Parse(*rulePath, *validateTemplates)
if err != nil {
logger.Fatalf("cannot parse configuration file: %s", err)
}
go func() {
httpserver.Serve(*httpListenAddr, func(w http.ResponseWriter, r *http.Request) bool {
panic("not implemented")
w := &watchdog{
storage: datasource.NewVMStorage(*datasourceURL, *basicAuthUsername, *basicAuthPassword, &http.Client{}),
alertProvider: notifier.NewAlertManager(*notifierURL, func(group, name string) string {
return fmt.Sprintf("%s/api/v1/%s/%s/status", eu, group, name)
}, &http.Client{}),
}
if *remoteWriteURL != "" {
c, err := remotewrite.NewClient(ctx, remotewrite.Config{
Addr: *remoteWriteURL,
FlushInterval: *evaluationInterval,
BasicAuthUser: *remoteWriteUsername,
BasicAuthPass: *remoteWritePassword,
})
}()
if err != nil {
logger.Fatalf("failed to init remotewrite client: %s", err)
}
w.rw = c
}
var restoreDS *datasource.VMStorage
if *remoteReadURL != "" {
restoreDS = datasource.NewVMStorage(*remoteReadURL, *remoteReadUsername, *remoteReadPassword, &http.Client{})
}
wg := sync.WaitGroup{}
for _, g := range groups {
if restoreDS != nil {
err := g.Restore(ctx, restoreDS, *remoteReadLookBack)
if err != nil {
logger.Errorf("error while restoring state for group %q: %s", g.Name, err)
}
}
wg.Add(1)
go func(group Group) {
w.run(ctx, group, *evaluationInterval)
wg.Done()
}(g)
}
go httpserver.Serve(*httpListenAddr, (&requestHandler{groups: groups}).handler)
sig := procutil.WaitForSigterm()
logger.Infof("service received signal %s", sig)
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop the webservice: %s", err)
}
w.stop()
cancel()
if w.rw != nil {
err := w.rw.Close()
if err != nil {
logger.Fatalf("cannot stop the remotewrite: %s", err)
}
}
wg.Wait()
}
type watchdog struct {
storage *datasource.VMStorage
storage *datasource.VMStorage
alertProvider notifier.Notifier
rw *remotewrite.Client
}
func (w *watchdog) run(a config.Group) {
// Metrics updated by the watchdog evaluation loop.
var (
	// iterationTotal counts ticks of the evaluation loop.
	iterationTotal = metrics.NewCounter(`vmalert_iteration_total`)
	// iterationDuration tracks how long one pass over all rules of a group takes.
	iterationDuration = metrics.NewSummary(`vmalert_iteration_duration_seconds`)

	// execTotal counts rule executions; execErrors counts the failed ones.
	execTotal  = metrics.NewCounter(`vmalert_execution_total`)
	execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
	// execDuration tracks how long a single rule execution takes.
	execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)

	// NOTE(review): alertsFired is not updated anywhere in the visible code.
	alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
	// alertsSent counts alerts handed to the notifier; alertsSendErrors
	// counts failed notifier calls.
	alertsSent       = metrics.NewCounter(`vmalert_alerts_sent_total`)
	alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)

	// remoteWriteSent counts time series pushed to remote write;
	// remoteWriteErrors counts the pushes that failed.
	remoteWriteSent   = metrics.NewCounter(`vmalert_remotewrite_sent_total`)
	remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
)
// run evaluates all rules of the given group once per evaluationInterval and
// returns when ctx is cancelled.
//
// For every rule it:
//   - executes the rule against w.storage; a failing rule is logged, counted
//     and skipped for this tick;
//   - collects every alert that is not pending for delivery to the notifier;
//   - pushes the time series of every non-inactive alert to remote write,
//     when a remote write client is configured;
//   - sends the collected alerts to w.alertProvider. Nothing is sent when no
//     alerts were collected, so the notifier does not receive an empty
//     request for every rule on every tick.
func (w *watchdog) run(ctx context.Context, group Group, evaluationInterval time.Duration) {
	logger.Infof("watchdog for %s has been started", group.Name)
	t := time.NewTicker(evaluationInterval)
	defer t.Stop()
	for {
		select {
		case <-t.C:
			iterationTotal.Inc()
			iterationStart := time.Now()
			for _, rule := range group.Rules {
				execTotal.Inc()
				execStart := time.Now()
				err := rule.Exec(ctx, w.storage)
				execDuration.UpdateDuration(execStart)
				if err != nil {
					execErrors.Inc()
					logger.Errorf("failed to execute rule %q.%q: %s", group.Name, rule.Name, err)
					continue
				}
				var alertsToSend []notifier.Alert
				for _, a := range rule.alerts {
					if a.State != notifier.StatePending {
						alertsToSend = append(alertsToSend, *a)
					}
					if a.State == notifier.StateInactive || w.rw == nil {
						continue
					}
					tss := rule.AlertToTimeSeries(a, execStart)
					for _, ts := range tss {
						remoteWriteSent.Inc()
						if err := w.rw.Push(ts); err != nil {
							remoteWriteErrors.Inc()
							logger.Errorf("failed to push timeseries to remotewrite: %s", err)
						}
					}
				}
				if len(alertsToSend) == 0 {
					// Nothing to notify about for this rule.
					continue
				}
				alertsSent.Add(len(alertsToSend))
				if err := w.alertProvider.Send(alertsToSend); err != nil {
					alertsSendErrors.Inc()
					logger.Errorf("failed to send alert for rule %q.%q: %s", group.Name, rule.Name, err)
				}
			}
			iterationDuration.UpdateDuration(iterationStart)
		case <-ctx.Done():
			logger.Infof("%s received stop signal", group.Name)
			return
		}
	}
}
func (w *watchdog) stop() {
panic("not implemented")
// getExternalURL returns the URL under which this instance is reachable.
//
// When externalURL is non-empty it is parsed and returned as is. Otherwise
// the URL is assembled from the scheme ("https" when isSecure is set, "http"
// otherwise), the OS hostname and the port taken from httpListenAddr.
//
// The port is everything after the LAST colon of httpListenAddr, so IPv6
// listen addresses such as "[::1]:8880" yield ":8880". A listen address
// without a port (including a bare bracketed IPv6 address) yields no port.
//
// An error is returned when the hostname cannot be determined or the
// resulting URL cannot be parsed.
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
	if externalURL != "" {
		return url.Parse(externalURL)
	}
	hname, err := os.Hostname()
	if err != nil {
		return nil, err
	}
	port := ""
	// A "]" after the last colon means the colon belongs to a bracketed IPv6
	// host and no port was given.
	if i := strings.LastIndex(httpListenAddr, ":"); i >= 0 && !strings.Contains(httpListenAddr[i:], "]") {
		port = httpListenAddr[i:]
	}
	schema := "http://"
	if isSecure {
		schema = "https://"
	}
	return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
}
// checkFlags verifies that the required command-line flags are set.
//
// The notifier URL is checked first, then the datasource URL. For each one
// that is empty, the flag defaults are printed and a fatal message is logged.
func checkFlags() {
	if notifier := *notifierURL; len(notifier) == 0 {
		flag.PrintDefaults()
		logger.Fatalf("notifier.url is empty")
	}
	if source := *datasourceURL; len(source) == 0 {
		flag.PrintDefaults()
		logger.Fatalf("datasource.url is empty")
	}
}

View File

@@ -0,0 +1,105 @@
package notifier
import (
"bytes"
"fmt"
"io"
"strings"
"text/template"
"time"
)
// Alert the triggered alert
// TODO: Looks like alert name isn't unique
type Alert struct {
	// Group is the name of the group the alert belongs to; it is passed to
	// the generator URL callback when the alert is sent.
	Group string
	// Name is the alert name; it is sent as the "alertname" label.
	Name string
	// Labels are extra labels attached to the alert. They are also exposed
	// to annotation templates as $labels.
	Labels map[string]string
	// Annotations are free-form key/value pairs sent along with the alert.
	Annotations map[string]string
	// State is the current state of the alert.
	State AlertState

	// Start is sent to the notifier as "startsAt".
	Start time.Time
	// End is sent to the notifier as "endsAt" unless it is the zero time.
	End time.Time
	// Value is exposed to annotation templates as $value.
	Value float64
	// ID is passed, formatted as a decimal string, to the generator URL
	// callback when the alert is sent.
	ID uint64
}
// AlertState describes the lifecycle state of an Alert.
type AlertState int

// The possible alert states. Their numeric values follow declaration order,
// starting at zero, so the zero value of AlertState is StateInactive.
const (
	// StateInactive marks an alert that is neither pending nor firing.
	StateInactive AlertState = iota
	// StatePending marks an alert that has been active for less than the
	// configured threshold duration.
	StatePending
	// StateFiring marks an alert that has been active for longer than the
	// configured threshold duration.
	StateFiring
)

// String returns the lower-case name of the state. Any value other than
// StatePending or StateFiring is reported as "inactive".
func (as AlertState) String() string {
	if as == StatePending {
		return "pending"
	}
	if as == StateFiring {
		return "firing"
	}
	return "inactive"
}
// alertTplData is the data object annotation templates are executed with.
type alertTplData struct {
	// Labels is exposed to templates as .Labels (and $labels via tplHeader).
	Labels map[string]string
	// Value is exposed to templates as .Value (and $value via tplHeader).
	Value float64
}

// tplHeader is prepended to every annotation template so that templates can
// refer to the $value and $labels variables.
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}`
// ExecTemplate renders the given annotation templates using the alert's
// labels and value as template data.
//
// It returns the rendered annotations keyed like the input, together with
// any error reported while rendering them.
func (a *Alert) ExecTemplate(annotations map[string]string) (map[string]string, error) {
	return templateAnnotations(annotations, tplHeader, alertTplData{
		Labels: a.Labels,
		Value:  a.Value,
	})
}
// ValidateTemplates validate annotations for possible template error, uses empty data for template population
func ValidateTemplates(annotations map[string]string) error {
_, err := templateAnnotations(annotations, tplHeader, alertTplData{
Labels: map[string]string{},
Value: 0,
})
return err
}
// templateAnnotations renders every annotation as a template with header
// prepended to its text, using data as the template input.
//
// The returned map has one entry per input annotation. An annotation that
// fails to render keeps its original, unrendered text, and its failure is
// recorded. All recorded failures are returned as a single error; the error
// is nil when every annotation rendered successfully.
func templateAnnotations(annotations map[string]string, header string, data alertTplData) (map[string]string, error) {
	var (
		source   strings.Builder
		output   bytes.Buffer
		failures errGroup
	)
	rendered := make(map[string]string, len(annotations))
	for name, tpl := range annotations {
		output.Reset()
		source.Reset()
		source.Grow(len(header) + len(tpl))
		source.WriteString(header)
		source.WriteString(tpl)
		err := templateAnnotation(&output, source.String(), data)
		if err == nil {
			rendered[name] = output.String()
			continue
		}
		// Fall back to the raw text for annotations that cannot be rendered.
		rendered[name] = tpl
		failures.errs = append(failures.errs, fmt.Sprintf("key %s, template %s:%s", name, tpl, err))
	}
	return rendered, failures.err()
}
// templateAnnotation parses text as a template and writes the result of
// executing it with data into dst.
//
// The template has access to the helper functions in tmplFunc and is parsed
// with the "missingkey=zero" option. Parse and execution failures are
// returned wrapped with a message telling the two apart.
func templateAnnotation(dst io.Writer, text string, data alertTplData) error {
	base := template.New("").Funcs(tmplFunc).Option("missingkey=zero")
	parsed, err := base.Parse(text)
	if err != nil {
		return fmt.Errorf("error parsing annotation:%w", err)
	}
	if execErr := parsed.Execute(dst, data); execErr != nil {
		return fmt.Errorf("error evaluating annotation template:%w", execErr)
	}
	return nil
}

View File

@@ -0,0 +1,65 @@
package notifier
import (
"fmt"
"testing"
)
// TestAlert_ExecTemplate checks that Alert.ExecTemplate renders annotation
// templates with the alert's labels and value. Each case runs as a subtest
// named after its index.
func TestAlert_ExecTemplate(t *testing.T) {
	type testCase struct {
		alert       *Alert
		annotations map[string]string
		expTpl      map[string]string
	}
	cases := []testCase{
		// Empty alert, nothing to render.
		{
			alert:       &Alert{},
			annotations: map[string]string{},
			expTpl:      map[string]string{},
		},
		// Alert with data but no annotations.
		{
			alert: &Alert{
				Value: 1e4,
				Labels: map[string]string{
					"instance": "localhost",
				},
			},
			annotations: map[string]string{},
			expTpl:      map[string]string{},
		},
		// Annotations referring to $labels and $value.
		{
			alert: &Alert{
				Value: 1e4,
				Labels: map[string]string{
					"job":      "staging",
					"instance": "localhost",
				},
			},
			annotations: map[string]string{
				"summary":     "Too high connection number for {{$labels.instance}} for job {{$labels.job}}",
				"description": "It is {{ $value }} connections for {{$labels.instance}}",
			},
			expTpl: map[string]string{
				"summary":     "Too high connection number for localhost for job staging",
				"description": "It is 10000 connections for localhost",
			},
		},
	}
	for idx := range cases {
		tc := cases[idx]
		t.Run(fmt.Sprintf("%d", idx), func(t *testing.T) {
			rendered, err := tc.alert.ExecTemplate(tc.annotations)
			if err != nil {
				t.Fatal(err)
			}
			if len(rendered) != len(tc.expTpl) {
				t.Fatalf("expected %d elements; got %d", len(tc.expTpl), len(rendered))
			}
			for key, want := range tc.expTpl {
				if have := rendered[key]; have != want {
					t.Fatalf("expected %q=%q; got %q=%q", key, want, key, have)
				}
			}
		})
	}
}

View File

@@ -0,0 +1,51 @@
package notifier
import (
"bytes"
"fmt"
"io/ioutil"
"net/http"
"strings"
)
// AlertManager represents integration provider with Prometheus alert manager
// https://github.com/prometheus/alertmanager
type AlertManager struct {
	// alertURL is the full URL alerts are POSTed to.
	alertURL string
	// argFunc builds the generator URL reported for each sent alert.
	argFunc AlertURLGenerator
	// client is the HTTP client used to deliver alerts.
	client *http.Client
}
// Send delivers the given alerts to Alertmanager in a single JSON POST
// request.
//
// It returns the transport error when the request fails, and an error
// describing the status code and response body when Alertmanager answers
// with anything other than 200 OK.
func (am *AlertManager) Send(alerts []Alert) error {
	var payload bytes.Buffer
	writeamRequest(&payload, alerts, am.argFunc)
	resp, err := am.client.Post(am.alertURL, "application/json", &payload)
	if err != nil {
		return err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode == http.StatusOK {
		return nil
	}
	// Include the response body in the error to ease troubleshooting.
	body, readErr := ioutil.ReadAll(resp.Body)
	if readErr != nil {
		return fmt.Errorf("failed to read response from %q: %s", am.alertURL, readErr)
	}
	return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.alertURL, string(body))
}
// AlertURLGenerator returns the URL of a single alert, identified by the
// name of its group and the alert's id. The result is sent to Alertmanager
// as the alert's "generatorURL".
type AlertURLGenerator func(group, id string) string

// alertManagerPath is the API path alerts are posted to; it is appended to
// the Alertmanager base URL.
const alertManagerPath = "/api/v2/alerts"
// NewAlertManager builds an AlertManager that posts alerts to the
// Alertmanager instance at alertManagerURL through the HTTP client c.
//
// A single trailing "/" is removed from alertManagerURL before
// alertManagerPath is appended. fn is used to build the generator URL of
// every sent alert.
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, c *http.Client) *AlertManager {
	endpoint := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
	am := AlertManager{alertURL: endpoint, argFunc: fn, client: c}
	return &am
}

View File

@@ -0,0 +1,34 @@
{% import (
"strconv"
"time"
) %}
{% stripspace %}
{% func amRequest(alerts []Alert, generatorURL func(string, string) string) %}
[
{% for i, alert := range alerts %}
{
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
"generatorURL": {%q= generatorURL(alert.Group, strconv.FormatUint(alert.ID, 10)) %},
{% if !alert.End.IsZero() %}
"endsAt":{%q= alert.End.Format(time.RFC3339Nano) %},
{% endif %}
"labels": {
"alertname":{%q= alert.Name %}
{% for k,v := range alert.Labels %}
,{%q= k %}:{%q= v %}
{% endfor %}
},
"annotations": {
{% code c := len(alert.Annotations) %}
{% for k,v := range alert.Annotations %}
{% code c = c-1 %}
{%q= k %}:{%q= v %}{% if c > 0 %},{% endif %}
{% endfor %}
}
}
{% if i != len(alerts)-1 %},{% endif %}
{% endfor %}
]
{% endfunc %}
{% endstripspace %}

View File

@@ -0,0 +1,131 @@
// Code generated by qtc from "alertmanager_request.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.
//line app/vmalert/notifier/alertmanager_request.qtpl:1
package notifier
//line app/vmalert/notifier/alertmanager_request.qtpl:1
import (
"strconv"
"time"
)
//line app/vmalert/notifier/alertmanager_request.qtpl:7
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vmalert/notifier/alertmanager_request.qtpl:7
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
//line app/vmalert/notifier/alertmanager_request.qtpl:7
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
//line app/vmalert/notifier/alertmanager_request.qtpl:7
qw422016.N().S(`[`)
//line app/vmalert/notifier/alertmanager_request.qtpl:9
for i, alert := range alerts {
//line app/vmalert/notifier/alertmanager_request.qtpl:9
qw422016.N().S(`{"startsAt":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:11
qw422016.N().Q(alert.Start.Format(time.RFC3339Nano))
//line app/vmalert/notifier/alertmanager_request.qtpl:11
qw422016.N().S(`,"generatorURL":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:12
qw422016.N().Q(generatorURL(alert.Group, strconv.FormatUint(alert.ID, 10)))
//line app/vmalert/notifier/alertmanager_request.qtpl:12
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:13
if !alert.End.IsZero() {
//line app/vmalert/notifier/alertmanager_request.qtpl:13
qw422016.N().S(`"endsAt":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:14
qw422016.N().Q(alert.End.Format(time.RFC3339Nano))
//line app/vmalert/notifier/alertmanager_request.qtpl:14
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:15
}
//line app/vmalert/notifier/alertmanager_request.qtpl:15
qw422016.N().S(`"labels": {"alertname":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:17
qw422016.N().Q(alert.Name)
//line app/vmalert/notifier/alertmanager_request.qtpl:18
for k, v := range alert.Labels {
//line app/vmalert/notifier/alertmanager_request.qtpl:18
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:19
qw422016.N().Q(k)
//line app/vmalert/notifier/alertmanager_request.qtpl:19
qw422016.N().S(`:`)
//line app/vmalert/notifier/alertmanager_request.qtpl:19
qw422016.N().Q(v)
//line app/vmalert/notifier/alertmanager_request.qtpl:20
}
//line app/vmalert/notifier/alertmanager_request.qtpl:20
qw422016.N().S(`},"annotations": {`)
//line app/vmalert/notifier/alertmanager_request.qtpl:23
c := len(alert.Annotations)
//line app/vmalert/notifier/alertmanager_request.qtpl:24
for k, v := range alert.Annotations {
//line app/vmalert/notifier/alertmanager_request.qtpl:25
c = c - 1
//line app/vmalert/notifier/alertmanager_request.qtpl:26
qw422016.N().Q(k)
//line app/vmalert/notifier/alertmanager_request.qtpl:26
qw422016.N().S(`:`)
//line app/vmalert/notifier/alertmanager_request.qtpl:26
qw422016.N().Q(v)
//line app/vmalert/notifier/alertmanager_request.qtpl:26
if c > 0 {
//line app/vmalert/notifier/alertmanager_request.qtpl:26
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:26
}
//line app/vmalert/notifier/alertmanager_request.qtpl:27
}
//line app/vmalert/notifier/alertmanager_request.qtpl:27
qw422016.N().S(`}}`)
//line app/vmalert/notifier/alertmanager_request.qtpl:30
if i != len(alerts)-1 {
//line app/vmalert/notifier/alertmanager_request.qtpl:30
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:30
}
//line app/vmalert/notifier/alertmanager_request.qtpl:31
}
//line app/vmalert/notifier/alertmanager_request.qtpl:31
qw422016.N().S(`]`)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
}
//line app/vmalert/notifier/alertmanager_request.qtpl:33
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(string, string) string) {
//line app/vmalert/notifier/alertmanager_request.qtpl:33
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
streamamRequest(qw422016, alerts, generatorURL)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
qt422016.ReleaseWriter(qw422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
}
//line app/vmalert/notifier/alertmanager_request.qtpl:33
func amRequest(alerts []Alert, generatorURL func(string, string) string) string {
//line app/vmalert/notifier/alertmanager_request.qtpl:33
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/notifier/alertmanager_request.qtpl:33
writeamRequest(qb422016, alerts, generatorURL)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
qs422016 := string(qb422016.B)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:33
return qs422016
//line app/vmalert/notifier/alertmanager_request.qtpl:33
}

View File

@@ -1,4 +1,4 @@
package provider
package notifier
import (
"encoding/json"
@@ -6,8 +6,6 @@ import (
"net/http/httptest"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
)
func TestAlertManager_Send(t *testing.T) {
@@ -16,7 +14,7 @@ func TestAlertManager_Send(t *testing.T) {
t.Errorf("should not be called")
})
c := -1
mux.HandleFunc(alertsPath, func(w http.ResponseWriter, r *http.Request) {
mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) {
c++
if r.Method != http.MethodPost {
t.Errorf("expected POST method got %s", r.Method)
@@ -42,7 +40,7 @@ func TestAlertManager_Send(t *testing.T) {
if len(a) != 1 {
t.Errorf("expected 1 alert in array got %d", len(a))
}
if a[0].GeneratorURL != "alert0" {
if a[0].GeneratorURL != "group0" {
t.Errorf("exptected alert0 as generatorURL got %s", a[0].GeneratorURL)
}
if a[0].Labels["alertname"] != "alert0" {
@@ -58,20 +56,22 @@ func TestAlertManager_Send(t *testing.T) {
})
srv := httptest.NewServer(mux)
defer srv.Close()
am := NewAlertManager(srv.URL, func(name string) string {
return name
am := NewAlertManager(srv.URL, func(group, name string) string {
return group + name
}, srv.Client())
if err := am.Send(&config.Alert{}); err == nil {
if err := am.Send([]Alert{{}, {}}); err == nil {
t.Error("expected connection error got nil")
}
if err := am.Send(&config.Alert{}); err == nil {
if err := am.Send([]Alert{}); err == nil {
t.Error("expected wrong http code error got nil")
}
if err := am.Send(&config.Alert{
Name: "alert0",
Start: time.Now().UTC(),
End: time.Now().UTC(),
}); err != nil {
if err := am.Send([]Alert{{
Group: "group",
Name: "alert0",
Start: time.Now().UTC(),
End: time.Now().UTC(),
Annotations: map[string]string{"a": "b", "c": "d", "e": "f"},
}}); err != nil {
t.Errorf("unexpected error %s", err)
}
if c != 2 {

View File

@@ -0,0 +1,6 @@
package notifier
// Notifier is common interface for alert manager provider
type Notifier interface {
	// Send delivers the given batch of alerts and returns an error when the
	// delivery fails.
	Send(alerts []Alert) error
}

View File

@@ -0,0 +1,171 @@
// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package notifier
import (
"fmt"
html_template "html/template"
"math"
"net/url"
"regexp"
"strings"
text_template "text/template"
"time"
)
// tmplFunc holds the helper functions made available to annotation
// templates. It stays nil until InitTemplateFunc is called.
var tmplFunc text_template.FuncMap

// InitTemplateFunc initializes the package-level set of template helper
// functions. It returns nothing; the helpers are stored in tmplFunc.
//
// externalURL backs the "pathPrefix" and "externalURL" helpers, which
// dereference it when invoked, so it must not be nil if those helpers are
// used.
func InitTemplateFunc(externalURL *url.URL) {
	tmplFunc = text_template.FuncMap{
		// args packs its arguments into a map keyed "arg0", "arg1", ...
		"args": func(args ...interface{}) map[string]interface{} {
			result := make(map[string]interface{})
			for i, a := range args {
				result[fmt.Sprintf("arg%d", i)] = a
			}
			return result
		},
		// reReplaceAll replaces every match of pattern in text with repl.
		// The pattern is compiled on every call and MustCompile panics on
		// an invalid pattern.
		"reReplaceAll": func(pattern, repl, text string) string {
			re := regexp.MustCompile(pattern)
			return re.ReplaceAllString(text, repl)
		},
		// safeHtml converts text to html_template.HTML without modifying it.
		"safeHtml": func(text string) html_template.HTML {
			return html_template.HTML(text)
		},
		"match":   regexp.MatchString,
		"title":   strings.Title,
		"toUpper": strings.ToUpper,
		"toLower": strings.ToLower,
		// humanize formats v with 4 significant digits and a decimal
		// metric prefix: k..Y for magnitudes >= 1, m..y for magnitudes < 1.
		"humanize": func(v float64) string {
			// Zero, NaN and Inf have no meaningful prefix.
			if v == 0 || math.IsNaN(v) || math.IsInf(v, 0) {
				return fmt.Sprintf("%.4g", v)
			}
			if math.Abs(v) >= 1 {
				prefix := ""
				for _, p := range []string{"k", "M", "G", "T", "P", "E", "Z", "Y"} {
					if math.Abs(v) < 1000 {
						break
					}
					prefix = p
					v /= 1000
				}
				return fmt.Sprintf("%.4g%s", v, prefix)
			}
			prefix := ""
			for _, p := range []string{"m", "u", "n", "p", "f", "a", "z", "y"} {
				if math.Abs(v) >= 1 {
					break
				}
				prefix = p
				v *= 1000
			}
			return fmt.Sprintf("%.4g%s", v, prefix)
		},
		// humanize1024 formats v with 4 significant digits and a binary
		// prefix (ki..Yi). Values with magnitude <= 1 get no prefix.
		"humanize1024": func(v float64) string {
			if math.Abs(v) <= 1 || math.IsNaN(v) || math.IsInf(v, 0) {
				return fmt.Sprintf("%.4g", v)
			}
			prefix := ""
			for _, p := range []string{"ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"} {
				if math.Abs(v) < 1024 {
					break
				}
				prefix = p
				v /= 1024
			}
			return fmt.Sprintf("%.4g%s", v, prefix)
		},
		// humanizeDuration formats v, a number of seconds, either as
		// "Nd Nh Nm Ns" style text (magnitude >= 1) or as sub-second units
		// with a metric prefix (magnitude < 1).
		"humanizeDuration": func(v float64) string {
			if math.IsNaN(v) || math.IsInf(v, 0) {
				return fmt.Sprintf("%.4g", v)
			}
			if v == 0 {
				return fmt.Sprintf("%.4gs", v)
			}
			if math.Abs(v) >= 1 {
				// Work on the absolute value and re-attach the sign.
				sign := ""
				if v < 0 {
					sign = "-"
					v = -v
				}
				seconds := int64(v) % 60
				minutes := (int64(v) / 60) % 60
				hours := (int64(v) / 60 / 60) % 24
				days := int64(v) / 60 / 60 / 24
				// For days to minutes, we display seconds as an integer.
				if days != 0 {
					return fmt.Sprintf("%s%dd %dh %dm %ds", sign, days, hours, minutes, seconds)
				}
				if hours != 0 {
					return fmt.Sprintf("%s%dh %dm %ds", sign, hours, minutes, seconds)
				}
				if minutes != 0 {
					return fmt.Sprintf("%s%dm %ds", sign, minutes, seconds)
				}
				// For seconds, we display 4 significant digits.
				return fmt.Sprintf("%s%.4gs", sign, v)
			}
			prefix := ""
			for _, p := range []string{"m", "u", "n", "p", "f", "a", "z", "y"} {
				if math.Abs(v) >= 1 {
					break
				}
				prefix = p
				v *= 1000
			}
			return fmt.Sprintf("%.4g%ss", v, prefix)
		},
		// humanizePercentage formats the ratio v as a percentage with
		// 4 significant digits.
		"humanizePercentage": func(v float64) string {
			return fmt.Sprintf("%.4g%%", v*100)
		},
		// humanizeTimestamp formats v, a Unix timestamp in seconds, as a
		// UTC time.
		"humanizeTimestamp": func(v float64) string {
			if math.IsNaN(v) || math.IsInf(v, 0) {
				return fmt.Sprintf("%.4g", v)
			}
			t := TimeFromUnixNano(int64(v * 1e9)).Time().UTC()
			return fmt.Sprint(t)
		},
		// pathPrefix returns the path component of the external URL.
		"pathPrefix": func() string {
			return externalURL.Path
		},
		// externalURL returns the external URL as a string.
		"externalURL": func() string {
			return externalURL.String()
		},
	}
}
// Tick resolution shared by Time and its conversion helpers.
const (
	// minimumTick is the smallest time step a Time value can represent.
	minimumTick = time.Millisecond
	// nanosPerTick is the number of nanoseconds in one minimumTick.
	nanosPerTick = int64(minimumTick / time.Nanosecond)
	// second is the number of ticks that make up one second.
	second = int64(time.Second / minimumTick)
)

// Time is a timestamp expressed in milliseconds since the Unix epoch
// (1970-01-01 00:00 UTC).
type Time int64

// TimeFromUnixNano converts t, a Unix timestamp in nanoseconds, to a Time.
// Precision finer than one tick is dropped.
func TimeFromUnixNano(t int64) Time {
	ticks := t / nanosPerTick
	return Time(ticks)
}

// Time converts t to the equivalent time.Time value.
func (t Time) Time() time.Time {
	ticks := int64(t)
	wholeSeconds, restTicks := ticks/second, ticks%second
	return time.Unix(wholeSeconds, restTicks*nanosPerTick)
}

View File

@@ -0,0 +1,21 @@
package notifier
import (
"fmt"
"strings"
)
// errGroup accumulates error messages so they can be reported as one error.
type errGroup struct {
	// errs holds the collected messages in the order they were added.
	errs []string
}

// err returns the group as an error when it holds at least one message.
// It returns a plain nil for a nil or empty group, so callers can compare
// the result against nil safely.
func (eg *errGroup) err() error {
	if eg != nil && len(eg.errs) > 0 {
		return eg
	}
	return nil
}

// Error implements the error interface. The collected messages are joined
// with newlines and prefixed with "errors:".
func (eg *errGroup) Error() string {
	joined := strings.Join(eg.errs, "\n")
	return fmt.Sprintf("errors:%s", joined)
}

View File

@@ -1,26 +0,0 @@
{% import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
) %}
{% stripspace %}
{% func amRequest(alert *config.Alert, generatorURL string) %}
{
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
"generatorURL": {%q= generatorURL %},
{% if !alert.End.IsZero() %}
"endsAt":{%q= alert.End.Format(time.RFC3339Nano) %},
{% endif %}
"labels": {
"alertname":{%q= alert.Name %}
{% for k,v := range alert.Labels %}
,{%q= k %}:{%q= v %}
{% endfor %}
},
"annotations": {
"summary": {%q= alert.Annotations.Summary %},
"description": {%q= alert.Annotations.Description %}
}
}
{% endfunc %}
{% endstripspace %}

View File

@@ -1,101 +0,0 @@
// Code generated by qtc from "alert_manager_request.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.
//line app/vmalert/provider/alert_manager_request.qtpl:1
package provider
//line app/vmalert/provider/alert_manager_request.qtpl:1
import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"time"
)
//line app/vmalert/provider/alert_manager_request.qtpl:7
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vmalert/provider/alert_manager_request.qtpl:7
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
// streamamRequest writes one alert to qw422016 as a JSON object with the keys
// "startsAt", "generatorURL", "endsAt" (only when alert.End is non-zero),
// "labels" (always beginning with "alertname") and "annotations"
// ("summary" and "description"). String values are emitted via Q(), i.e. quoted.
//
// The function appears to be generated by quicktemplate from
// alert_manager_request.qtpl (see the //line directives); change the template
// rather than this code.
// NOTE(review): alert.Labels is ranged over directly, so if it is a map the
// label order in the output is unspecified, and a label named "alertname"
// would produce a duplicate JSON key - confirm against config.Alert.
//line app/vmalert/provider/alert_manager_request.qtpl:7
func streamamRequest(qw422016 *qt422016.Writer, alert *config.Alert, generatorURL string) {
//line app/vmalert/provider/alert_manager_request.qtpl:7
qw422016.N().S(`{"startsAt":`)
//line app/vmalert/provider/alert_manager_request.qtpl:9
qw422016.N().Q(alert.Start.Format(time.RFC3339Nano))
//line app/vmalert/provider/alert_manager_request.qtpl:9
qw422016.N().S(`,"generatorURL":`)
//line app/vmalert/provider/alert_manager_request.qtpl:10
qw422016.N().Q(generatorURL)
//line app/vmalert/provider/alert_manager_request.qtpl:10
qw422016.N().S(`,`)
//line app/vmalert/provider/alert_manager_request.qtpl:11
if !alert.End.IsZero() {
//line app/vmalert/provider/alert_manager_request.qtpl:11
qw422016.N().S(`"endsAt":`)
//line app/vmalert/provider/alert_manager_request.qtpl:12
qw422016.N().Q(alert.End.Format(time.RFC3339Nano))
//line app/vmalert/provider/alert_manager_request.qtpl:12
qw422016.N().S(`,`)
//line app/vmalert/provider/alert_manager_request.qtpl:13
}
//line app/vmalert/provider/alert_manager_request.qtpl:13
qw422016.N().S(`"labels": {"alertname":`)
//line app/vmalert/provider/alert_manager_request.qtpl:15
qw422016.N().Q(alert.Name)
//line app/vmalert/provider/alert_manager_request.qtpl:16
for k, v := range alert.Labels {
//line app/vmalert/provider/alert_manager_request.qtpl:16
qw422016.N().S(`,`)
//line app/vmalert/provider/alert_manager_request.qtpl:17
qw422016.N().Q(k)
//line app/vmalert/provider/alert_manager_request.qtpl:17
qw422016.N().S(`:`)
//line app/vmalert/provider/alert_manager_request.qtpl:17
qw422016.N().Q(v)
//line app/vmalert/provider/alert_manager_request.qtpl:18
}
//line app/vmalert/provider/alert_manager_request.qtpl:18
qw422016.N().S(`},"annotations": {"summary":`)
//line app/vmalert/provider/alert_manager_request.qtpl:21
qw422016.N().Q(alert.Annotations.Summary)
//line app/vmalert/provider/alert_manager_request.qtpl:21
qw422016.N().S(`,"description":`)
//line app/vmalert/provider/alert_manager_request.qtpl:22
qw422016.N().Q(alert.Annotations.Description)
//line app/vmalert/provider/alert_manager_request.qtpl:22
qw422016.N().S(`}}`)
//line app/vmalert/provider/alert_manager_request.qtpl:25
}
// writeamRequest renders the alert JSON object into qq422016. It acquires a
// quicktemplate writer around qq422016, delegates to streamamRequest and
// releases the writer afterwards.
//line app/vmalert/provider/alert_manager_request.qtpl:25
func writeamRequest(qq422016 qtio422016.Writer, alert *config.Alert, generatorURL string) {
//line app/vmalert/provider/alert_manager_request.qtpl:25
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/provider/alert_manager_request.qtpl:25
streamamRequest(qw422016, alert, generatorURL)
//line app/vmalert/provider/alert_manager_request.qtpl:25
qt422016.ReleaseWriter(qw422016)
//line app/vmalert/provider/alert_manager_request.qtpl:25
}
// amRequest renders the alert JSON object and returns it as a string.
// The output is built in a byte buffer acquired from quicktemplate, copied
// into a string and the buffer is released before returning.
//line app/vmalert/provider/alert_manager_request.qtpl:25
func amRequest(alert *config.Alert, generatorURL string) string {
//line app/vmalert/provider/alert_manager_request.qtpl:25
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/provider/alert_manager_request.qtpl:25
writeamRequest(qb422016, alert, generatorURL)
//line app/vmalert/provider/alert_manager_request.qtpl:25
qs422016 := string(qb422016.B)
//line app/vmalert/provider/alert_manager_request.qtpl:25
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmalert/provider/alert_manager_request.qtpl:25
return qs422016
//line app/vmalert/provider/alert_manager_request.qtpl:25
}

View File

@@ -1,66 +0,0 @@
package provider
import (
"bytes"
"fmt"
"io"
"net/http"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// alertsPath is the URL path appended to the alertmanager base URL
// by NewAlertManager to form the address alerts are posted to.
const alertsPath = "/api/v2/alerts"
// pool holds reusable *bytes.Buffer values used for building request bodies.
var pool = sync.Pool{
	New: func() interface{} {
		return new(bytes.Buffer)
	},
}
// AlertManager represents integration provider with Prometheus alert manager.
// Alerts are delivered by Send via an HTTP POST to alertURL.
type AlertManager struct {
// alertURL is the full endpoint URL alerts are posted to;
// NewAlertManager builds it from the base URL and alertsPath.
alertURL string
// argFunc produces the generatorURL for an alert from the alert name.
argFunc AlertURLGenerator
// client is the HTTP client used for posting alerts.
client *http.Client
}
// AlertURLGenerator returns the URL of a single alert for the given alert name.
// AlertManager.Send passes the result as the alert's generatorURL.
type AlertURLGenerator func(name string) string
// NewAlertManager builds an AlertManager that posts alerts to
// alertManagerURL (a single trailing slash is trimmed) followed by alertsPath.
// fn generates per-alert URLs and c is the HTTP client used for delivery.
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, c *http.Client) *AlertManager {
	baseURL := strings.TrimSuffix(alertManagerURL, "/")
	am := &AlertManager{
		argFunc: fn,
		client:  c,
	}
	am.alertURL = baseURL + alertsPath
	return am
}
// Delimiters used by Send to wrap a single alert object into a JSON array.
const (
	jsonArrayOpen  byte = '['
	jsonArrayClose byte = ']'
)
// Send posts the given alert (firing or resolved) to alertmanager.
//
// The alert is rendered as a single-element JSON array into a pooled buffer
// and sent with content type application/json. Any status other than 200 OK
// is reported as an error that carries the response body.
func (am *AlertManager) Send(alert *config.Alert) error {
	buf := pool.Get().(*bytes.Buffer)
	defer pool.Put(buf)
	buf.Reset()

	// Payload layout: '[' + alert object + ']'.
	buf.WriteByte(jsonArrayOpen)
	writeamRequest(buf, alert, am.argFunc(alert.Name))
	buf.WriteByte(jsonArrayClose)

	resp, err := am.client.Post(am.alertURL, "application/json", buf)
	if err != nil {
		return err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode == http.StatusOK {
		return nil
	}

	// Reuse the request buffer for the error body to avoid another allocation.
	buf.Reset()
	if _, copyErr := io.Copy(buf, resp.Body); copyErr != nil {
		logger.Errorf("unable to copy error response body to buffer %s", copyErr)
	}
	return fmt.Errorf("invalid response from alertmanager %s", buf)
}

View File

@@ -1,8 +0,0 @@
package provider
import "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
// AlertProvider is common interface for alert manager provider.
// Implementations deliver a single alert to an external system.
// NOTE(review): Send here takes config.Alert by value, while
// AlertManager.Send takes *config.Alert, so *AlertManager does not appear
// to satisfy this interface - confirm which signature is intended.
type AlertProvider interface {
// Send delivers the given alert and returns an error on failure.
Send(rule config.Alert) error
}

View File

@@ -0,0 +1,188 @@
package remotewrite
import (
"bytes"
"context"
"fmt"
"io/ioutil"
"net/http"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/golang/snappy"
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
// Timeseries are queued by Push and flushed in batches by a background
// goroutine started in NewClient.
type Client struct {
// addr is the full remote write URL (Config.Addr + writePath).
addr string
// c is the underlying HTTP client; its Timeout is Config.WriteTimeout.
c *http.Client
// input is the buffered queue between Push and the flushing goroutine.
input chan prompbmarshal.TimeSeries
// baUser and baPass are basic auth credentials; they are applied
// only when baPass is non-empty.
baUser, baPass string
// flushInterval is the period of the time-based flush.
flushInterval time.Duration
// maxBatchSize is the number of queued timeseries that triggers a flush.
maxBatchSize int
// maxQueueSize is reported in the queue-full error returned by Push.
maxQueueSize int
// wg tracks the background goroutine; Close waits on it.
wg sync.WaitGroup
// doneCh is closed by Close to signal shutdown.
doneCh chan struct{}
}
// Config is config for remote write.
// Zero values of the numeric and duration fields are replaced
// with defaults by NewClient.
type Config struct {
// Addr of remote storage.
// NewClient trims a trailing slash and appends writePath. Must be non-empty.
Addr string
// BasicAuthUser and BasicAuthPass are optional basic auth credentials.
// They are sent only when BasicAuthPass is non-empty.
BasicAuthUser string
BasicAuthPass string
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// WriteTimeout defines timeout for HTTP write request
// to remote storage
WriteTimeout time.Duration
}
// Defaults applied by NewClient when the corresponding Config field is zero.
const (
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 100
defaultFlushInterval = 5 * time.Second
defaultWriteTimeout = 30 * time.Second
)
// writePath is the URL path appended to Config.Addr by NewClient.
const writePath = "/api/v1/write"
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
//
// cfg.Addr must be non-empty, otherwise an error is returned. Zero values of
// MaxBatchSize, MaxQueueSize, FlushInterval and WriteTimeout are replaced with
// the package defaults. The background flushing goroutine is started before
// NewClient returns and stops when ctx is cancelled or Close is called.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
	if cfg.Addr == "" {
		return nil, fmt.Errorf("config.Addr can't be empty")
	}
	if cfg.MaxBatchSize == 0 {
		cfg.MaxBatchSize = defaultMaxBatchSize
	}
	if cfg.MaxQueueSize == 0 {
		cfg.MaxQueueSize = defaultMaxQueueSize
	}
	if cfg.FlushInterval == 0 {
		cfg.FlushInterval = defaultFlushInterval
	}
	if cfg.WriteTimeout == 0 {
		cfg.WriteTimeout = defaultWriteTimeout
	}
	c := &Client{
		c: &http.Client{
			Timeout: cfg.WriteTimeout,
		},
		addr:          strings.TrimSuffix(cfg.Addr, "/") + writePath,
		baUser:        cfg.BasicAuthUser,
		baPass:        cfg.BasicAuthPass,
		flushInterval: cfg.FlushInterval,
		maxBatchSize:  cfg.MaxBatchSize,
		// maxQueueSize must be stored as well: Push reports it
		// in the "queue is full" error.
		maxQueueSize: cfg.MaxQueueSize,
		doneCh:       make(chan struct{}),
		input:        make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
	}
	c.run(ctx)
	return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns an error if client is stopped or if queue is full.
// Push never blocks.
//
// Push must not be called concurrently with Close: Close closes the input
// channel, and a send that races with it may still panic.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
	// Check for shutdown before attempting the send. Once Close has run,
	// c.input is closed as well, and a select with both cases ready may pick
	// the send, which panics on a closed channel.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is closed")
	default:
	}

	select {
	case <-c.doneCh:
		return fmt.Errorf("client is closed")
	case c.input <- s:
		return nil
	default:
		return fmt.Errorf("failed to push timeseries - queue is full (%d entries)",
			c.maxQueueSize)
	}
}
// Close stops the client and waits for all goroutines
// to exit. Queued timeseries are flushed by the background goroutine
// before it exits.
//
// Calling Close on an already closed client returns an error instead of
// panicking on a double channel close. Close is not safe for concurrent
// use with itself or with Push.
func (c *Client) Close() error {
	if c.doneCh == nil {
		return fmt.Errorf("client is already closed")
	}
	// doneCh is never reset to nil, so a closed client is detected
	// by probing the channel itself.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is already closed")
	default:
	}
	close(c.input)
	close(c.doneCh)
	c.wg.Wait()
	return nil
}
// run starts the background goroutine that collects timeseries from c.input
// into a batch and flushes the batch when it reaches c.maxBatchSize or when
// the flush ticker fires.
//
// The goroutine exits when c.doneCh or c.input is closed (both done by Close)
// or when ctx is cancelled. Before exiting it drains c.input and performs a
// last flush bounded by a 10 second timeout. Note that draining only finishes
// once c.input is closed, so after ctx cancellation the goroutine keeps
// waiting until Close is called.
func (c *Client) run(ctx context.Context) {
	ticker := time.NewTicker(c.flushInterval)
	wr := prompbmarshal.WriteRequest{}
	shutdown := func() {
		for ts := range c.input {
			wr.Timeseries = append(wr.Timeseries, ts)
		}
		// ctx may be already cancelled at this point, so the final
		// flush gets its own short-lived context.
		lastCtx, cancel := context.WithTimeout(context.Background(), time.Second*10)
		c.flush(lastCtx, wr)
		cancel()
	}
	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		defer ticker.Stop()
		for {
			select {
			case <-c.doneCh:
				shutdown()
				return
			case <-ctx.Done():
				shutdown()
				return
			case <-ticker.C:
				c.flush(ctx, wr)
				wr = prompbmarshal.WriteRequest{}
			case ts, ok := <-c.input:
				if !ok {
					// c.input has been closed by Close. Receiving from a
					// closed channel yields zero values, which must not be
					// queued as empty timeseries.
					shutdown()
					return
				}
				wr.Timeseries = append(wr.Timeseries, ts)
				if len(wr.Timeseries) >= c.maxBatchSize {
					c.flush(ctx, wr)
					wr = prompbmarshal.WriteRequest{}
				}
			}
		}
	}()
}
// flush sends the given write request to remote storage.
//
// The request is marshaled, snappy-compressed and POSTed to c.addr. Basic auth
// is attached only when a password is configured. Empty requests are skipped.
// All failures, including any response status other than 204 No Content, are
// logged and not returned, so the batch is dropped on error.
func (c *Client) flush(ctx context.Context, wr prompbmarshal.WriteRequest) {
	if len(wr.Timeseries) == 0 {
		return
	}

	payload, err := wr.Marshal()
	if err != nil {
		logger.Errorf("failed to marshal WriteRequest: %s", err)
		return
	}
	compressed := snappy.Encode(nil, payload)

	req, err := http.NewRequest("POST", c.addr, bytes.NewReader(compressed))
	if err != nil {
		logger.Errorf("failed to create new HTTP request: %s", err)
		return
	}
	if c.baPass != "" {
		req.SetBasicAuth(c.baUser, c.baPass)
	}

	resp, err := c.c.Do(req.WithContext(ctx))
	if err != nil {
		logger.Errorf("error getting response from %s:%s", req.URL, err)
		return
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode == http.StatusNoContent {
		return
	}
	// The body is read only for diagnostics; a read error is deliberately ignored.
	respBody, _ := ioutil.ReadAll(resp.Body)
	logger.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, respBody)
}

340
app/vmalert/rule.go Normal file
View File

@@ -0,0 +1,340 @@
package main
import (
"context"
"errors"
"fmt"
"hash/fnv"
"sort"
"strconv"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metricsql"
)
// Group is a named collection of alerting rules.
type Group struct {
// Name is the group name; it is copied into the Group field of created alerts.
Name string
// Rules are the rules that belong to the group.
Rules []*Rule
}
// Restore restores alerts state for all group rules with For > 0.
//
// Rules without a For duration keep no pending state and are skipped.
// The first rule that fails to restore aborts the process and its error
// is returned, wrapped with the rule name.
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
	for _, rule := range g.Rules {
		if rule.For == 0 {
			// Skip only this rule; the remaining rules
			// of the group must still be restored.
			continue
		}
		if err := rule.Restore(ctx, q, lookback); err != nil {
			return fmt.Errorf("error while restoring rule %q: %w", rule.Name, err)
		}
	}
	return nil
}
// Rule is basic alert entity.
// The exported fields are loaded from YAML; the unexported fields hold
// runtime state maintained by Exec and Restore.
type Rule struct {
// Name is the alert name.
Name string `yaml:"alert"`
// Expr is the query expression evaluated by Exec.
Expr string `yaml:"expr"`
// For is how long an alert stays pending before it turns firing.
For time.Duration `yaml:"for"`
// Labels are extra labels attached to alerts; values are templated.
Labels map[string]string `yaml:"labels"`
// Annotations are templated into alert annotations.
Annotations map[string]string `yaml:"annotations"`
// group is the group the rule belongs to; its Name is copied into alerts.
group Group
// guard status fields
mu sync.RWMutex
// stores list of active alerts
alerts map[uint64]*notifier.Alert
// stores last moment of time Exec was called
lastExecTime time.Time
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
lastExecError error
}
// Validate checks that the rule has a non-empty name and
// a non-empty expression that parses as MetricsQL.
func (r *Rule) Validate() error {
	switch {
	case r.Name == "":
		return errors.New("rule name can not be empty")
	case r.Expr == "":
		return fmt.Errorf("expression for rule %q can't be empty", r.Name)
	}
	_, parseErr := metricsql.Parse(r.Expr)
	if parseErr != nil {
		return fmt.Errorf("invalid expression for rule %q: %w", r.Name, parseErr)
	}
	return nil
}
// Exec executes Rule expression via the given Querier.
// Based on the Querier results Rule maintains notifier.Alerts:
//   - alerts marked inactive by the previous Exec are removed;
//   - every returned metric either refreshes the Value of its existing alert
//     or creates a new alert in pending state;
//   - alerts with no matching metric in this run become inactive;
//   - pending alerts older than r.For become firing.
//
// The query runs before r.mu is taken; all state updates happen under the lock.
// lastExecTime and lastExecError are updated on every call.
func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
qMetrics, err := q.Query(ctx, r.Expr)
r.mu.Lock()
defer r.mu.Unlock()
// record the outcome even when the query failed
r.lastExecError = err
r.lastExecTime = time.Now()
if err != nil {
return fmt.Errorf("failed to execute query %q: %s", r.Expr, err)
}
for h, a := range r.alerts {
// cleanup inactive alerts from previous Eval
if a.State == notifier.StateInactive {
delete(r.alerts, h)
}
}
// updated collects hashes of alerts seen in the current query result
updated := make(map[uint64]struct{})
// update list of active alerts
for _, m := range qMetrics {
h := hash(m)
updated[h] = struct{}{}
if a, ok := r.alerts[h]; ok {
// update Value field with latest value
a.Value = m.Value
continue
}
a, err := r.newAlert(m)
if err != nil {
r.lastExecError = err
return fmt.Errorf("failed to create alert: %s", err)
}
a.ID = h
a.State = notifier.StatePending
r.alerts[h] = a
}
for h, a := range r.alerts {
// if alert wasn't updated in this iteration
// means it is resolved already
if _, ok := updated[h]; !ok {
a.State = notifier.StateInactive
// set endTime to last execution time
// so it can be sent by notifier on next step
a.End = r.lastExecTime
continue
}
// with r.For == 0 a freshly created alert turns firing within the same Exec
if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
a.State = notifier.StateFiring
alertsFired.Inc()
}
if a.State == notifier.StateFiring {
// push End into the future (3 evaluation intervals) while the alert keeps firing
a.End = r.lastExecTime.Add(3 * *evaluationInterval)
}
}
return nil
}
// hash returns the FNV-1a 64-bit hash of the metric labels.
// Labels are hashed in name order, so the result does not depend on the
// order of m.Labels. The __name__ label is excluded.
//
// TODO: consider hashing algorithm in VM
func hash(m datasource.Metric) uint64 {
	h := fnv.New64a()
	// Sort a copy: m is passed by value, but m.Labels shares its backing
	// array with the caller, so sorting in place would silently reorder
	// the caller's labels.
	labels := make([]datasource.Label, len(m.Labels))
	copy(labels, m.Labels)
	sort.Slice(labels, func(i, j int) bool {
		return labels[i].Name < labels[j].Name
	})
	for _, l := range labels {
		// drop __name__ to be consistent with Prometheus alerting
		if l.Name == "__name__" {
			continue
		}
		// hash.Hash.Write never returns an error
		h.Write([]byte(l.Name))
		h.Write([]byte(l.Value))
		h.Write([]byte("\xff"))
	}
	return h.Sum64()
}
// newAlert builds an alert for the given metric.
//
// Labels are assembled in stages: metric labels (without __name__), then the
// templated rule labels, which override metric labels on conflict, and finally
// the merged set is templated once more. Annotations are templated from the
// rule annotations. Start is set to the current time.
// On a templating error the partially built alert is returned with the error.
func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
	alert := &notifier.Alert{
		Group:  r.group.Name,
		Name:   r.Name,
		Labels: make(map[string]string),
		Value:  m.Value,
		Start:  time.Now(),
		// TODO: support End time
	}

	// 1. use data labels;
	// __name__ is dropped to be consistent with Prometheus alerting
	for _, label := range m.Labels {
		if label.Name != "__name__" {
			alert.Labels[label.Name] = label.Value
		}
	}

	// 2. template rule labels with data labels
	ruleLabels, err := alert.ExecTemplate(r.Labels)
	if err != nil {
		return alert, err
	}

	// 3. merge data labels and rule labels;
	// metric labels may be overridden by rule labels
	for name, value := range ruleLabels {
		alert.Labels[name] = value
	}

	// 4. template merged labels
	if alert.Labels, err = alert.ExecTemplate(alert.Labels); err != nil {
		return alert, err
	}

	alert.Annotations, err = alert.ExecTemplate(r.Annotations)
	return alert, err
}
// AlertAPI returns the API representation of the alert with the given
// id (hash), or nil when the rule has no such alert.
func (r *Rule) AlertAPI(id uint64) *APIAlert {
	r.mu.RLock()
	defer r.mu.RUnlock()

	if alert, found := r.alerts[id]; found {
		return r.newAlertAPI(*alert)
	}
	return nil
}
// AlertsAPI returns API representations of all alerts currently
// tracked by the rule. The result is nil when there are none.
func (r *Rule) AlertsAPI() []*APIAlert {
	r.mu.RLock()
	defer r.mu.RUnlock()

	var result []*APIAlert
	for _, alert := range r.alerts {
		result = append(result, r.newAlertAPI(*alert))
	}
	return result
}
// newAlertAPI converts the given alert into its API representation.
// Expression is taken from the rule; Value is rendered in 'e' float notation.
func (r *Rule) newAlertAPI(a notifier.Alert) *APIAlert {
	// encode as string to avoid rounding
	id := strconv.FormatUint(a.ID, 10)
	value := strconv.FormatFloat(a.Value, 'e', -1, 64)

	api := &APIAlert{
		ID:          id,
		Name:        a.Name,
		Group:       a.Group,
		Expression:  r.Expr,
		State:       a.State.String(),
		Value:       value,
		Labels:      a.Labels,
		Annotations: a.Annotations,
		ActiveAt:    a.Start,
	}
	return api
}
const (
// alertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS"
// alertForStateMetricName is the metric name for 'for' state of alert.
alertForStateMetricName = "ALERTS_FOR_STATE"
// alertNameLabel is the label name indicating the name of an alert.
alertNameLabel = "alertname"
// alertStateLabel is the label name indicating the state of an alert.
alertStateLabel = "alertstate"
)
// AlertToTimeSeries converts the given alert with the given timestamp to timeseries.
// The result always contains the ALERTS series; for rules with For > 0
// it is followed by the ALERTS_FOR_STATE series.
func (r *Rule) AlertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
	series := []prompbmarshal.TimeSeries{alertToTimeSeries(r.Name, a, timestamp)}
	if r.For <= 0 {
		return series
	}
	return append(series, alertForToTimeSeries(r.Name, a, timestamp))
}
// alertToTimeSeries returns the ALERTS timeseries for the given alert:
// a single sample with value 1 carrying the alert labels plus the alert
// name and state. Alert labels with the same names are overridden.
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
	seriesLabels := make(map[string]string, len(a.Labels)+3)
	for labelName, labelValue := range a.Labels {
		seriesLabels[labelName] = labelValue
	}
	// reserved labels are written last so they always win
	seriesLabels["__name__"] = alertMetricName
	seriesLabels[alertNameLabel] = name
	seriesLabels[alertStateLabel] = a.State.String()
	return newTimeSeries(1, seriesLabels, timestamp)
}
// alertForToTimeSeries returns the ALERTS_FOR_STATE timeseries for the given
// alert. The sample value is the unix time (in seconds) at which the alert
// became active.
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
	seriesLabels := make(map[string]string, len(a.Labels)+2)
	for labelName, labelValue := range a.Labels {
		seriesLabels[labelName] = labelValue
	}
	// reserved labels are written last so they always win
	seriesLabels["__name__"] = alertForStateMetricName
	seriesLabels[alertNameLabel] = name
	activeSince := float64(a.Start.Unix())
	return newTimeSeries(activeSince, seriesLabels, timestamp)
}
// newTimeSeries builds a timeseries with a single sample.
// The sample timestamp is in milliseconds; labels are emitted sorted by
// name so the output is deterministic.
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
	names := make([]string, 0, len(labels))
	for name := range labels {
		names = append(names, name)
	}
	sort.Strings(names)

	var series prompbmarshal.TimeSeries
	series.Samples = append(series.Samples, prompbmarshal.Sample{
		Value:     value,
		Timestamp: timestamp.UnixNano() / 1e6,
	})
	for _, name := range names {
		label := prompbmarshal.Label{Name: name, Value: labels[name]}
		series.Labels = append(series.Labels, label)
	}
	return series
}
// Restore restores the state of active alerts basing on previously written timeseries.
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Eval, as well as Value field.
//
// The lookup window is lookback, truncated to whole seconds. The query runs
// before r.mu is taken; the restored alerts are stored under the lock.
func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
	// Get the last datapoint in range via MetricsQL `last_over_time`.
	// We don't use plain PromQL since Prometheus doesn't support
	// remote write protocol which is used for state persistence in vmalert.
	expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
		alertForStateMetricName, r.Name, int(lookback.Seconds()))
	qMetrics, err := q.Query(ctx, expr)
	if err != nil {
		return err
	}

	// r.alerts is guarded by r.mu everywhere else (Exec, AlertAPI, AlertsAPI),
	// so take the lock here as well.
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.alerts == nil {
		// writing into a nil map panics
		r.alerts = make(map[uint64]*notifier.Alert)
	}

	for _, m := range qMetrics {
		labels := m.Labels
		m.Labels = make([]datasource.Label, 0)
		// drop all extra labels, so hash key will
		// be identical to timeseries received in Eval
		for _, l := range labels {
			if l.Name == alertNameLabel {
				continue
			}
			// drop all overridden labels
			if _, ok := r.Labels[l.Name]; ok {
				continue
			}
			m.Labels = append(m.Labels, l)
		}

		a, err := r.newAlert(m)
		if err != nil {
			return fmt.Errorf("failed to create alert: %w", err)
		}
		a.ID = hash(m)
		a.State = notifier.StatePending
		// the sample value holds the unix time the alert became active
		a.Start = time.Unix(int64(m.Value), 0)
		r.alerts[a.ID] = a
		logger.Infof("alert %q.%q restored to state at %v", a.Group, a.Name, a.Start)
	}
	return nil
}

535
app/vmalert/rule_test.go Normal file
View File

@@ -0,0 +1,535 @@
package main
import (
"context"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// TestRule_Validate checks that Validate rejects rules with an empty name,
// an empty expression or an unparsable expression, and accepts a valid rule.
func TestRule_Validate(t *testing.T) {
	if err := (&Rule{}).Validate(); err == nil {
		t.Errorf("expected empty name error")
	}
	if err := (&Rule{Name: "alert"}).Validate(); err == nil {
		t.Errorf("expected empty expr error")
	}
	if err := (&Rule{Name: "alert", Expr: "test{"}).Validate(); err == nil {
		t.Errorf("expected invalid expr error")
	}
	if err := (&Rule{Name: "alert", Expr: "test>0"}).Validate(); err != nil {
		t.Errorf("expected valid rule got %s", err)
	}
}
// TestRule_AlertToTimeSeries checks the timeseries produced for an alert:
// rules without For yield only the ALERTS series, rules with For additionally
// yield ALERTS_FOR_STATE, and reserved labels set on the alert are overridden.
func TestRule_AlertToTimeSeries(t *testing.T) {
timestamp := time.Now()
testCases := []struct {
rule *Rule
alert *notifier.Alert
expTS []prompbmarshal.TimeSeries
}{
{
newTestRule("instant", 0),
&notifier.Alert{State: notifier.StateFiring},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant",
}, timestamp),
},
},
{
newTestRule("instant extra labels", 0),
&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
"job": "foo",
"instance": "bar",
}},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant extra labels",
"job": "foo",
"instance": "bar",
}, timestamp),
},
},
{
// reserved labels provided by the alert must be replaced
newTestRule("instant labels override", 0),
&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
alertStateLabel: "foo",
"__name__": "bar",
}},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant labels override",
}, timestamp),
},
},
{
newTestRule("for", time.Second),
&notifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "for",
}, timestamp),
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
"__name__": alertForStateMetricName,
alertNameLabel: "for",
}, timestamp),
},
},
{
newTestRule("for pending", 10*time.Second),
&notifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StatePending.String(),
alertNameLabel: "for pending",
}, timestamp),
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
"__name__": alertForStateMetricName,
alertNameLabel: "for pending",
}, timestamp),
},
},
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
tss := tc.rule.AlertToTimeSeries(tc.alert, timestamp)
if len(tc.expTS) != len(tss) {
t.Fatalf("expected number of timeseries %d; got %d", len(tc.expTS), len(tss))
}
// compare series pairwise: samples first, then labels in order
for i := range tc.expTS {
expTS, gotTS := tc.expTS[i], tss[i]
if len(expTS.Samples) != len(gotTS.Samples) {
t.Fatalf("expected number of samples %d; got %d", len(expTS.Samples), len(gotTS.Samples))
}
for i, exp := range expTS.Samples {
got := gotTS.Samples[i]
if got.Value != exp.Value {
t.Errorf("expected value %.2f; got %.2f", exp.Value, got.Value)
}
if got.Timestamp != exp.Timestamp {
t.Errorf("expected timestamp %d; got %d", exp.Timestamp, got.Timestamp)
}
}
if len(expTS.Labels) != len(gotTS.Labels) {
t.Fatalf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels))
}
for i, exp := range expTS.Labels {
got := gotTS.Labels[i]
if got.Name != exp.Name {
t.Errorf("expected label name %q; got %q", exp.Name, got.Name)
}
if got.Value != exp.Value {
t.Errorf("expected label value %q; got %q", exp.Value, got.Value)
}
}
}
})
}
}
// newTestRule returns a rule with the given name and For duration
// and an initialized (empty) alerts map.
func newTestRule(name string, waitFor time.Duration) *Rule {
	rule := &Rule{
		Name: name,
		For:  waitFor,
	}
	rule.alerts = make(map[uint64]*notifier.Alert)
	return rule
}
// TestRule_Exec feeds a rule with a sequence of query results (steps) and
// checks the set of alerts and their states after the last step.
// Every step is one Exec call; steps are separated by a 1ms delay, which is
// what lets rules with For=1ms move from pending to firing on the next step.
func TestRule_Exec(t *testing.T) {
	testCases := []struct {
		rule      *Rule
		steps     [][]datasource.Metric
		expAlerts map[uint64]*notifier.Alert
	}{
		{
			newTestRule("empty", 0),
			[][]datasource.Metric{},
			map[uint64]*notifier.Alert{},
		},
		{
			newTestRule("empty labels", 0),
			[][]datasource.Metric{
				{datasource.Metric{}},
			},
			map[uint64]*notifier.Alert{
				hash(datasource.Metric{}): {State: notifier.StateFiring},
			},
		},
		{
			newTestRule("single-firing", 0),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
			},
		},
		{
			newTestRule("single-firing=>inactive", 0),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
			},
		},
		{
			newTestRule("single-firing=>inactive=>firing", 0),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{},
				{metricWithLabels(t, "name", "foo")},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
			},
		},
		{
			newTestRule("single-firing=>inactive=>firing=>inactive", 0),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{},
				{metricWithLabels(t, "name", "foo")},
				{},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
			},
		},
		{
			newTestRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{},
				{metricWithLabels(t, "name", "foo")},
				{},
				{},
			},
			map[uint64]*notifier.Alert{},
		},
		{
			newTestRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{},
				{metricWithLabels(t, "name", "foo")},
				{},
				{},
				{metricWithLabels(t, "name", "foo")},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
			},
		},
		{
			newTestRule("multiple-firing", 0),
			[][]datasource.Metric{
				{
					metricWithLabels(t, "name", "foo"),
					metricWithLabels(t, "name", "foo1"),
					metricWithLabels(t, "name", "foo2"),
				},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")):  {State: notifier.StateFiring},
				hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateFiring},
				hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring},
			},
		},
		{
			newTestRule("multiple-steps-firing", 0),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{metricWithLabels(t, "name", "foo1")},
				{metricWithLabels(t, "name", "foo2")},
			},
			// 1: fire first alert
			// 2: fire second alert, set first inactive
			// 3: fire third alert, set second inactive, delete first one
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateInactive},
				hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring},
			},
		},
		{
			newTestRule("duplicate", 0),
			[][]datasource.Metric{
				{
					// metrics with the same labelset should result in one alert
					metricWithLabels(t, "name", "foo", "type", "bar"),
					metricWithLabels(t, "type", "bar", "name", "foo"),
				},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo", "type", "bar")): {State: notifier.StateFiring},
			},
		},
		{
			newTestRule("for-pending", time.Minute),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending},
			},
		},
		{
			newTestRule("for-fired", time.Millisecond),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{metricWithLabels(t, "name", "foo")},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
			},
		},
		{
			// For is long enough to keep the alert pending after the
			// first step, so the empty step resolves a pending alert
			// that has never fired.
			newTestRule("for-pending=>inactive", time.Minute),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				// empty step to reset pending alerts
				{},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
			},
		},
		{
			newTestRule("for-pending=>firing=>inactive", time.Millisecond),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{metricWithLabels(t, "name", "foo")},
				// empty step to reset pending alerts
				{},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
			},
		},
		{
			newTestRule("for-pending=>firing=>inactive=>pending", time.Millisecond),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{metricWithLabels(t, "name", "foo")},
				// empty step to reset pending alerts
				{},
				{metricWithLabels(t, "name", "foo")},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending},
			},
		},
		{
			newTestRule("for-pending=>firing=>inactive=>pending=>firing", time.Millisecond),
			[][]datasource.Metric{
				{metricWithLabels(t, "name", "foo")},
				{metricWithLabels(t, "name", "foo")},
				// empty step to reset pending alerts
				{},
				{metricWithLabels(t, "name", "foo")},
				{metricWithLabels(t, "name", "foo")},
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
			},
		},
	}
	fakeGroup := Group{Name: "TestRule_Exec"}
	for _, tc := range testCases {
		t.Run(tc.rule.Name, func(t *testing.T) {
			fq := &fakeQuerier{}
			tc.rule.group = fakeGroup
			for _, step := range tc.steps {
				fq.reset()
				fq.add(step...)
				if err := tc.rule.Exec(context.TODO(), fq); err != nil {
					t.Fatalf("unexpected err: %s", err)
				}
				// artificial delay between applying steps
				time.Sleep(time.Millisecond)
			}
			if len(tc.rule.alerts) != len(tc.expAlerts) {
				t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
			}
			for key, exp := range tc.expAlerts {
				got, ok := tc.rule.alerts[key]
				if !ok {
					t.Fatalf("expected to have key %d", key)
				}
				if got.State != exp.State {
					t.Fatalf("expected state %d; got %d", exp.State, got.State)
				}
			}
		})
	}
}
// metricWithLabels builds a metric from name/value pairs given as a flat
// list: name1, value1, name2, value2, ...
// The test fails when the list is empty or has an odd length.
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
	t.Helper()
	if n := len(labels); n == 0 || n%2 != 0 {
		t.Fatalf("expected to get even number of labels")
	}
	var metric datasource.Metric
	for i := 0; i < len(labels); i += 2 {
		name, value := labels[i], labels[i+1]
		metric.Labels = append(metric.Labels, datasource.Label{Name: name, Value: value})
	}
	return metric
}
// fakeQuerier is a test double for the datasource querier:
// Query returns whatever metrics were previously registered via add.
type fakeQuerier struct {
	metrics []datasource.Metric
}

// reset drops all registered metrics while keeping the allocated storage.
func (fq *fakeQuerier) reset() {
	fq.metrics = fq.metrics[:0]
}

// add registers metrics to be returned by subsequent Query calls.
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
	for _, m := range metrics {
		fq.metrics = append(fq.metrics, m)
	}
}

// Query ignores its arguments and returns the registered metrics.
func (fq fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
	return fq.metrics, nil
}
// TestRule_Restore checks that Restore recreates pending alerts from
// ALERTS_FOR_STATE timeseries: the alert key must match the hash of the
// metric labels without alertname and rule-defined labels, and Start must
// equal the sample value interpreted as unix time.
func TestRule_Restore(t *testing.T) {
	// Read the clock once, so that metric values and expected Start times
	// are derived from the same instant and cannot diverge when the test
	// runs across an hour boundary.
	now := time.Now()
	start1h := now.Truncate(time.Hour)
	start2h := now.Truncate(2 * time.Hour)
	start3h := now.Truncate(3 * time.Hour)

	testCases := []struct {
		rule      *Rule
		metrics   []datasource.Metric
		expAlerts map[uint64]*notifier.Alert
	}{
		{
			newTestRuleWithLabels("no extra labels"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, float64(start1h.Unix()),
					"__name__", alertForStateMetricName,
					alertNameLabel, "",
				),
			},
			map[uint64]*notifier.Alert{
				hash(datasource.Metric{}): {State: notifier.StatePending,
					Start: start1h},
			},
		},
		{
			newTestRuleWithLabels("metric labels"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, float64(start1h.Unix()),
					"__name__", alertForStateMetricName,
					alertNameLabel, "",
					"foo", "bar",
					"namespace", "baz",
				),
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t,
					"foo", "bar",
					"namespace", "baz",
				)): {State: notifier.StatePending,
					Start: start1h},
			},
		},
		{
			newTestRuleWithLabels("rule labels", "source", "vm"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, float64(start1h.Unix()),
					"__name__", alertForStateMetricName,
					alertNameLabel, "",
					"foo", "bar",
					"namespace", "baz",
					// following pair supposed to be dropped
					"source", "vm",
				),
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t,
					"foo", "bar",
					"namespace", "baz",
				)): {State: notifier.StatePending,
					Start: start1h},
			},
		},
		{
			newTestRuleWithLabels("multiple alerts"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, float64(start1h.Unix()),
					"__name__", alertForStateMetricName,
					"host", "localhost-1",
				),
				metricWithValueAndLabels(t, float64(start2h.Unix()),
					"__name__", alertForStateMetricName,
					"host", "localhost-2",
				),
				metricWithValueAndLabels(t, float64(start3h.Unix()),
					"__name__", alertForStateMetricName,
					"host", "localhost-3",
				),
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending,
					Start: start1h},
				hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending,
					Start: start2h},
				hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending,
					Start: start3h},
			},
		},
	}
	fakeGroup := Group{Name: "TestRule_Exec"}
	for _, tc := range testCases {
		t.Run(tc.rule.Name, func(t *testing.T) {
			fq := &fakeQuerier{}
			tc.rule.group = fakeGroup
			fq.add(tc.metrics...)
			if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
				t.Fatalf("unexpected err: %s", err)
			}
			if len(tc.rule.alerts) != len(tc.expAlerts) {
				t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
			}
			for key, exp := range tc.expAlerts {
				got, ok := tc.rule.alerts[key]
				if !ok {
					t.Fatalf("expected to have key %d", key)
				}
				if got.State != exp.State {
					t.Fatalf("expected state %d; got %d", exp.State, got.State)
				}
				// time.Time values must be compared with Equal: == also
				// compares the location pointer and monotonic reading.
				if !got.Start.Equal(exp.Start) {
					t.Fatalf("expected Start %v; got %v", exp.Start, got.Start)
				}
			}
		})
	}
}
// newTestRuleWithLabels returns a test rule with For=0 and rule labels
// taken from name/value pairs given as a flat list.
func newTestRuleWithLabels(name string, labels ...string) *Rule {
	rule := newTestRule(name, 0)
	rule.Labels = map[string]string{}
	for i := 0; i < len(labels); i += 2 {
		key, value := labels[i], labels[i+1]
		rule.Labels[key] = value
	}
	return rule
}
// metricWithValueAndLabels is metricWithLabels with the metric value set.
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
	t.Helper()
	metric := metricWithLabels(t, labels...)
	metric.Value = value
	return metric
}

View File

@@ -0,0 +1,19 @@
groups:
- name: group
rules:
- alert: InvalidAnnotations
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value }"
description: "{{$labels}}"
- alert: UnkownAnnotationsFunction
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ value|query }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,13 @@
groups:
- name: duplicatedGroupDiffFiles
rules:
- alert: VMRows
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value|humanize }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,22 @@
groups:
- name: sameGroup
rules:
- alert: alert
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"
- name: sameGroup
rules:
- alert: alert
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,13 @@
groups:
- name: duplicatedGroupDiffFiles
rules:
- alert: VMRows
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,11 @@
groups:
- name: group
rules:
- alert: UnkownLabelFunction
for: 5m
expr: vm_rows > 0
labels:
label: bar
summary: "{{ value|query }}"
annotations:
description: "{{$labels}}"

28
app/vmalert/testdata/rules0-bad.rules vendored Normal file
View File

@@ -0,0 +1,28 @@
groups:
- name: group
rules:
- alert: InvalidExpr
for: 5m
expr: vm_rows{ > 0
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"
- alert: EmptyExpr
for: 5m
expr: ""
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"
- alert: ""
for: 5m
expr: vm_rows > 0
labels:
label: foo
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"

23
app/vmalert/testdata/rules0-good.rules vendored Normal file
View File

@@ -0,0 +1,23 @@
groups:
- name: groupGorSingleAlert
rules:
- alert: VMRows
for: 10s
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value|humanize }}"
description: "{{$labels}}"
- name: TestGroup
rules:
- alert: Conns
expr: sum(vm_tcplistener_conns) by(instance) > 1
annotations:
summary: "Too high connection number for {{$labels.instance}}"
description: "It is {{ $value }} connections for {{$labels.instance}}"
- alert: ExampleAlertAlwaysFiring
expr: sum by(job)
(up == 1)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 25 KiB

134
app/vmalert/web.go Normal file
View File

@@ -0,0 +1,134 @@
package main
import (
"encoding/json"
"fmt"
"net/http"
"sort"
"strconv"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
)
// APIAlert has info for an alert.
//
// It is the JSON shape of a single alert; every field is encoded under the
// key given in its `json` tag.
type APIAlert struct {
// ID is the key the alerts list is sorted by before it is returned.
ID string `json:"id"`
Name string `json:"name"`
Group string `json:"group"`
Expression string `json:"expression"`
State string `json:"state"`
Value string `json:"value"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
ActiveAt time.Time `json:"activeAt"`
}
// requestHandler serves the HTTP API for the rule groups it holds.
type requestHandler struct {
// groups are the rule groups whose alerts are exposed by the handlers.
groups []Group
}
// pathList enumerates the documented endpoints as {path, description} pairs.
// The "/" handler renders every entry as an HTML link followed by its description.
var pathList = [][]string{
{"/api/v1/alerts", "list all active alerts"},
{"/api/v1/groupName/alertID/status", "get alert status by ID"},
// /metrics is served by httpserver by default
{"/metrics", "list of application metrics"},
}
// handler dispatches the request by its URL path.
//
// "/" renders the index built from pathList, "/api/v1/alerts" returns the
// list of alerts and any path ending with "/status" is treated as
// /api/v1/<groupName>/<alertID>/status.
// It returns true when the request has been handled and false otherwise.
func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
	resph := responseHandler{w}
	urlPath := r.URL.Path
	if urlPath == "/" {
		for _, entry := range pathList {
			link, description := entry[0], entry[1]
			fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", link, link, description)
		}
		return true
	}
	if urlPath == "/api/v1/alerts" {
		resph.handle(rh.list())
		return true
	}
	// /api/v1/<groupName>/<alertID>/status
	if !strings.HasSuffix(urlPath, "/status") {
		return false
	}
	resph.handle(rh.alert(urlPath))
	return true
}
// listAlertsResponse is the JSON envelope produced by requestHandler.list.
type listAlertsResponse struct {
// Data wraps the alerts list under the "data" key.
Data struct {
Alerts []*APIAlert `json:"alerts"`
} `json:"data"`
// Status is set to "success" by list.
Status string `json:"status"`
}
// list collects the API alerts of every rule in every group, sorts them by ID
// and returns them JSON-encoded inside a listAlertsResponse.
//
// A marshaling failure is reported as an httpserver.ErrorWithStatusCode
// carrying http.StatusInternalServerError.
func (rh *requestHandler) list() ([]byte, error) {
	resp := listAlertsResponse{Status: "success"}
	for _, group := range rh.groups {
		for _, rule := range group.Rules {
			resp.Data.Alerts = append(resp.Data.Alerts, rule.AlertsAPI()...)
		}
	}

	// sort list of alerts for deterministic output
	alerts := resp.Data.Alerts
	sort.Slice(alerts, func(a, b int) bool {
		return alerts[a].ID < alerts[b].ID
	})

	data, err := json.Marshal(resp)
	if err == nil {
		return data, nil
	}
	return nil, &httpserver.ErrorWithStatusCode{
		Err:        fmt.Errorf(`error encoding list of active alerts: %s`, err),
		StatusCode: http.StatusInternalServerError,
	}
}
// alert returns the JSON-encoded alert addressed by path, which must have
// the form /api/v1/<groupName>/<alertID>/status.
//
// Errors are returned as httpserver.ErrorWithStatusCode:
//   - http.StatusBadRequest when path doesn't match the pattern or
//     <alertID> isn't an unsigned integer;
//   - http.StatusNotFound when no rule in the named group has such alert.
func (rh *requestHandler) alert(path string) ([]byte, error) {
	parts := strings.SplitN(strings.TrimPrefix(path, "/api/v1/"), "/", 3)
	// The third part must be exactly "status"; otherwise paths with extra
	// segments such as /api/v1/group/0/foo/status would be accepted.
	if len(parts) != 3 || parts[2] != "status" {
		return nil, &httpserver.ErrorWithStatusCode{
			Err:        fmt.Errorf(`path %q cointains /status suffix but doesn't match pattern "/group/alert/status"`, path),
			StatusCode: http.StatusBadRequest,
		}
	}
	// SplitN on "/" guarantees the first two parts contain no slashes.
	group, idStr := parts[0], parts[1]

	// Alert IDs are uint64, so parse with an explicit 64-bit size.
	// bitSize=0 would limit IDs to 32 bits on 32-bit platforms.
	id, err := strconv.ParseUint(idStr, 10, 64)
	if err != nil {
		return nil, &httpserver.ErrorWithStatusCode{
			Err:        fmt.Errorf(`cannot parse int from %q: %s`, idStr, err),
			StatusCode: http.StatusBadRequest,
		}
	}
	for _, g := range rh.groups {
		if g.Name != group {
			continue
		}
		for _, rule := range g.Rules {
			if apiAlert := rule.AlertAPI(id); apiAlert != nil {
				return json.Marshal(apiAlert)
			}
		}
	}
	return nil, &httpserver.ErrorWithStatusCode{
		Err:        fmt.Errorf(`cannot find alert %s in %q`, idStr, group),
		StatusCode: http.StatusNotFound,
	}
}
// responseHandler wraps http.ResponseWriter with a helper for writing
// either a JSON payload or an error.
type responseHandler struct{ http.ResponseWriter }

// handle writes b as an application/json response when err is nil;
// otherwise it reports err via httpserver.Errorf.
func (w responseHandler) handle(b []byte, err error) {
	if err == nil {
		w.Header().Set("Content-Type", "application/json")
		w.Write(b)
		return
	}
	httpserver.Errorf(w, "%s", err)
}

72
app/vmalert/web_test.go Normal file
View File

@@ -0,0 +1,72 @@
package main
import (
"encoding/json"
"net/http"
"net/http/httptest"
"reflect"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// TestHandler serves requestHandler.handler from an httptest server and
// checks status codes and decoded JSON payloads of the API endpoints.
func TestHandler(t *testing.T) {
	rule := &Rule{
		Name: "alert",
		alerts: map[uint64]*notifier.Alert{
			0: {},
		},
	}
	rh := &requestHandler{
		groups: []Group{{
			Name:  "group",
			Rules: []*Rule{rule},
		}},
	}
	// getResp fetches url, verifies the status code and, when `to` is non-nil,
	// decodes the JSON body into it. It takes the *testing.T of the calling
	// (sub)test, so failures are reported from the goroutine running that test.
	getResp := func(t *testing.T, url string, to interface{}, code int) {
		t.Helper()
		resp, err := http.Get(url)
		if err != nil {
			// resp is nil on error, so the test cannot continue.
			t.Fatalf("unexpected err %s", err)
		}
		defer func() {
			if err := resp.Body.Close(); err != nil {
				t.Errorf("err closing body %s", err)
			}
		}()
		if code != resp.StatusCode {
			t.Errorf("unexpected status code %d want %d", resp.StatusCode, code)
		}
		if to != nil {
			if err = json.NewDecoder(resp.Body).Decode(to); err != nil {
				t.Errorf("unexpected err %s", err)
			}
		}
	}
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rh.handler(w, r) }))
	defer ts.Close()
	t.Run("/api/v1/alerts", func(t *testing.T) {
		lr := listAlertsResponse{}
		getResp(t, ts.URL+"/api/v1/alerts", &lr, 200)
		if length := len(lr.Data.Alerts); length != 1 {
			t.Errorf("expected 1 alert got %d", length)
		}
	})
	t.Run("/api/v1/group/0/status", func(t *testing.T) {
		alert := &APIAlert{}
		getResp(t, ts.URL+"/api/v1/group/0/status", alert, 200)
		expAlert := rule.newAlertAPI(*rule.alerts[0])
		if !reflect.DeepEqual(alert, expAlert) {
			t.Errorf("expected %v; got %v", expAlert, alert)
		}
	})
	t.Run("/api/v1/group/1/status", func(t *testing.T) {
		getResp(t, ts.URL+"/api/v1/group/1/status", nil, 404)
	})
	t.Run("/api/v1/unknown-group/0/status", func(t *testing.T) {
		getResp(t, ts.URL+"/api/v1/unknown-group/0/status", nil, 404)
	})
	t.Run("/", func(t *testing.T) {
		getResp(t, ts.URL, nil, 200)
	})
}

74
app/vmauth/Makefile Normal file
View File

@@ -0,0 +1,74 @@
# All these commands must run from repository root.
vmauth:
APP_NAME=vmauth $(MAKE) app-local
vmauth-race:
APP_NAME=vmauth RACE=-race $(MAKE) app-local
vmauth-prod:
APP_NAME=vmauth $(MAKE) app-via-docker
vmauth-pure-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-pure
vmauth-amd64-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-amd64
vmauth-arm-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-arm
vmauth-arm64-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-arm64
vmauth-ppc64le-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-ppc64le
vmauth-386-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-386
package-vmauth:
APP_NAME=vmauth $(MAKE) package-via-docker
package-vmauth-pure:
APP_NAME=vmauth $(MAKE) package-via-docker-pure
package-vmauth-amd64:
APP_NAME=vmauth $(MAKE) package-via-docker-amd64
package-vmauth-arm:
APP_NAME=vmauth $(MAKE) package-via-docker-arm
package-vmauth-arm64:
APP_NAME=vmauth $(MAKE) package-via-docker-arm64
package-vmauth-ppc64le:
APP_NAME=vmauth $(MAKE) package-via-docker-ppc64le
package-vmauth-386:
APP_NAME=vmauth $(MAKE) package-via-docker-386
publish-vmauth:
APP_NAME=vmauth $(MAKE) publish-via-docker
run-vmauth:
APP_NAME=vmauth \
$(MAKE) run-via-docker
vmauth-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-amd64 ./app/vmauth
vmauth-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-arm ./app/vmauth
vmauth-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-arm64 ./app/vmauth
vmauth-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-ppc64le ./app/vmauth
vmauth-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-386 ./app/vmauth
vmauth-pure:
APP_NAME=vmauth $(MAKE) app-local-pure

139
app/vmauth/README.md Normal file
View File

@@ -0,0 +1,139 @@
## vmauth
`vmauth` is a simple auth proxy and router for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
It reads username and password from [Basic Auth headers](https://en.wikipedia.org/wiki/Basic_access_authentication),
matches them against configs pointed by `-auth.config` command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
### Quick start
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
and pass the following flag to `vmauth` binary in order to start authorizing and routing requests:
```
/path/to/vmauth -auth.config=/path/to/auth/config.yml
```
After that `vmauth` starts accepting HTTP requests on port `8427` and routing them according to the provided [-auth.config](#auth-config).
The port can be modified via `-httpListenAddr` command-line flag.
The auth config can be reloaded by passing `SIGHUP` signal to `vmauth`.
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
Feel free to [contact us](mailto:info@victoriametrics.com) if you need a customized auth proxy for VictoriaMetrics with the support of LDAP, SSO, RBAC, accounting, limits, etc.
### Auth config
Auth config is represented in the following simple `yml` format:
```yml
# Arbitrary number of usernames may be put here.
# Usernames must be unique.
users:
# The user for querying local single-node VictoriaMetrics.
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
- username: "local-single-node"
password: "***"
url_prefix: "http://localhost:8428"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vmselect:8481/select/123/prometheus .
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/query
- username: "cluster-select-account-123"
password: "***"
url_prefix: "http://vmselect:8481/select/123/prometheus"
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vminsert:8480/insert/42/prometheus .
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
- username: "cluster-insert-account-42"
password: "***"
url_prefix: "http://vminsert:8480/insert/42/prometheus"
```
### Security
Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable https. This can be done by passing the following `-tls*` command-line flags to `vmauth`:
```
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
```
Alternatively, [https termination proxy](https://en.wikipedia.org/wiki/TLS_termination_proxy) may be put in front of `vmauth`.
### Monitoring
`vmauth` exports various metrics in Prometheus exposition format at `http://vmauth-host:8427/metrics` page. It is recommended setting up regular scraping of this page
either via [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) or via Prometheus, so the exported metrics could be analyzed later.
### How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmauth` is located in `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmauth` from the root folder of the repository.
It builds `vmauth` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmauth-prod` from the root folder of the repository.
It builds `vmauth-prod` binary and puts it into the `bin` folder.
#### Building docker images
Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmauth
```
### Profiling
`vmauth` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
* Memory profile. It can be collected with the following command:
```bash
curl -s http://<vmauth-host>:8427/debug/pprof/heap > mem.pprof
```
* CPU profile. It can be collected with the following command:
```bash
curl -s http://<vmauth-host>:8427/debug/pprof/profile > cpu.pprof
```
The command for collecting CPU profile waits for 30 seconds before returning.
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).

127
app/vmauth/auth_config.go Normal file
View File

@@ -0,0 +1,127 @@
package main
import (
"flag"
"fmt"
"io/ioutil"
"net/url"
"strings"
"sync"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/metrics"
"gopkg.in/yaml.v2"
)
var (
// authConfigPath holds the value of the `-auth.config` command-line flag.
// The file it points to is loaded by initAuthConfig and re-read by authConfigReloader.
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md "+
"for details on the format of this auth config")
)
// AuthConfig represents auth config.
//
// It mirrors the top-level structure of the yaml file parsed by parseAuthConfig.
type AuthConfig struct {
// Users is the list of configured users; parseAuthConfig rejects an empty list.
Users []UserInfo `yaml:"users"`
}
// UserInfo is user information read from authConfigPath
type UserInfo struct {
// Username and Password are matched against the Basic Auth credentials of incoming requests.
Username string `yaml:"username"`
Password string `yaml:"password"`
// URLPrefix is the http or https url prepended to the request URI when proxying.
// parseAuthConfig strips trailing slashes from it.
URLPrefix string `yaml:"url_prefix"`
// requests is the vmauth_user_requests_total counter for the user.
// It is set by parseAuthConfig and isn't read from yaml.
requests *metrics.Counter
}
// initAuthConfig loads the auth config from the file given by `-auth.config`,
// publishes it in authConfig and starts the background reloader goroutine.
//
// It panics via logger.Panicf when the flag is missing or the config cannot be loaded.
// stopAuthConfig must be called to stop the reloader.
func initAuthConfig() {
	configPath := *authConfigPath
	if configPath == "" {
		logger.Panicf("FATAL: missing required `-auth.config` command-line flag")
	}
	users, err := readAuthConfig(configPath)
	if err != nil {
		logger.Panicf("FATAL: cannot load auth config from `-auth.config=%s`: %s", configPath, err)
	}
	authConfig.Store(users)

	// Start the reloader; stopCh and authConfigWG are used by stopAuthConfig
	// for stopping it and waiting until it exits.
	stopCh = make(chan struct{})
	authConfigWG.Add(1)
	go func() {
		defer authConfigWG.Done()
		authConfigReloader()
	}()
}
// stopAuthConfig stops the reloader goroutine started by initAuthConfig:
// it closes stopCh and then waits on authConfigWG until the goroutine exits.
func stopAuthConfig() {
close(stopCh)
authConfigWG.Wait()
}
// authConfigReloader re-reads the auth config each time a value arrives on
// the channel returned by procutil.NewSighupChan and publishes it in authConfig.
//
// On a failed reload the error is logged and the previously stored config
// stays in effect. The function returns when stopCh is closed.
func authConfigReloader() {
	reloadCh := procutil.NewSighupChan()
	for {
		select {
		case <-stopCh:
			return
		case <-reloadCh:
		}
		users, err := readAuthConfig(*authConfigPath)
		if err != nil {
			logger.Errorf("failed to load auth config; using the last successfully loaded config; error: %s", err)
			continue
		}
		authConfig.Store(users)
	}
}
var (
	// authConfig holds the currently active map[string]*UserInfo.
	authConfig atomic.Value

	// authConfigWG is used for waiting until the reloader goroutine exits.
	authConfigWG sync.WaitGroup

	// stopCh is closed in order to stop the reloader goroutine.
	stopCh chan struct{}
)
// readAuthConfig reads the file at path and parses it with parseAuthConfig.
//
// It returns users keyed by username, or an error mentioning path when
// the file cannot be read or parsed.
func readAuthConfig(path string) (map[string]*UserInfo, error) {
	raw, readErr := ioutil.ReadFile(path)
	if readErr != nil {
		return nil, fmt.Errorf("cannot read %q: %s", path, readErr)
	}
	users, parseErr := parseAuthConfig(raw)
	if parseErr != nil {
		return nil, fmt.Errorf("cannot parse %q: %s", path, parseErr)
	}
	logger.Infof("Loaded information about %d users from %q", len(users), path)
	return users, nil
}
// parseAuthConfig parses yaml-encoded AuthConfig from data and returns
// the configured users keyed by username.
//
// An error is returned when:
//   - data cannot be strictly unmarshaled into AuthConfig;
//   - the `users` section is empty;
//   - a username occurs more than once;
//   - `url_prefix` cannot be parsed, has a scheme other than http/https
//     or has no host.
//
// Trailing slashes are removed from every `url_prefix`, and a per-user
// vmauth_user_requests_total counter is registered.
func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
	var ac AuthConfig
	if err := yaml.UnmarshalStrict(data, &ac); err != nil {
		return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %s", err)
	}
	uis := ac.Users
	if len(uis) == 0 {
		return nil, fmt.Errorf("`users` section cannot be empty in AuthConfig")
	}
	m := make(map[string]*UserInfo, len(uis))
	for i := range uis {
		ui := &uis[i]
		if m[ui.Username] != nil {
			return nil, fmt.Errorf("duplicate username found; username: %q", ui.Username)
		}
		// Remove trailing '/' from urlPrefix
		urlPrefix := strings.TrimRight(ui.URLPrefix, "/")
		// Validate urlPrefix
		target, err := url.Parse(urlPrefix)
		if err != nil {
			return nil, fmt.Errorf("invalid `url_prefix: %q`: %s", urlPrefix, err)
		}
		if target.Scheme != "http" && target.Scheme != "https" {
			return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, target.Scheme)
		}
		// A prefix such as `http://` has a valid scheme but no host to proxy to.
		if target.Host == "" {
			return nil, fmt.Errorf("missing host in `url_prefix: %q`", urlPrefix)
		}
		ui.URLPrefix = urlPrefix
		ui.requests = metrics.GetOrCreateCounter(fmt.Sprintf(`vmauth_user_requests_total{username=%q}`, ui.Username))
		m[ui.Username] = ui
	}
	return m, nil
}

View File

@@ -0,0 +1,112 @@
package main
import (
"reflect"
"testing"
)
// TestParseAuthConfigFailure verifies that parseAuthConfig returns a non-nil
// error for every config passed to f below.
func TestParseAuthConfigFailure(t *testing.T) {
// f fails the test if s is parsed without an error.
f := func(s string) {
t.Helper()
_, err := parseAuthConfig([]byte(s))
if err == nil {
t.Fatalf("expecting non-nil error")
}
}
// Empty config
f(``)
// Invalid entry
f(`foobar`)
f(`foobar: baz`)
// Empty users
f(`users: []`)
// Missing url_prefix
f(`
users:
- username: foo
`)
// Invalid url_prefix
f(`
users:
- username: foo
url_prefix: bar
`)
f(`
users:
- username: foo
url_prefix: ftp://bar
`)
f(`
users:
- username: foo
url_prefix: //bar
`)
// Duplicate users
f(`
users:
- username: foo
url_prefix: http://foo.bar
- username: bar
url_prefix: http://xxx.yyy
- username: foo
url_prefix: https://sss.sss
`)
}
// TestParseAuthConfigSuccess verifies that parseAuthConfig returns the expected
// users map for valid configs.
func TestParseAuthConfigSuccess(t *testing.T) {
// f parses s and compares the result with expectedAuthConfig.
// The requests counters are cleared via removeMetrics before the comparison,
// since expectedAuthConfig doesn't set them.
f := func(s string, expectedAuthConfig map[string]*UserInfo) {
t.Helper()
m, err := parseAuthConfig([]byte(s))
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
removeMetrics(m)
if !reflect.DeepEqual(m, expectedAuthConfig) {
t.Fatalf("unexpected auth config\ngot\n%v\nwant\n%v", m, expectedAuthConfig)
}
}
// Single user
f(`
users:
- username: foo
password: bar
url_prefix: http://aaa:343/bbb
`, map[string]*UserInfo{
"foo": {
Username: "foo",
Password: "bar",
URLPrefix: "http://aaa:343/bbb",
},
})
// Multiple users
// The trailing slashes in the second url_prefix are expected to be removed.
f(`
users:
- username: foo
url_prefix: http://foo
- username: bar
url_prefix: https://bar/x///
`, map[string]*UserInfo{
"foo": {
Username: "foo",
URLPrefix: "http://foo",
},
"bar": {
Username: "bar",
URLPrefix: "https://bar/x",
},
})
}
// removeMetrics resets the requests counter of every user in m to nil,
// so the map can be compared against expectations that don't set it.
func removeMetrics(m map[string]*UserInfo) {
	for username := range m {
		m[username].requests = nil
	}
}

View File

@@ -0,0 +1,8 @@
ARG base_image
FROM $base_image
EXPOSE 8427
ENTRYPOINT ["/vmauth-prod"]
ARG src_binary
COPY $src_binary ./vmauth-prod

79
app/vmauth/main.go Normal file
View File

@@ -0,0 +1,79 @@
package main
import (
"flag"
"net/http"
"net/http/httputil"
"net/url"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
var (
// httpListenAddr holds the value of the `-httpListenAddr` command-line flag;
// main passes it to httpserver.Serve and httpserver.Stop.
httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections")
)
// main starts vmauth: it parses flags, loads the auth config, serves
// requestHandler at -httpListenAddr and, after procutil.WaitForSigterm
// returns, stops the http server and then the auth config reloader.
func main() {
	envflag.Parse()
	buildinfo.Init()
	logger.Init()

	addr := *httpListenAddr
	logger.Infof("starting vmauth at %q...", addr)
	startedAt := time.Now()
	initAuthConfig()
	go httpserver.Serve(addr, requestHandler)
	logger.Infof("started vmauth in %.3f seconds", time.Since(startedAt).Seconds())

	sig := procutil.WaitForSigterm()
	logger.Infof("received signal %s", sig)

	shutdownStartedAt := time.Now()
	logger.Infof("gracefully shutting down webservice at %q", addr)
	if err := httpserver.Stop(addr); err != nil {
		logger.Fatalf("cannot stop the webservice: %s", err)
	}
	logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(shutdownStartedAt).Seconds())
	stopAuthConfig()
	logger.Infof("successfully stopped vmauth in %.3f seconds", time.Since(shutdownStartedAt).Seconds())
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
username, password, ok := r.BasicAuth()
if !ok {
httpserver.Errorf(w, "Missing `Authorization: Basic *` header")
return true
}
ac := authConfig.Load().(map[string]*UserInfo)
info := ac[username]
if info == nil || info.Password != password {
httpserver.Errorf(w, "Cannot find the provided username %q or password in config", username)
return true
}
info.requests.Inc()
targetURL := createTargetURL(info.URLPrefix, r.URL)
if _, err := url.Parse(targetURL); err != nil {
httpserver.Errorf(w, "Invalid targetURL=%q: %s", targetURL, err)
return true
}
r.Header.Set("vm-target-url", targetURL)
reverseProxy.ServeHTTP(w, r)
return true
}
// reverseProxy forwards requests to the url stored in the `vm-target-url`
// request header, which requestHandler sets before calling ServeHTTP.
var reverseProxy = &httputil.ReverseProxy{
	Director: func(req *http.Request) {
		rawTarget := req.Header.Get("vm-target-url")
		parsed, err := url.Parse(rawTarget)
		if err != nil {
			// requestHandler validates the url before storing it in the header.
			logger.Panicf("BUG: unexpected error when parsing targetURL=%q: %s", rawTarget, err)
		}
		req.URL = parsed
	},
	FlushInterval: time.Second,
	ErrorLog:      logger.StdErrorLogger(),
}

16
app/vmauth/target_url.go Normal file
View File

@@ -0,0 +1,16 @@
package main
import (
"net/url"
"path"
"strings"
)
// createTargetURL returns prefix joined with the request URI of u, where
// the path of u is cleaned with path.Clean in order to drop `.` and `..`
// elements.
//
// u itself isn't modified. A trailing slash in u.Path is preserved,
// since path.Clean removes it.
func createTargetURL(prefix string, u *url.URL) string {
	// Work on a copy, so the caller's url stays intact.
	target := *u
	// Prevent from attacks with using `..` in r.URL.Path
	target.Path = path.Clean(u.Path)
	if !strings.HasPrefix(target.Path, "/") {
		target.Path = "/" + target.Path
	}
	// path.Clean removes the trailing slash. Return it back if needed.
	if strings.HasSuffix(u.Path, "/") && !strings.HasSuffix(target.Path, "/") {
		target.Path += "/"
	}
	return prefix + target.RequestURI()
}

View File

@@ -0,0 +1,26 @@
package main
import (
"net/url"
"testing"
)
// TestCreateTargetURL checks the target url built by createTargetURL
// for a set of prefix / request URI pairs.
func TestCreateTargetURL(t *testing.T) {
	testCases := []struct {
		prefix         string
		requestURI     string
		expectedTarget string
	}{
		{"http://foo.bar", "", "http://foo.bar/."},
		{"http://foo.bar", "/", "http://foo.bar/"},
		{"http://foo.bar", "a/b?c=d", "http://foo.bar/a/b?c=d"},
		{"https://sss:3894/x/y", "/z", "https://sss:3894/x/y/z"},
		{"https://sss:3894/x/y", "/../../aaa", "https://sss:3894/x/y/aaa"},
		{"https://sss:3894/x/y", "/./asd/../../aaa?a=d&s=s/../d", "https://sss:3894/x/y/aaa?a=d&s=s/../d"},
	}
	for _, tc := range testCases {
		u, err := url.Parse(tc.requestURI)
		if err != nil {
			t.Fatalf("cannot parse %q: %s", tc.requestURI, err)
		}
		if target := createTargetURL(tc.prefix, u); target != tc.expectedTarget {
			t.Fatalf("unexpected target; got %q; want %q", target, tc.expectedTarget)
		}
	}
}

View File

@@ -3,6 +3,9 @@
vmbackup:
APP_NAME=vmbackup $(MAKE) app-local
vmbackup-race:
APP_NAME=vmbackup RACE=-race $(MAKE) app-local
vmbackup-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker

View File

@@ -140,14 +140,28 @@ Run `vmbackup -help` in order to see all the available options:
-dst string
Where to put the backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, ERROR, FATAL, PANIC (default "INFO")
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-maxBytesPerSecond int
The maximum upload speed. There is no limit if it is set to 0
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy (default 60)
Allowed percent of system memory VictoriaMetrics caches may occupy. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
-origin string
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
-snapshot.createURL string
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create
-snapshot.deleteURL string
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete
-snapshotName string
Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots
-storageDataPath string
@@ -164,7 +178,7 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmbackup` from the root folder of the repository.
It builds `vmbackup` binary and puts it into the `bin` folder.
@@ -179,3 +193,10 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmbackup
```

View File

@@ -1,7 +1,6 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG base_image
FROM $base_image
ENTRYPOINT ["/vmbackup-prod"]
ARG src_binary
COPY $src_binary ./vmbackup-prod
ENTRYPOINT ["/vmbackup-prod"]

View File

@@ -4,7 +4,9 @@ import (
"flag"
"fmt"
"os"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmbackup/snapshot"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fslocal"
@@ -14,9 +16,13 @@ import (
)
var (
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots")
dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots")
snapshotCreateURL = flag.String("snapshot.createURL", "", "VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup."+
"Example: http://victoriametrics:8428/snaphsot/create")
snapshotDeleteURL = flag.String("snapshot.deleteURL", "", "VictoriaMetrics delete snapshot url. Optional. Will be generated from snapshotCreateURL if not provided. All created snaphosts will be automatically deleted."+
"Example: http://victoriametrics:8428/snaphsot/delete")
dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir\n"+
"-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded")
origin = flag.String("origin", "", "Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups")
@@ -29,6 +35,34 @@ func main() {
envflag.Parse()
buildinfo.Init()
if len(*snapshotCreateURL) > 0 {
logger.Infof("%s", "Snapshots enabled")
logger.Infof("Snapshot create url %s", *snapshotCreateURL)
if len(*snapshotDeleteURL) <= 0 {
err := flag.Set("snapshot.deleteURL", strings.Replace(*snapshotCreateURL, "/create", "/delete", 1))
if err != nil {
logger.Fatalf("Failed to set snapshot.deleteURL flag: %v", err)
}
}
logger.Infof("Snapshot delete url %s", *snapshotDeleteURL)
name, err := snapshot.Create(*snapshotCreateURL)
if err != nil {
logger.Fatalf("%s", err)
}
err = flag.Set("snapshotName", name)
if err != nil {
logger.Fatalf("Failed to set snapshotName flag: %v", err)
}
defer func() {
err := snapshot.Delete(*snapshotDeleteURL, name)
if err != nil {
logger.Fatalf("%s", err)
}
}()
}
srcFS, err := newSrcFS()
if err != nil {
logger.Fatalf("%s", err)
@@ -67,7 +101,7 @@ See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/a
func newSrcFS() (*fslocal.FS, error) {
if len(*snapshotName) == 0 {
return nil, fmt.Errorf("`-snapshotName` cannot be empty")
return nil, fmt.Errorf("`-snapshotName` or `-snapshot.createURL` must be provided")
}
snapshotPath := *storageDataPath + "/snapshots/" + *snapshotName

View File

@@ -0,0 +1,91 @@
package snapshot
import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// snapshot is the JSON response body decoded by Create and Delete.
type snapshot struct {
// Status is checked against "ok" and "error"; any other value is reported as an error.
Status string `json:"status"`
// Snapshot is the snapshot name returned by Create when Status is "ok".
Snapshot string `json:"snapshot"`
// Msg is used as the error text when Status is "error".
Msg string `json:"msg"`
}
// Create creates a snapshot at the provided api endpoint and returns
// the snapshot name.
//
// It issues a GET request to createSnapshotURL and decodes the JSON response.
// An error is returned if the url is invalid, the request or the decoding
// fails, or the response status isn't "ok".
func Create(createSnapshotURL string) (string, error) {
	logger.Infof("%s", "Creating snapshot")
	u, err := url.Parse(createSnapshotURL)
	if err != nil {
		return "", err
	}
	resp, err := http.Get(u.String())
	if err != nil {
		return "", err
	}
	// The body must be closed in order to release the underlying connection.
	defer func() { _ = resp.Body.Close() }()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	snap := snapshot{}
	if err := json.Unmarshal(body, &snap); err != nil {
		return "", err
	}

	switch snap.Status {
	case "ok":
		logger.Infof("Snapshot %s created", snap.Snapshot)
		return snap.Snapshot, nil
	case "error":
		return "", errors.New(snap.Msg)
	default:
		return "", fmt.Errorf("Unkown status: %v", snap.Status)
	}
}
// Delete deletes the snapshot with the given snapshotName at the provided
// api endpoint and returns any failure.
//
// It posts snapshotName as the `snapshot` form value to deleteSnapshotURL
// and decodes the JSON response. An error is returned if the url is invalid,
// the request or the decoding fails, or the response status isn't "ok".
func Delete(deleteSnapshotURL string, snapshotName string) error {
	logger.Infof("Deleting snapshot %s", snapshotName)
	formData := url.Values{
		"snapshot": {snapshotName},
	}
	u, err := url.Parse(deleteSnapshotURL)
	if err != nil {
		return err
	}
	resp, err := http.PostForm(u.String(), formData)
	if err != nil {
		return err
	}
	// The body must be closed in order to release the underlying connection.
	defer func() { _ = resp.Body.Close() }()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	snap := snapshot{}
	if err := json.Unmarshal(body, &snap); err != nil {
		return err
	}

	switch snap.Status {
	case "ok":
		logger.Infof("Snapshot %s deleted", snapshotName)
		return nil
	case "error":
		return errors.New(snap.Msg)
	default:
		return fmt.Errorf("Unkown status: %v", snap.Status)
	}
}

View File

@@ -0,0 +1,106 @@
package snapshot
import (
"io"
"net/http"
"net/http/httptest"
"testing"
)
// TestCreateSnapshot verifies that Create returns the snapshot name reported
// by the server in a successful response.
func TestCreateSnapshot(t *testing.T) {
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on a server goroutine, so failures are reported with
		// t.Errorf: t.Fatalf may only be called from the goroutine running the test.
		if r.URL.Path != "/snapshot/create" {
			t.Errorf("Invalid path, got %v", r.URL.Path)
			http.Error(w, "invalid path", http.StatusNotFound)
			return
		}
		if _, err := io.WriteString(w, `{"status":"ok","snapshot":"mysnapshot"}`); err != nil {
			t.Errorf("Failed to write response output: %v", err)
		}
	})

	server := httptest.NewServer(handler)
	defer server.Close()

	snapshotName, err := Create(server.URL + "/snapshot/create")
	if err != nil {
		t.Fatalf("Failed taking snapshot: %v", err)
	}
	if snapshotName != "mysnapshot" {
		t.Fatalf("Snapshot name is not correct, got %v", snapshotName)
	}
}
// TestCreateSnapshotFailed verifies that Create returns an error when the
// server responds with the "error" status.
func TestCreateSnapshotFailed(t *testing.T) {
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on a server goroutine, so failures are reported with
		// t.Errorf: t.Fatalf may only be called from the goroutine running the test.
		if r.URL.Path != "/snapshot/create" {
			t.Errorf("Invalid path, got %v", r.URL.Path)
			http.Error(w, "invalid path", http.StatusNotFound)
			return
		}
		if _, err := io.WriteString(w, `{"status":"error","msg":"I am unwell"}`); err != nil {
			t.Errorf("Failed to write response output: %v", err)
		}
	})

	server := httptest.NewServer(handler)
	defer server.Close()

	snapshotName, err := Create(server.URL + "/snapshot/create")
	if err == nil {
		t.Fatalf("Snapshot did not fail, got snapshot: %v", snapshotName)
	}
}
// TestDeleteSnapshot verifies that Delete posts the snapshot name to the
// endpoint and succeeds when the server responds with the "ok" status.
func TestDeleteSnapshot(t *testing.T) {
	snapshotName := "mysnapshot"
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on a server goroutine, so failures are reported with
		// t.Errorf: t.Fatalf may only be called from the goroutine running the test.
		if r.URL.Path != "/snapshot/delete" {
			t.Errorf("Invalid path, got %v", r.URL.Path)
			http.Error(w, "invalid path", http.StatusNotFound)
			return
		}
		// Report the value actually received rather than the expected one.
		if got := r.FormValue("snapshot"); got != snapshotName {
			t.Errorf("Invalid snapshot name, got %v", got)
			http.Error(w, "invalid snapshot name", http.StatusBadRequest)
			return
		}
		if _, err := io.WriteString(w, `{"status":"ok"}`); err != nil {
			t.Errorf("Failed to write response output: %v", err)
		}
	})

	server := httptest.NewServer(handler)
	defer server.Close()

	err := Delete(server.URL+"/snapshot/delete", snapshotName)
	if err != nil {
		t.Fatalf("Failed to delete snapshot: %v", err)
	}
}
// TestDeleteSnapshotFailed verifies that Delete returns an error when the
// server responds with the "error" status.
func TestDeleteSnapshotFailed(t *testing.T) {
	snapshotName := "mysnapshot"
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on a server goroutine, so failures are reported with
		// t.Errorf: t.Fatalf may only be called from the goroutine running the test.
		if r.URL.Path != "/snapshot/delete" {
			t.Errorf("Invalid path, got %v", r.URL.Path)
			http.Error(w, "invalid path", http.StatusNotFound)
			return
		}
		// Report the value actually received rather than the expected one.
		if got := r.FormValue("snapshot"); got != snapshotName {
			t.Errorf("Invalid snapshot name, got %v", got)
			http.Error(w, "invalid snapshot name", http.StatusBadRequest)
			return
		}
		if _, err := io.WriteString(w, `{"status":"error", "msg":"failed to delete"}`); err != nil {
			t.Errorf("Failed to write response output: %v", err)
		}
	})

	server := httptest.NewServer(handler)
	defer server.Close()

	err := Delete(server.URL+"/snapshot/delete", snapshotName)
	if err == nil {
		t.Fatalf("Snapshot should have failed, got: %v", err)
	}
}

View File

@@ -0,0 +1,44 @@
package csvimport
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
	// rowsInserted is increased by len(rows) on every insertRows call.
	rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="csvimport"}`)
	// rowsPerInsert is updated with len(rows) on every insertRows call.
	rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="csvimport"}`)
)
// InsertHandler processes /api/v1/import/csv requests.
func InsertHandler(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, func(rows []parser.Row) error {
return insertRows(rows)
})
})
}
// insertRows writes the given parsed CSV rows via a pooled insert context,
// updates the row metrics and returns the result of flushing the context.
func insertRows(rows []parser.Row) error {
	ctx := common.GetInsertCtx()
	defer common.PutInsertCtx(ctx)

	n := len(rows)
	ctx.Reset(n)
	for i := 0; i < n; i++ {
		row := &rows[i]
		// Labels are rebuilt from scratch for every row; the metric name is
		// added as the label with the empty key.
		ctx.Labels = ctx.Labels[:0]
		ctx.AddLabel("", row.Metric)
		for j := 0; j < len(row.Tags); j++ {
			ctx.AddLabel(row.Tags[j].Key, row.Tags[j].Value)
		}
		ctx.WriteDataPoint(nil, ctx.Labels, row.Timestamp, row.Value)
	}
	rowsInserted.Add(n)
	rowsPerInsert.Update(float64(n))
	return ctx.FlushBufs()
}

View File

@@ -2,6 +2,7 @@ package influx
import (
"flag"
"io"
"net/http"
"runtime"
"sync"
@@ -24,12 +25,26 @@ var (
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="influx"}`)
)
// InsertHandler processes remote write for influx line protocol.
// InsertHandlerForReader processes remote write for influx line protocol.
//
// See https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener/
func InsertHandlerForReader(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, false, "", "", insertRows)
})
}
// InsertHandlerForHTTP processes remote write for influx line protocol.
//
// See https://github.com/influxdata/influxdb/blob/4cbdc197b8117fee648d62e2e5be75c6575352f0/tsdb/README.md
func InsertHandler(req *http.Request) error {
func InsertHandlerForHTTP(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
q := req.URL.Query()
precision := q.Get("precision")
// Read db tag from https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint
db := q.Get("db")
return parser.ParseStream(req.Body, isGzipped, precision, db, insertRows)
})
}

View File

@@ -6,6 +6,7 @@ import (
"net/http"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/opentsdb"
@@ -15,8 +16,10 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
opentsdbhttpserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
@@ -25,6 +28,7 @@ import (
var (
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty")
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpentTSDB metrics. "+
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
"Usually :4242 must be set. Doesn't work if empty")
@@ -33,6 +37,7 @@ var (
)
var (
influxServer *influxserver.Server
graphiteServer *graphiteserver.Server
opentsdbServer *opentsdbserver.Server
opentsdbhttpServer *opentsdbhttpserver.Server
@@ -43,6 +48,9 @@ func Init() {
storage.SetMaxLabelsPerTimeseries(*maxLabelsPerTimeseries)
writeconcurrencylimiter.Init()
if len(*influxListenAddr) > 0 {
influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
}
if len(*graphiteListenAddr) > 0 {
graphiteServer = graphiteserver.MustStart(*graphiteListenAddr, graphite.InsertHandler)
}
@@ -58,6 +66,9 @@ func Init() {
// Stop stops vminsert.
func Stop() {
promscrape.Stop()
if len(*influxListenAddr) > 0 {
influxServer.MustStop()
}
if len(*graphiteListenAddr) > 0 {
graphiteServer.MustStop()
}
@@ -91,9 +102,18 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import/csv":
csvimportRequests.Inc()
if err := csvimport.InsertHandler(r); err != nil {
csvimportErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/write", "/api/v2/write":
influxWriteRequests.Inc()
if err := influx.InsertHandler(r); err != nil {
if err := influx.InsertHandlerForHTTP(r); err != nil {
influxWriteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
@@ -106,6 +126,16 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
influxQueryRequests.Inc()
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
return true
case "/targets":
promscrapeTargetsRequests.Inc()
w.Header().Set("Content-Type", "text/plain")
promscrape.WriteHumanReadableTargetsStatus(w)
return true
case "/-/reload":
promscrapeConfigReloadRequests.Inc()
procutil.SelfSIGHUP()
w.WriteHeader(http.StatusNoContent)
return true
default:
// This is not our link
return false
@@ -113,14 +143,21 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
var (
prometheusWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/write", protocol="prometheus"}`)
prometheusWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/write", protocol="prometheus"}`)
prometheusWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/write", protocol="promremotewrite"}`)
prometheusWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/write", protocol="promremotewrite"}`)
vmimportRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/import", protocol="vm"}`)
vmimportErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/import", protocol="vm"}`)
vmimportRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/import", protocol="vmimport"}`)
vmimportErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/import", protocol="vmimport"}`)
csvimportRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/import/csv", protocol="csvimport"}`)
csvimportErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/import/csv", protocol="csvimport"}`)
influxWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/write", protocol="influx"}`)
influxWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/write", protocol="influx"}`)
influxQueryRequests = metrics.NewCounter(`vm_http_requests_total{path="/query", protocol="influx"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vm_http_requests_total{path="/targets"}`)
promscrapeConfigReloadRequests = metrics.NewCounter(`vm_http_requests_total{path="/-/reload"}`)
)

View File

@@ -17,22 +17,38 @@ var (
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="promscrape"}`)
)
// Push pushes wr to to storage.
const maxRowsPerBlock = 10000
// Push pushes wr to storage.
func Push(wr *prompbmarshal.WriteRequest) {
ctx := getPushCtx()
defer putPushCtx(ctx)
timeseries := wr.Timeseries
tss := wr.Timeseries
for len(tss) > 0 {
// Process big tss in smaller blocks in order to reduce maximum memory usage
tssBlock := tss
if len(tssBlock) > maxRowsPerBlock {
tssBlock = tss[:maxRowsPerBlock]
tss = tss[maxRowsPerBlock:]
} else {
tss = nil
}
ctx.push(tssBlock)
}
}
func (ctx *pushCtx) push(tss []prompbmarshal.TimeSeries) {
rowsLen := 0
for i := range timeseries {
rowsLen += len(timeseries[i].Samples)
for i := range tss {
rowsLen += len(tss[i].Samples)
}
ic := &ctx.Common
ic.Reset(rowsLen)
rowsTotal := 0
labels := ctx.labels[:0]
for i := range timeseries {
ts := &timeseries[i]
for i := range tss {
ts := &tss[i]
labels = labels[:0]
for j := range ts.Labels {
label := &ts.Labels[j]

View File

@@ -3,6 +3,9 @@
vmrestore:
APP_NAME=vmrestore $(MAKE) app-local
vmrestore-race:
APP_NAME=vmrestore RACE=-race $(MAKE) app-local
vmrestore-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker

View File

@@ -47,12 +47,24 @@ Run `vmrestore -help` in order to see all the available options:
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-customS3Endpoint string
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, ERROR, FATAL, PANIC (default "INFO")
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-maxBytesPerSecond int
The maximum download speed. There is no limit if it is set to 0
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy (default 60)
Allowed percent of system memory VictoriaMetrics caches may occupy. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
-skipBackupCompleteCheck
Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file
-src string
Source path with backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
-storageDataPath string
@@ -69,7 +81,7 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmrestore` from the root folder of the repository.
It builds `vmrestore` binary and puts it into the `bin` folder.
@@ -84,3 +96,10 @@ It is recommended using [binary releases](https://github.com/VictoriaMetrics/Vic
Run `make package-vmrestore`. It builds `victoriametrics/vmrestore:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmrestore`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmrestore
```

View File

@@ -1,7 +1,6 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG base_image
FROM $base_image
ENTRYPOINT ["/vmrestore-prod"]
ARG src_binary
COPY $src_binary ./vmrestore-prod
ENTRYPOINT ["/vmrestore-prod"]

View File

@@ -8,11 +8,9 @@ import (
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
@@ -24,7 +22,7 @@ var (
maxConcurrentRequests = flag.Int("search.maxConcurrentRequests", getDefaultMaxConcurrentRequests(), "The maximum number of concurrent search requests. "+
"It shouldn't be high, since a single request can saturate all the CPU cores. See also -search.maxQueueDuration")
maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached")
resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetCache call")
resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call")
)
func getDefaultMaxConcurrentRequests() int {
@@ -43,9 +41,6 @@ func getDefaultMaxConcurrentRequests() int {
// Init initializes vmselect
func Init() {
tmpDirPath := *vmstorage.DataPath + "/tmp"
fs.RemoveDirContents(tmpDirPath)
netstorage.InitTmpBlocksDir(tmpDirPath)
promql.InitRollupResultCache(*vmstorage.DataPath + "/cache/rollupResult")
concurrencyCh = make(chan struct{}, *maxConcurrentRequests)
@@ -179,6 +174,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
return true
}
return true
case "/api/v1/status/tsdb":
tsdbStatusRequests.Inc()
if err := prometheus.TSDBStatusHandler(startTime, w, r); err != nil {
tsdbStatusErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
case "/api/v1/export":
exportRequests.Inc()
if err := prometheus.ExportHandler(startTime, w, r); err != nil {
@@ -191,7 +194,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
federateRequests.Inc()
if err := prometheus.FederateHandler(startTime, w, r); err != nil {
federateErrors.Inc()
httpserver.Errorf(w, "error int %q: %s", r.URL.Path, err)
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
return true
@@ -233,7 +236,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) {
logger.Errorf("error in %q: %s", r.RequestURI, err)
logger.Warnf("error in %q: %s", r.RequestURI, err)
w.Header().Set("Content-Type", "application/json")
statusCode := http.StatusUnprocessableEntity
@@ -266,6 +269,9 @@ var (
labelsCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/labels/count"}`)
labelsCountErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/labels/count"}`)
tsdbStatusRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/tsdb"}`)
tsdbStatusErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/tsdb"}`)
deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/admin/tsdb/delete_series"}`)
deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/admin/tsdb/delete_series"}`)

View File

@@ -53,9 +53,8 @@ type Results struct {
fetchData bool
deadline Deadline
tbf *tmpBlocksFile
packedTimeseries []packedTimeseries
sr *storage.Search
}
// Len returns the number of results in rss.
@@ -65,8 +64,12 @@ func (rss *Results) Len() int {
// Cancel cancels rss work.
func (rss *Results) Cancel() {
putTmpBlocksFile(rss.tbf)
rss.tbf = nil
rss.mustClose()
}
func (rss *Results) mustClose() {
putStorageSearch(rss.sr)
rss.sr = nil
}
// RunParallel runs in parallel f for all the results from rss.
@@ -76,10 +79,7 @@ func (rss *Results) Cancel() {
//
// rss becomes unusable after the call to RunParallel.
func (rss *Results) RunParallel(f func(rs *Result, workerID uint)) error {
defer func() {
putTmpBlocksFile(rss.tbf)
rss.tbf = nil
}()
defer rss.mustClose()
workersCount := 1 + len(rss.packedTimeseries)/32
if workersCount > gomaxprocs {
@@ -106,7 +106,7 @@ func (rss *Results) RunParallel(f func(rs *Result, workerID uint)) error {
err = fmt.Errorf("timeout exceeded during query execution: %s", rss.deadline.String())
break
}
if err = pts.Unpack(rss.tbf, rs, rss.tr, rss.fetchData, maxWorkersCount); err != nil {
if err = pts.Unpack(rs, rss.tr, rss.fetchData, maxWorkersCount); err != nil {
break
}
if len(rs.Timestamps) == 0 && rss.fetchData {
@@ -156,18 +156,18 @@ var gomaxprocs = runtime.GOMAXPROCS(-1)
type packedTimeseries struct {
metricName string
addrs []tmpBlockAddr
brs []storage.BlockRef
}
// Unpack unpacks pts to dst.
func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage.TimeRange, fetchData bool, maxWorkersCount int) error {
func (pts *packedTimeseries) Unpack(dst *Result, tr storage.TimeRange, fetchData bool, maxWorkersCount int) error {
dst.reset()
if err := dst.MetricName.Unmarshal(bytesutil.ToUnsafeBytes(pts.metricName)); err != nil {
return fmt.Errorf("cannot unmarshal metricName %q: %s", pts.metricName, err)
}
workersCount := 1 + len(pts.addrs)/32
workersCount := 1 + len(pts.brs)/32
if workersCount > maxWorkersCount {
workersCount = maxWorkersCount
}
@@ -175,19 +175,19 @@ func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage.
logger.Panicf("BUG: workersCount cannot be zero")
}
sbs := make([]*sortBlock, 0, len(pts.addrs))
sbs := make([]*sortBlock, 0, len(pts.brs))
var sbsLock sync.Mutex
workCh := make(chan tmpBlockAddr, workersCount)
workCh := make(chan storage.BlockRef, workersCount)
doneCh := make(chan error)
// Start workers
for i := 0; i < workersCount; i++ {
go func() {
var err error
for addr := range workCh {
for br := range workCh {
sb := getSortBlock()
if err = sb.unpackFrom(tbf, addr, tr, fetchData); err != nil {
if err = sb.unpackFrom(br, tr, fetchData); err != nil {
break
}
@@ -204,10 +204,10 @@ func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage.
}
// Feed workers with work
for _, addr := range pts.addrs {
workCh <- addr
for _, br := range pts.brs {
workCh <- br
}
pts.addrs = pts.addrs[:0]
pts.brs = pts.brs[:0]
close(workCh)
// Wait until workers finish
@@ -288,9 +288,15 @@ func mergeSortBlocks(dst *Result, sbh sortBlocksHeap) {
}
}
dst.Timestamps, dst.Values = storage.DeduplicateSamples(dst.Timestamps, dst.Values)
timestamps, values := storage.DeduplicateSamples(dst.Timestamps, dst.Values)
dedups := len(dst.Timestamps) - len(timestamps)
dedupsDuringSelect.Add(dedups)
dst.Timestamps = timestamps
dst.Values = values
}
var dedupsDuringSelect = metrics.NewCounter(`vm_deduplicated_samples_total{type="select"}`)
type sortBlock struct {
// b is used as a temporary storage for unpacked rows before they
// go to Timestamps and Values.
@@ -308,8 +314,8 @@ func (sb *sortBlock) reset() {
sb.NextIdx = 0
}
func (sb *sortBlock) unpackFrom(tbf *tmpBlocksFile, addr tmpBlockAddr, tr storage.TimeRange, fetchData bool) error {
tbf.MustReadBlockAt(&sb.b, addr)
func (sb *sortBlock) unpackFrom(br storage.BlockRef, tr storage.TimeRange, fetchData bool) error {
br.MustReadBlock(&sb.b, fetchData)
if fetchData {
if err := sb.b.UnmarshalData(); err != nil {
return fmt.Errorf("cannot unmarshal block: %s", err)
@@ -443,6 +449,15 @@ func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) {
return labelEntries, nil
}
// GetTSDBStatusForDate returns tsdb status according to https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats
func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TSDBStatus, error) {
status, err := vmstorage.GetTSDBStatusForDate(date, topN)
if err != nil {
return nil, fmt.Errorf("error during tsdb status request: %s", err)
}
return status, nil
}
// GetSeriesCount returns the number of unique series.
func GetSeriesCount(deadline Deadline) (uint64, error) {
n, err := vmstorage.GetSeriesCount()
@@ -468,6 +483,8 @@ func putStorageSearch(sr *storage.Search) {
var ssPool sync.Pool
// ProcessSearchQuery performs sq on storage nodes until the given deadline.
//
// Results.RunParallel or Results.Cancel must be called on the returned Results.
func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadline) (*Results, error) {
// Setup search.
tfss, err := setupTfss(sq.TagFilterss)
@@ -483,60 +500,40 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline Deadli
defer vmstorage.WG.Done()
sr := getStorageSearch()
defer putStorageSearch(sr)
sr.Init(vmstorage.Storage, tfss, tr, fetchData, *maxMetricsPerSearch)
sr.Init(vmstorage.Storage, tfss, tr, *maxMetricsPerSearch)
tbf := getTmpBlocksFile()
m := make(map[string][]tmpBlockAddr)
m := make(map[string][]storage.BlockRef)
var orderedMetricNames []string
blocksRead := 0
bb := tmpBufPool.Get()
defer tmpBufPool.Put(bb)
for sr.NextMetricBlock() {
blocksRead++
bb.B = storage.MarshalBlock(bb.B[:0], sr.MetricBlock.Block)
addr, err := tbf.WriteBlockData(bb.B)
if err != nil {
putTmpBlocksFile(tbf)
return nil, fmt.Errorf("cannot write data block #%d to temporary blocks file: %s", blocksRead, err)
}
if time.Until(deadline.Deadline) < 0 {
putTmpBlocksFile(tbf)
return nil, fmt.Errorf("timeout exceeded while fetching data block #%d from storage: %s", blocksRead, deadline.String())
}
metricName := sr.MetricBlock.MetricName
m[string(metricName)] = append(m[string(metricName)], addr)
metricName := sr.MetricBlockRef.MetricName
brs := m[string(metricName)]
if len(brs) == 0 {
orderedMetricNames = append(orderedMetricNames, string(metricName))
}
m[string(metricName)] = append(brs, *sr.MetricBlockRef.BlockRef)
}
if err := sr.Error(); err != nil {
putTmpBlocksFile(tbf)
return nil, fmt.Errorf("search error after reading %d data blocks: %s", blocksRead, err)
}
if err := tbf.Finalize(); err != nil {
putTmpBlocksFile(tbf)
return nil, fmt.Errorf("cannot finalize temporary blocks file with %d blocks: %s", blocksRead, err)
}
var rss Results
rss.packedTimeseries = make([]packedTimeseries, len(m))
rss.tr = tr
rss.fetchData = fetchData
rss.deadline = deadline
rss.tbf = tbf
i := 0
for metricName, addrs := range m {
pts := &rss.packedTimeseries[i]
i++
pts.metricName = metricName
pts.addrs = addrs
pts := make([]packedTimeseries, len(orderedMetricNames))
for i, metricName := range orderedMetricNames {
pts[i] = packedTimeseries{
metricName: metricName,
brs: m[metricName],
}
}
// Sort rss.packedTimeseries by the first addr offset in order
// to reduce the number of disk seeks during unpacking in RunParallel.
// In this case tmpBlocksFile must be read almost sequentially.
sort.Slice(rss.packedTimeseries, func(i, j int) bool {
pts := rss.packedTimeseries
return pts[i].addrs[0].offset < pts[j].addrs[0].offset
})
rss.packedTimeseries = pts
rss.sr = sr
return &rss, nil
}
@@ -570,6 +567,7 @@ func setupTfss(tagFilterss [][]storage.TagFilter) ([]*storage.TagFilters, error)
}
}
tfss = append(tfss, tfs)
tfss = append(tfss, tfs.Finalize()...)
}
return tfss, nil
}

View File

@@ -1,185 +0,0 @@
package netstorage
import (
"fmt"
"io/ioutil"
"os"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
)
// InitTmpBlocksDir prepares the directory used for temporary search results.
//
// The system-defined temporary directory is used as the parent
// if tmpDirPath is empty. Any previous contents of the directory are removed.
func InitTmpBlocksDir(tmpDirPath string) {
	baseDir := tmpDirPath
	if baseDir == "" {
		baseDir = os.TempDir()
	}
	tmpBlocksDir = baseDir + "/searchResults"
	fs.MustRemoveAll(tmpBlocksDir)
	err := fs.MkdirAllIfNotExist(tmpBlocksDir)
	if err != nil {
		logger.Panicf("FATAL: cannot create %q: %s", tmpBlocksDir, err)
	}
}

// tmpBlocksDir is the directory set by InitTmpBlocksDir.
var tmpBlocksDir string
// maxInmemoryTmpBlocksFile returns memory.Allowed()/1024 clamped
// to the range [64KiB, 4MiB].
func maxInmemoryTmpBlocksFile() int {
	const (
		minSize = 64 * 1024
		maxSize = 4 * 1024 * 1024
	)
	size := memory.Allowed() / 1024
	switch {
	case size < minSize:
		return minSize
	case size > maxSize:
		return maxSize
	default:
		return size
	}
}
// Register a gauge reporting the value returned by maxInmemoryTmpBlocksFile.
var _ = metrics.NewGauge(`vm_tmp_blocks_max_inmemory_file_size_bytes`, func() float64 {
	return float64(maxInmemoryTmpBlocksFile())
})
// tmpBlocksFile stores block data in memory and spills it to a temporary file
// when the data no longer fits the capacity of buf.
type tmpBlocksFile struct {
	// buf holds the block data that has not been written to f yet.
	buf []byte

	// f is the temporary file. It is nil until WriteBlockData creates it.
	f *os.File

	// r is the reader for f. It is opened by Finalize.
	r *fs.ReaderAt

	// offset is the total size of the data passed to WriteBlockData so far.
	offset uint64
}
// getTmpBlocksFile returns a tmpBlocksFile from the pool, or a new one with
// a buffer capacity of maxInmemoryTmpBlocksFile() bytes if the pool is empty.
func getTmpBlocksFile() *tmpBlocksFile {
	if pooled := tmpBlocksFilePool.Get(); pooled != nil {
		return pooled.(*tmpBlocksFile)
	}
	return &tmpBlocksFile{
		buf: make([]byte, 0, maxInmemoryTmpBlocksFile()),
	}
}
// putTmpBlocksFile closes tbf, resets its state while keeping the buffer
// capacity, and returns it to the pool.
func putTmpBlocksFile(tbf *tmpBlocksFile) {
	tbf.MustClose()
	// Keep only the truncated buffer; every other field goes back to its zero value.
	emptyBuf := tbf.buf[:0]
	*tbf = tmpBlocksFile{buf: emptyBuf}
	tmpBlocksFilePool.Put(tbf)
}

// tmpBlocksFilePool holds reusable tmpBlocksFile objects.
var tmpBlocksFilePool sync.Pool
// tmpBlockAddr is the location of a block: its starting offset and its size.
type tmpBlockAddr struct {
	offset uint64
	size   int
}

// String returns a human-readable representation of the address.
func (a tmpBlockAddr) String() string {
	off, sz := a.offset, a.size
	return fmt.Sprintf("offset %d, size %d", off, sz)
}
// tmpBlocksFilesCreated is incremented every time WriteBlockData creates a temporary file.
var tmpBlocksFilesCreated = metrics.NewCounter(`vm_tmp_blocks_files_created_total`)

// WriteBlockData writes b to tbf.
//
// It returns errors since the operation may fail on space shortage
// and this must be handled.
//
// The returned address holds the offset and the size of b inside tbf.
func (tbf *tmpBlocksFile) WriteBlockData(b []byte) (tmpBlockAddr, error) {
	var addr tmpBlockAddr
	addr.offset = tbf.offset
	addr.size = len(b)
	// The offset advances whether b ends up in tbf.buf or in the file.
	tbf.offset += uint64(addr.size)
	if len(tbf.buf)+len(b) <= cap(tbf.buf) {
		// Fast path - the data fits tbf.buf
		tbf.buf = append(tbf.buf, b...)
		return addr, nil
	}

	// Slow path: flush the data from tbf.buf to file.
	if tbf.f == nil {
		// The temporary file is created lazily, on the first overflow of tbf.buf.
		f, err := ioutil.TempFile(tmpBlocksDir, "")
		if err != nil {
			return addr, err
		}
		tbf.f = f
		tmpBlocksFilesCreated.Inc()
	}
	_, err := tbf.f.Write(tbf.buf)
	// b replaces the contents of tbf.buf before the write error is checked.
	tbf.buf = append(tbf.buf[:0], b...)
	if err != nil {
		return addr, fmt.Errorf("cannot write block to %q: %s", tbf.f.Name(), err)
	}
	return addr, nil
}
// Finalize flushes the remaining buffered data to the temporary file
// and opens the file for reading.
//
// It is a no-op if the temporary file has not been created,
// i.e. all the data is still in tbf.buf.
// It returns an error if the remaining data cannot be written to the file
// and panics if the file cannot be opened for reading.
func (tbf *tmpBlocksFile) Finalize() error {
	if tbf.f == nil {
		return nil
	}
	fname := tbf.f.Name()
	if _, err := tbf.f.Write(tbf.buf); err != nil {
		return fmt.Errorf("cannot write the remaining %d bytes to %q: %s", len(tbf.buf), fname, err)
	}
	tbf.buf = tbf.buf[:0]
	r, err := fs.OpenReaderAt(fname)
	if err != nil {
		logger.Panicf("FATAL: cannot open %q: %s", fname, err)
	}
	// Hint the OS that the file is read almost sequentially.
	// This should reduce the number of disk seeks, which is important
	// for HDDs.
	r.MustFadviseSequentialRead(true)
	tbf.r = r
	return nil
}
// MustReadBlockAt reads the block located at addr into dst.
//
// The data is taken from tbf.buf if the temporary file has not been created;
// otherwise it is read from the file via tbf.r.
// It panics if the data cannot be unmarshaled into dst or if unmarshaling
// leaves a non-empty tail.
func (tbf *tmpBlocksFile) MustReadBlockAt(dst *storage.Block, addr tmpBlockAddr) {
	var buf []byte
	if tbf.f == nil {
		// All the data is in memory, so addr indexes directly into tbf.buf.
		buf = tbf.buf[addr.offset : addr.offset+uint64(addr.size)]
	} else {
		// Read addr.size bytes at addr.offset from the file into a pooled buffer.
		// The buffer goes back to the pool when the function returns.
		// NOTE(review): this presumably relies on storage.UnmarshalBlock not
		// retaining references to buf - confirm.
		bb := tmpBufPool.Get()
		defer tmpBufPool.Put(bb)
		bb.B = bytesutil.Resize(bb.B, addr.size)
		tbf.r.MustReadAt(bb.B, int64(addr.offset))
		buf = bb.B
	}
	tail, err := storage.UnmarshalBlock(dst, buf)
	if err != nil {
		logger.Panicf("FATAL: cannot unmarshal data at %s: %s", addr, err)
	}
	if len(tail) > 0 {
		logger.Panicf("FATAL: unexpected non-empty tail left after unmarshaling data at %s; len(tail)=%d", addr, len(tail))
	}
}

// tmpBufPool is a pool of byte buffers used for reading blocks in MustReadBlockAt.
var tmpBufPool bytesutil.ByteBufferPool
// MustClose closes the reader, then removes and closes the temporary file.
//
// It is a no-op if the temporary file has not been created.
// It panics if the file cannot be removed or closed.
func (tbf *tmpBlocksFile) MustClose() {
	if tbf.f == nil {
		return
	}
	if tbf.r != nil {
		// tbf.r could be nil if Finalize wasn't called.
		tbf.r.MustClose()
	}
	fname := tbf.f.Name()

	// Remove the file at first, then close it.
	// This way the OS shouldn't try to flush file contents to storage
	// on close.
	if err := os.Remove(fname); err != nil {
		logger.Panicf("FATAL: cannot remove %q: %s", fname, err)
	}
	if err := tbf.f.Close(); err != nil {
		logger.Panicf("FATAL: cannot close %q: %s", fname, err)
	}
	tbf.f = nil
}

View File

@@ -1,153 +0,0 @@
package netstorage
import (
"fmt"
"math/rand"
"os"
"reflect"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)
// TestMain seeds the random generator, initializes a temporary blocks
// directory for the tests and removes it after the tests are run.
func TestMain(m *testing.M) {
	rand.Seed(time.Now().UnixNano())
	const tmpDir = "TestTmpBlocks"
	InitTmpBlocksDir(tmpDir)
	exitCode := m.Run()
	err := os.RemoveAll(tmpDir)
	if err != nil {
		logger.Panicf("cannot remove %q: %s", tmpDir, err)
	}
	os.Exit(exitCode)
}
// TestTmpBlocksFileSerial runs testTmpBlocksFile once from the test goroutine.
func TestTmpBlocksFileSerial(t *testing.T) {
	err := testTmpBlocksFile()
	if err != nil {
		t.Fatalf("unexpected error: %s", err)
	}
}
// TestTmpBlocksFileConcurrent runs testTmpBlocksFile from several goroutines
// at once and fails on the first error or after a 30 second wait for a result.
func TestTmpBlocksFileConcurrent(t *testing.T) {
	const workers = 3
	resultCh := make(chan error, workers)
	for n := 0; n < workers; n++ {
		go func() {
			resultCh <- testTmpBlocksFile()
		}()
	}
	// Collect one result per worker; the timeout applies to each wait separately.
	for n := 0; n < workers; n++ {
		select {
		case err := <-resultCh:
			if err != nil {
				t.Fatalf("unexpected error: %s", err)
			}
		case <-time.After(30 * time.Second):
			t.Fatalf("timeout")
		}
	}
}
// testTmpBlocksFile writes randomly generated blocks into temporary blocks files
// of several target sizes, then reads the blocks back concurrently
// and verifies that rows count, timestamps and values match the written data.
//
// It returns the first error encountered or nil on success.
func testTmpBlocksFile() error {
	// createBlock returns a block filled with random monotonically increasing
	// timestamps and random values, with its data already marshaled.
	createBlock := func() *storage.Block {
		rowsCount := rand.Intn(8000) + 1
		var timestamps, values []int64
		ts := int64(rand.Intn(1023434))
		for i := 0; i < rowsCount; i++ {
			ts += int64(rand.Intn(1000) + 1)
			timestamps = append(timestamps, ts)
			values = append(values, int64(i*i+rand.Intn(20)))
		}
		tsid := &storage.TSID{
			MetricID: 234211,
		}
		scale := int16(rand.Intn(123))
		precisionBits := uint8(rand.Intn(63) + 1)
		var b storage.Block
		b.Init(tsid, timestamps, values, scale, precisionBits)
		_, _, _ = b.MarshalData(0, 0)
		return &b
	}
	// The sizes are chosen relative to maxInmemoryTmpBlocksFile(): below and above it.
	for _, size := range []int{1024, 16 * 1024, maxInmemoryTmpBlocksFile() / 2, 2 * maxInmemoryTmpBlocksFile()} {
		err := func() error {
			tbf := getTmpBlocksFile()
			defer putTmpBlocksFile(tbf)

			// Write blocks until their summary size exceeds `size`.
			var addrs []tmpBlockAddr
			var blocks []*storage.Block
			bb := tmpBufPool.Get()
			defer tmpBufPool.Put(bb)
			for tbf.offset < uint64(size) {
				b := createBlock()
				bb.B = storage.MarshalBlock(bb.B[:0], b)
				addr, err := tbf.WriteBlockData(bb.B)
				if err != nil {
					return fmt.Errorf("cannot write block at offset %d: %s", tbf.offset, err)
				}
				if addr.offset+uint64(addr.size) != tbf.offset {
					return fmt.Errorf("unexpected addr=%+v for offset %v", &addr, tbf.offset)
				}
				addrs = append(addrs, addr)
				blocks = append(blocks, b)
			}
			if err := tbf.Finalize(); err != nil {
				return fmt.Errorf("cannot finalize tbf: %s", err)
			}

			// Read blocks in parallel and verify them
			concurrency := 2
			workCh := make(chan int)
			// doneCh is buffered, so a worker can always deliver its result and exit,
			// even if the collector below has already returned.
			doneCh := make(chan error, concurrency)
			for i := 0; i < concurrency; i++ {
				go func() {
					err := func() error {
						var b1 storage.Block
						for idx := range workCh {
							addr := addrs[idx]
							b := blocks[idx]
							if err := b.UnmarshalData(); err != nil {
								return fmt.Errorf("cannot unmarshal data from the original block: %s", err)
							}
							b1.Reset()
							tbf.MustReadBlockAt(&b1, addr)
							if err := b1.UnmarshalData(); err != nil {
								return fmt.Errorf("cannot unmarshal data from tbf: %s", err)
							}
							if b1.RowsCount() != b.RowsCount() {
								return fmt.Errorf("unexpected number of rows in tbf block; got %d; want %d", b1.RowsCount(), b.RowsCount())
							}
							if !reflect.DeepEqual(b1.Timestamps(), b.Timestamps()) {
								return fmt.Errorf("unexpected timestamps; got\n%v\nwant\n%v", b1.Timestamps(), b.Timestamps())
							}
							if !reflect.DeepEqual(b1.Values(), b.Values()) {
								return fmt.Errorf("unexpected values; got\n%v\nwant\n%v", b1.Values(), b.Values())
							}
						}
						return nil
					}()
					// Keep consuming workCh after a failure. Otherwise the producer below
					// would block forever on `workCh <- i` once every worker has failed.
					for range workCh {
					}
					doneCh <- err
				}()
			}
			for i := range addrs {
				workCh <- i
			}
			close(workCh)
			for i := 0; i < concurrency; i++ {
				select {
				case err := <-doneCh:
					if err != nil {
						return err
					}
				case <-time.After(time.Second):
					return fmt.Errorf("timeout")
				}
			}
			return nil
		}()
		if err != nil {
			return err
		}
	}
	return nil
}

View File

@@ -3,6 +3,7 @@ package prometheus
import (
"flag"
"fmt"
"io"
"math"
"net/http"
"runtime"
@@ -15,9 +16,10 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/metricsql"
"github.com/valyala/fastjson/fastfloat"
"github.com/valyala/quicktemplate"
)
@@ -28,7 +30,12 @@ var (
maxQueryDuration = flag.Duration("search.maxQueryDuration", time.Second*30, "The maximum duration for search query execution")
maxQueryLen = flag.Int("search.maxQueryLen", 16*1024, "The maximum search query length in bytes")
maxLookback = flag.Duration("search.maxLookback", 0, "Synonim to -search.lookback-delta from Prometheus. "+
"The value is dynamically detected from interval between time series datapoints if not set. It can be overridden on per-query basis via max_lookback arg")
"The value is dynamically detected from interval between time series datapoints if not set. It can be overridden on per-query basis via max_lookback arg. "+
"See also '-search.maxStalenessInterval' flag, which has the same meaining due to historical reasons")
maxStalenessInterval = flag.Duration("search.maxStalenessInterval", 0, "The maximum interval for staleness calculations. "+
"By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning "+
"Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. "+
"See also '-search.maxLookback' flag, which has the same meanining due to historical reasons")
)
// Default step used if not set.
@@ -129,11 +136,12 @@ func ExportHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
return err
}
format := r.FormValue("format")
maxRowsPerLine := int(fastfloat.ParseInt64BestEffort(r.FormValue("max_rows_per_line")))
deadline := getDeadlineForExport(r)
if start >= end {
end = start + defaultStep
}
if err := exportHandler(w, matches, start, end, format, deadline); err != nil {
if err := exportHandler(w, matches, start, end, format, maxRowsPerLine, deadline); err != nil {
return fmt.Errorf("error when exporting data for queries=%q on the time range (start=%d, end=%d): %s", matches, start, end, err)
}
exportDuration.UpdateDuration(startTime)
@@ -142,9 +150,37 @@ func ExportHandler(startTime time.Time, w http.ResponseWriter, r *http.Request)
var exportDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/export"}`)
func exportHandler(w http.ResponseWriter, matches []string, start, end int64, format string, deadline netstorage.Deadline) error {
func exportHandler(w http.ResponseWriter, matches []string, start, end int64, format string, maxRowsPerLine int, deadline netstorage.Deadline) error {
writeResponseFunc := WriteExportStdResponse
writeLineFunc := WriteExportJSONLine
if maxRowsPerLine > 0 {
writeLineFunc = func(w io.Writer, rs *netstorage.Result) {
valuesOrig := rs.Values
timestampsOrig := rs.Timestamps
values := valuesOrig
timestamps := timestampsOrig
for len(values) > 0 {
var valuesChunk []float64
var timestampsChunk []int64
if len(values) > maxRowsPerLine {
valuesChunk = values[:maxRowsPerLine]
timestampsChunk = timestamps[:maxRowsPerLine]
values = values[maxRowsPerLine:]
timestamps = timestamps[maxRowsPerLine:]
} else {
valuesChunk = values
timestampsChunk = timestamps
values = nil
timestamps = nil
}
rs.Values = valuesChunk
rs.Timestamps = timestampsChunk
WriteExportJSONLine(w, rs)
}
rs.Values = valuesOrig
rs.Timestamps = timestampsOrig
}
}
contentType := "application/stream+json"
if format == "prometheus" {
contentType = "text/plain"
@@ -283,12 +319,18 @@ func labelValuesWithMatches(labelName string, matches []string, start, end int64
if err != nil {
return nil, err
}
for i, tfs := range tagFilterss {
// Add `labelName!=''` tag filter in order to filter out series without the labelName.
tagFilterss[i] = append(tfs, storage.TagFilter{
Key: []byte(labelName),
IsNegative: true,
})
// Add `labelName!=''` tag filter in order to filter out series without the labelName.
// There is no need in adding `__name__!=''` filter, since all the time series should
// already have non-empty name.
if labelName != "__name__" {
key := []byte(labelName)
for i, tfs := range tagFilterss {
tagFilterss[i] = append(tfs, storage.TagFilter{
Key: key,
IsNegative: true,
})
}
}
if start >= end {
end = start + defaultStep
@@ -335,7 +377,6 @@ func LabelsCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ
if err != nil {
return fmt.Errorf(`cannot obtain label entries: %s`, err)
}
w.Header().Set("Content-Type", "application/json")
WriteLabelsCountResponse(w, labelEntries)
labelsCountDuration.UpdateDuration(startTime)
@@ -344,6 +385,52 @@ func LabelsCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ
var labelsCountDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/labels/count"}`)
const secsPerDay = 3600 * 24
// TSDBStatusHandler processes /api/v1/status/tsdb request.
//
// The optional `date` form value must be in YYYY-MM-DD format; the current day is used when it is missing.
// The optional `topN` form value defaults to 10 and is clamped to the range [1, 1000].
// Both values are passed to netstorage.GetTSDBStatusForDate, and the result is written to w as JSON.
//
// See https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats
func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
	deadline := getDeadlineForQuery(r)
	if err := r.ParseForm(); err != nil {
		return fmt.Errorf("cannot parse form values: %s", err)
	}

	// date holds the number of whole days since the Unix epoch.
	date := time.Now().Unix() / secsPerDay
	if dateStr := r.FormValue("date"); dateStr != "" {
		t, err := time.Parse("2006-01-02", dateStr)
		if err != nil {
			return fmt.Errorf("cannot parse `date` arg %q: %s", dateStr, err)
		}
		date = t.Unix() / secsPerDay
	}

	topN := 10
	if topNStr := r.FormValue("topN"); topNStr != "" {
		n, err := strconv.Atoi(topNStr)
		if err != nil {
			return fmt.Errorf("cannot parse `topN` arg %q: %s", topNStr, err)
		}
		// Clamp the requested value to [1, 1000].
		switch {
		case n <= 0:
			topN = 1
		case n > 1000:
			topN = 1000
		default:
			topN = n
		}
	}

	status, err := netstorage.GetTSDBStatusForDate(deadline, uint64(date), topN)
	if err != nil {
		return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %s`, date, topN, err)
	}

	w.Header().Set("Content-Type", "application/json")
	WriteTSDBStatusResponse(w, status)
	tsdbStatusDuration.UpdateDuration(startTime)
	return nil
}
var tsdbStatusDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/status/tsdb"}`)
// LabelsHandler processes /api/v1/labels request.
//
// See https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names
@@ -539,20 +626,23 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
if err != nil {
return err
}
queryOffset := getLatencyOffsetMilliseconds()
step, err := getDuration(r, "step", queryOffset)
if err != nil {
return err
}
deadline := getDeadlineForQuery(r)
lookbackDelta, err := getMaxLookback(r)
if err != nil {
return err
}
step, err := getDuration(r, "step", lookbackDelta)
if err != nil {
return err
}
if step <= 0 {
step = defaultStep
}
deadline := getDeadlineForQuery(r)
if len(query) > *maxQueryLen {
return fmt.Errorf("too long query; got %d bytes; mustn't exceed `-search.maxQueryLen=%d` bytes", len(query), *maxQueryLen)
}
queryOffset := getLatencyOffsetMilliseconds()
if !getBool(r, "nocache") && ct-start < queryOffset {
// Adjust start time only if `nocache` arg isn't set.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/241
@@ -570,7 +660,7 @@ func QueryHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) e
start -= offset
end := start
start = end - window
if err := exportHandler(w, []string{childQuery}, start, end, "promapi", deadline); err != nil {
if err := exportHandler(w, []string{childQuery}, start, end, "promapi", 0, deadline); err != nil {
return fmt.Errorf("error when exporting data for query=%q on the time range (start=%d, end=%d): %s", childQuery, start, end, err)
}
queryDuration.UpdateDuration(startTime)
@@ -803,7 +893,15 @@ func getTime(r *http.Request, argKey string, defaultValue int64) (int64, error)
case prometheusMaxTimeFormatted:
return maxTimeMsecs, nil
}
return 0, fmt.Errorf("cannot parse %q=%q: %s", argKey, argValue, err)
// Try parsing duration relative to the current time
d, err1 := time.ParseDuration(argValue)
if err1 != nil {
return 0, fmt.Errorf("cannot parse %q=%q: %s", argKey, argValue, err)
}
if d > 0 {
d = -d
}
t = time.Now().Add(d)
}
secs = float64(t.UnixNano()) / 1e9
}
@@ -854,17 +952,20 @@ func getDuration(r *http.Request, argKey string, defaultValue int64) (int64, err
const maxDurationMsecs = 100 * 365 * 24 * 3600 * 1000
func getMaxLookback(r *http.Request) (int64, error) {
d := int64(*maxLookback / time.Millisecond)
d := maxLookback.Milliseconds()
if d == 0 {
d = maxStalenessInterval.Milliseconds()
}
return getDuration(r, "max_lookback", d)
}
func getDeadlineForQuery(r *http.Request) netstorage.Deadline {
dMax := int64(maxQueryDuration.Seconds() * 1e3)
dMax := maxQueryDuration.Milliseconds()
return getDeadlineWithMaxDuration(r, dMax, "-search.maxQueryDuration")
}
func getDeadlineForExport(r *http.Request) netstorage.Deadline {
dMax := int64(maxExportDuration.Seconds() * 1e3)
dMax := maxExportDuration.Milliseconds()
return getDeadlineWithMaxDuration(r, dMax, "-search.maxExportDuration")
}
@@ -907,7 +1008,7 @@ func getTagFilterssFromMatches(matches []string) ([][]storage.TagFilter, error)
}
func getLatencyOffsetMilliseconds() int64 {
d := int64(*latencyOffset / time.Millisecond)
d := latencyOffset.Milliseconds()
if d <= 1000 {
d = 1000
}

View File

@@ -0,0 +1,28 @@
{% import "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" %}
{% stripspace %}
TSDBStatusResponse generates response for /api/v1/status/tsdb .
{% func TSDBStatusResponse(status *storage.TSDBStatus) %}
{
"status":"success",
"data":{
"seriesCountByMetricName":{%= tsdbStatusEntries(status.SeriesCountByMetricName) %},
"labelValueCountByLabelName":{%= tsdbStatusEntries(status.LabelValueCountByLabelName) %},
"seriesCountByLabelValuePair":{%= tsdbStatusEntries(status.SeriesCountByLabelValuePair) %}
}
}
{% endfunc %}
{% func tsdbStatusEntries(a []storage.TopHeapEntry) %}
[
{% for i, e := range a %}
{
"name":{%q= e.Name %},
"value":{%d= int(e.Count) %}
}
{% if i+1 < len(a) %},{% endif %}
{% endfor %}
]
{% endfunc %}
{% endstripspace %}

View File

@@ -0,0 +1,123 @@
// Code generated by qtc from "tsdb_status_response.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.
//line app/vmselect/prometheus/tsdb_status_response.qtpl:1
package prometheus
//line app/vmselect/prometheus/tsdb_status_response.qtpl:1
import "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
// TSDBStatusResponse generates response for /api/v1/status/tsdb .
//line app/vmselect/prometheus/tsdb_status_response.qtpl:5
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:5
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:5
func StreamTSDBStatusResponse(qw422016 *qt422016.Writer, status *storage.TSDBStatus) {
//line app/vmselect/prometheus/tsdb_status_response.qtpl:5
qw422016.N().S(`{"status":"success","data":{"seriesCountByMetricName":`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:9
streamtsdbStatusEntries(qw422016, status.SeriesCountByMetricName)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:9
qw422016.N().S(`,"labelValueCountByLabelName":`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:10
streamtsdbStatusEntries(qw422016, status.LabelValueCountByLabelName)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:10
qw422016.N().S(`,"seriesCountByLabelValuePair":`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:11
streamtsdbStatusEntries(qw422016, status.SeriesCountByLabelValuePair)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:11
qw422016.N().S(`}}`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
}
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
func WriteTSDBStatusResponse(qq422016 qtio422016.Writer, status *storage.TSDBStatus) {
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
StreamTSDBStatusResponse(qw422016, status)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
qt422016.ReleaseWriter(qw422016)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
}
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
func TSDBStatusResponse(status *storage.TSDBStatus) string {
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
WriteTSDBStatusResponse(qb422016, status)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
qs422016 := string(qb422016.B)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
return qs422016
//line app/vmselect/prometheus/tsdb_status_response.qtpl:14
}
//line app/vmselect/prometheus/tsdb_status_response.qtpl:16
func streamtsdbStatusEntries(qw422016 *qt422016.Writer, a []storage.TopHeapEntry) {
//line app/vmselect/prometheus/tsdb_status_response.qtpl:16
qw422016.N().S(`[`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:18
for i, e := range a {
//line app/vmselect/prometheus/tsdb_status_response.qtpl:18
qw422016.N().S(`{"name":`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:20
qw422016.N().Q(e.Name)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:20
qw422016.N().S(`,"value":`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:21
qw422016.N().D(int(e.Count))
//line app/vmselect/prometheus/tsdb_status_response.qtpl:21
qw422016.N().S(`}`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:23
if i+1 < len(a) {
//line app/vmselect/prometheus/tsdb_status_response.qtpl:23
qw422016.N().S(`,`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:23
}
//line app/vmselect/prometheus/tsdb_status_response.qtpl:24
}
//line app/vmselect/prometheus/tsdb_status_response.qtpl:24
qw422016.N().S(`]`)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
}
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
func writetsdbStatusEntries(qq422016 qtio422016.Writer, a []storage.TopHeapEntry) {
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
streamtsdbStatusEntries(qw422016, a)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
qt422016.ReleaseWriter(qw422016)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
}
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
func tsdbStatusEntries(a []storage.TopHeapEntry) string {
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
writetsdbStatusEntries(qb422016, a)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
qs422016 := string(qb422016.B)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
return qs422016
//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
}

View File

@@ -8,9 +8,9 @@ import (
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/metricsql"
"github.com/valyala/histogram"
)

View File

@@ -5,7 +5,7 @@ import (
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/metricsql"
)
// callbacks for optimized incremental calculations for aggregate functions

View File

@@ -8,7 +8,7 @@ import (
"sync"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/metricsql"
)
func TestIncrementalAggr(t *testing.T) {

View File

@@ -1,5 +0,0 @@
package promql
import "unsafe"
const maxByteSliceLen = 1<<(31+9*(unsafe.Sizeof(int(0))/8)) - 1

View File

@@ -6,9 +6,9 @@ import (
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql/binaryop"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metricsql"
"github.com/VictoriaMetrics/metricsql/binaryop"
)
var binaryOpFuncs = map[string]binaryOpFunc{

View File

@@ -11,9 +11,9 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/metricsql"
)
var (
@@ -668,9 +668,12 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc,
if iafc.ae.Modifier.Op != "" {
// Increase the number of timeseries for non-empty group list: `aggr() by (something)`,
// since each group can have own set of time series in memory.
// Estimate the number of such groups is lower than 1000 :)
timeseriesLen *= 1000
}
// The maximum number of output time series is limited by rssLen.
if timeseriesLen > rssLen {
timeseriesLen = rssLen
}
}
rollupPoints := mulNoOverflow(pointsPerTimeseries, int64(timeseriesLen*len(rcs)))
rollupMemorySize := mulNoOverflow(rollupPoints, 16)
@@ -680,7 +683,7 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc,
return nil, fmt.Errorf("not enough memory for processing %d data points across %d time series with %d points in each time series; "+
"possible solutions are: reducing the number of matching time series; switching to node with more RAM; "+
"increasing -memory.allowedPercent; increasing `step` query arg (%gs)",
rollupPoints, rssLen*len(rcs), pointsPerTimeseries, float64(ec.Step)/1e3)
rollupPoints, timeseriesLen*len(rcs), pointsPerTimeseries, float64(ec.Step)/1e3)
}
defer rml.Put(uint64(rollupMemorySize))

View File

@@ -11,8 +11,8 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/metricsql"
)
var logSlowQueryDuration = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging")

View File

@@ -338,7 +338,7 @@ func TestExecSuccess(t *testing.T) {
q := `timestamp(123)`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{900, 1100, 1300, 1500, 1700, 1900},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@@ -349,7 +349,7 @@ func TestExecSuccess(t *testing.T) {
q := `timestamp(time())`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{900, 1100, 1300, 1500, 1700, 1900},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@@ -360,7 +360,7 @@ func TestExecSuccess(t *testing.T) {
q := `timestamp(456/time()+123)`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{900, 1100, 1300, 1500, 1700, 1900},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@@ -371,7 +371,7 @@ func TestExecSuccess(t *testing.T) {
q := `timestamp(time()>=1600)`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{nan, nan, nan, 1600, 1800, 2000},
Values: []float64{nan, nan, nan, nan, 1700, 1900},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@@ -974,6 +974,65 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`label_map(match)`, func(t *testing.T) {
t.Parallel()
q := `sort(label_map((
label_set(time(), "label", "v1"),
label_set(time()+100, "label", "v2"),
label_set(time()+200, "label", "v3"),
label_set(time()+300, "x", "y"),
label_set(time()+400, "label", "v4"),
), "label", "v1", "foo", "v2", "bar", "", "qwe", "v4", ""))`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
Timestamps: timestampsExpected,
}
r1.MetricName.Tags = []storage.Tag{{
Key: []byte("label"),
Value: []byte("foo"),
}}
r2 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1100, 1300, 1500, 1700, 1900, 2100},
Timestamps: timestampsExpected,
}
r2.MetricName.Tags = []storage.Tag{{
Key: []byte("label"),
Value: []byte("bar"),
}}
r3 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1200, 1400, 1600, 1800, 2000, 2200},
Timestamps: timestampsExpected,
}
r3.MetricName.Tags = []storage.Tag{{
Key: []byte("label"),
Value: []byte("v3"),
}}
r4 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1300, 1500, 1700, 1900, 2100, 2300},
Timestamps: timestampsExpected,
}
r4.MetricName.Tags = []storage.Tag{
{
Key: []byte("label"),
Value: []byte("qwe"),
},
{
Key: []byte("x"),
Value: []byte("y"),
},
}
r5 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1400, 1600, 1800, 2000, 2200, 2400},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r1, r2, r3, r4, r5}
f(q, resultExpected)
})
t.Run(`label_copy(new_tag)`, func(t *testing.T) {
t.Parallel()
q := `label_copy(
@@ -5371,6 +5430,8 @@ func TestExecError(t *testing.T) {
f(`label_transform(1)`)
f(`label_set()`)
f(`label_set(1, "foo")`)
f(`label_map()`)
f(`label_map(1)`)
f(`label_del()`)
f(`label_keep()`)
f(`label_match()`)

View File

@@ -3,8 +3,8 @@ package promql
import (
"fmt"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metricsql"
)
// IsRollup verifies whether s is a rollup with non-empty window.

View File

@@ -1,6 +1,7 @@
package promql
import (
"flag"
"fmt"
"math"
"strings"
@@ -8,12 +9,16 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/metricsql"
"github.com/valyala/histogram"
)
// minStalenessInterval is the value of the -search.minStalenessInterval flag.
//
// The default is zero; the flag is taken into account only when it is set to a positive value.
var minStalenessInterval = flag.Duration("search.minStalenessInterval", 0, "The minimum interval for staleness calculations. "+
	"This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. "+
	"See also '-search.maxStalenessInterval'")
var rollupFuncs = map[string]newRollupFunc{
// Standard rollup funcs from PromQL.
// See funcs accepting range-vector on https://prometheus.io/docs/prometheus/latest/querying/functions/ .
@@ -67,6 +72,11 @@ var rollupFuncs = map[string]newRollupFunc{
"aggr_over_time": newRollupFuncTwoArgs(rollupFake),
"hoeffding_bound_upper": newRollupHoeffdingBoundUpper,
"hoeffding_bound_lower": newRollupHoeffdingBoundLower,
// `timestamp` function must return timestamp for the last datapoint on the current window
// in order to properly handle offset and timestamps unaligned to the current step.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/415 for details.
"timestamp": newRollupFuncOneArg(rollupTimestamp),
}
// rollupAggrFuncs are functions that can be passed to `aggr_over_time()`
@@ -148,6 +158,8 @@ var rollupFuncsKeepMetricGroup = map[string]bool{
"geomean_over_time": true,
"hoeffding_bound_lower": true,
"hoeffding_bound_upper": true,
"first_over_time": true,
"last_over_time": true,
}
func getRollupAggrFuncNames(expr metricsql.Expr) ([]string, error) {
@@ -444,6 +456,11 @@ func (rc *rollupConfig) doInternal(dstValues []float64, tsm *timeseriesMap, valu
if rc.LookbackDelta > 0 && maxPrevInterval > rc.LookbackDelta {
maxPrevInterval = rc.LookbackDelta
}
if *minStalenessInterval > 0 {
if msi := minStalenessInterval.Milliseconds(); msi > 0 && maxPrevInterval < msi {
maxPrevInterval = msi
}
}
window := rc.Window
if window <= 0 {
window = rc.Step
@@ -1497,6 +1514,19 @@ func rollupLow(rfa *rollupFuncArg) float64 {
return min
}
// rollupTimestamp returns the last timestamp from rfa.timestamps divided by 1e3
// (presumably converting milliseconds to seconds; confirm against callers).
//
// It returns nan when rfa.timestamps is empty.
func rollupTimestamp(rfa *rollupFuncArg) float64 {
	// There is no need in handling NaNs here, since they must be cleaned up
	// before calling rollup funcs.
	n := len(rfa.timestamps)
	if n > 0 {
		return float64(rfa.timestamps[n-1]) / 1e3
	}
	// Do not take into account rfa.prevTimestamp, since it may lead
	// to inconsistent results comparing to Prometheus on broken time series
	// with irregular data points.
	return nan
}
func rollupFirst(rfa *rollupFuncArg) float64 {
// There is no need in handling NaNs here, since they must be cleaned up
// before calling rollup funcs.

View File

@@ -12,10 +12,10 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
"github.com/VictoriaMetrics/fastcache"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/metricsql"
)
var (

View File

@@ -3,8 +3,8 @@ package promql
import (
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metricsql"
)
func TestRollupResultCache(t *testing.T) {

View File

@@ -4,7 +4,7 @@ import (
"math"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/metricsql"
)
var (

View File

@@ -2,6 +2,7 @@ package promql
import (
"fmt"
"reflect"
"sort"
"strconv"
"sync"
@@ -168,7 +169,7 @@ func (ts *timeseries) marshalFastNoTimestamps(dst []byte) []byte {
// during marshalFastTimestamps.
var valuesBuf []byte
if len(ts.Values) > 0 {
valuesBuf = (*[maxByteSliceLen]byte)(unsafe.Pointer(&ts.Values[0]))[:len(ts.Values)*8]
valuesBuf = float64ToByteSlice(ts.Values)
}
dst = append(dst, valuesBuf...)
return dst
@@ -178,7 +179,7 @@ func marshalFastTimestamps(dst []byte, timestamps []int64) []byte {
dst = encoding.MarshalUint32(dst, uint32(len(timestamps)))
var timestampsBuf []byte
if len(timestamps) > 0 {
timestampsBuf = (*[maxByteSliceLen]byte)(unsafe.Pointer(&timestamps[0]))[:len(timestamps)*8]
timestampsBuf = int64ToByteSlice(timestamps)
}
dst = append(dst, timestampsBuf...)
return dst
@@ -199,8 +200,7 @@ func unmarshalFastTimestamps(src []byte) ([]byte, []int64, error) {
if len(src) < bufSize {
return src, nil, fmt.Errorf("cannot unmarshal timestamps; got %d bytes; want at least %d bytes", len(src), bufSize)
}
timestamps := (*[maxByteSliceLen / 8]int64)(unsafe.Pointer(&src[0]))[:timestampsCount]
timestamps = timestamps[:len(timestamps):len(timestamps)]
timestamps := byteSliceToInt64(src[:bufSize])
src = src[bufSize:]
return src, timestamps, nil
@@ -229,12 +229,43 @@ func (ts *timeseries) unmarshalFastNoTimestamps(src []byte) ([]byte, error) {
if len(src) < bufSize {
return src, fmt.Errorf("cannot unmarshal values; got %d bytes; want at least %d bytes", len(src), bufSize)
}
values := (*[maxByteSliceLen / 8]float64)(unsafe.Pointer(&src[0]))[:valuesCount]
ts.Values = values[:len(values):len(values)]
ts.Values = byteSliceToFloat64(src[:bufSize])
return src[bufSize:], nil
}
// float64ToByteSlice returns a byte slice that shares memory with a.
//
// No data is copied: the result aliases the backing array of a, so writes
// through either slice are visible through the other. The bytes are
// in the native in-memory representation of float64.
//
// It returns nil for an empty a.
func float64ToByteSlice(a []float64) (b []byte) {
	if len(a) == 0 {
		// Taking &a[0] below would panic on an empty slice.
		return nil
	}
	sh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	sh.Data = uintptr(unsafe.Pointer(&a[0]))
	sh.Len = len(a) * int(unsafe.Sizeof(a[0]))
	sh.Cap = sh.Len
	return b
}
// int64ToByteSlice returns a byte slice that shares memory with a.
//
// No data is copied: the result aliases the backing array of a, so writes
// through either slice are visible through the other. The bytes are
// in the native in-memory representation of int64.
//
// It returns nil for an empty a.
func int64ToByteSlice(a []int64) (b []byte) {
	if len(a) == 0 {
		// Taking &a[0] below would panic on an empty slice.
		return nil
	}
	sh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	sh.Data = uintptr(unsafe.Pointer(&a[0]))
	sh.Len = len(a) * int(unsafe.Sizeof(a[0]))
	sh.Cap = sh.Len
	return b
}
// byteSliceToInt64 returns an int64 slice that shares memory with b.
//
// No data is copied: the result aliases the backing array of b and interprets
// every 8 bytes as an int64 in the native in-memory representation.
// Trailing bytes that do not form a full int64 are left out of the result.
//
// It returns nil for an empty b.
func byteSliceToInt64(b []byte) (a []int64) {
	if len(b) == 0 {
		// Taking &b[0] below would panic on an empty slice.
		return nil
	}
	sh := (*reflect.SliceHeader)(unsafe.Pointer(&a))
	sh.Data = uintptr(unsafe.Pointer(&b[0]))
	sh.Len = len(b) / int(unsafe.Sizeof(a[0]))
	sh.Cap = sh.Len
	return a
}
// byteSliceToFloat64 returns a float64 slice that shares memory with b.
//
// No data is copied: the result aliases the backing array of b and interprets
// every 8 bytes as a float64 in the native in-memory representation.
// Trailing bytes that do not form a full float64 are left out of the result.
//
// It returns nil for an empty b.
func byteSliceToFloat64(b []byte) (a []float64) {
	if len(b) == 0 {
		// Taking &b[0] below would panic on an empty slice.
		return nil
	}
	sh := (*reflect.SliceHeader)(unsafe.Pointer(&a))
	sh.Data = uintptr(unsafe.Pointer(&b[0]))
	sh.Len = len(b) / int(unsafe.Sizeof(a[0]))
	sh.Cap = sh.Len
	return a
}
// unmarshalMetricNameFast unmarshals mn from src, so mn members
// hold references to src.
//

View File

@@ -12,8 +12,8 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metricsql"
"github.com/valyala/histogram"
)
@@ -53,12 +53,13 @@ var transformFuncs = map[string]transformFunc{
"sort_desc": newTransformFuncSort(true),
"sqrt": newTransformFuncOneArg(transformSqrt),
"time": transformTime,
"timestamp": transformTimestamp,
"vector": transformVector,
"year": newTransformFuncDateTime(transformYear),
// "timestamp" has been moved to rollup funcs. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/415
"vector": transformVector,
"year": newTransformFuncDateTime(transformYear),
// New funcs
"label_set": transformLabelSet,
"label_map": transformLabelMap,
"label_del": transformLabelDel,
"label_keep": transformLabelKeep,
"label_copy": transformLabelCopy,
@@ -1026,6 +1027,38 @@ func transformLabelSet(tfa *transformFuncArg) ([]*timeseries, error) {
return rvs, nil
}
// transformLabelMap implements the label_map transform function.
//
// args[0] holds the time series to process, args[1] holds the label name
// and the remaining args are read via getStringPairs as (src, dst) values.
// For every series the current value of the label is replaced with dst
// if it equals some src. The label is removed from the series if its
// resulting value is empty.
//
// The series from args[0] are modified in place and returned.
func transformLabelMap(tfa *transformFuncArg) ([]*timeseries, error) {
	args := tfa.args
	if argsCount := len(args); argsCount < 2 {
		return nil, fmt.Errorf(`not enough args; got %d; want at least %d`, argsCount, 2)
	}
	label, err := getString(args[1], 1)
	if err != nil {
		return nil, fmt.Errorf("cannot read label name: %s", err)
	}
	srcValues, dstValues, err := getStringPairs(args[2:])
	if err != nil {
		return nil, err
	}

	// Build the src -> dst lookup table.
	replacements := make(map[string]string, len(srcValues))
	for i := range srcValues {
		replacements[srcValues[i]] = dstValues[i]
	}

	series := args[0]
	for _, ts := range series {
		mn := &ts.MetricName
		cur := getDstValue(mn, label)
		if replacement, found := replacements[string(*cur)]; found {
			// Overwrite the value in place, re-using its buffer.
			*cur = append((*cur)[:0], replacement...)
		}
		if len(*cur) > 0 {
			continue
		}
		// Drop the label when its value ends up empty.
		mn.RemoveTag(label)
	}
	return series, nil
}
// transformLabelCopy delegates to transformLabelCopyExt, passing false
// as its second argument, and returns the delegate's results unchanged.
func transformLabelCopy(tfa *transformFuncArg) ([]*timeseries, error) {
	rvs, err := transformLabelCopyExt(tfa, false)
	return rvs, err
}
@@ -1483,25 +1516,6 @@ func transformTime(tfa *transformFuncArg) ([]*timeseries, error) {
return evalTime(tfa.ec), nil
}
// transformTimestamp replaces every non-NaN value of the series passed
// in the single arg with the timestamp of the corresponding point
// divided by 1e3. NaN values are left as is.
//
// The metric group is reset for every series. The series are modified
// in place and returned.
func transformTimestamp(tfa *transformFuncArg) ([]*timeseries, error) {
	if err := expectTransformArgsNum(tfa.args, 1); err != nil {
		return nil, err
	}
	series := tfa.args[0]
	for _, ts := range series {
		ts.MetricName.ResetMetricGroup()
		for i := range ts.Timestamps {
			if math.IsNaN(ts.Values[i]) {
				// Keep gaps untouched.
				continue
			}
			ts.Values[i] = float64(ts.Timestamps[i]) / 1e3
		}
	}
	return series, nil
}
func transformVector(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 1); err != nil {

View File

@@ -9,6 +9,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
@@ -119,6 +120,14 @@ func SearchTagEntries(maxTagKeys, maxTagValues int) ([]storage.TagEntry, error)
return tagEntries, err
}
// GetTSDBStatusForDate returns TSDB status for the given date
// as reported by Storage.GetTSDBStatusForDate.
//
// topN is passed to Storage as is. The call is registered in WG
// for the duration of the Storage request.
func GetTSDBStatusForDate(date uint64, topN int) (*storage.TSDBStatus, error) {
	// Register the in-flight request before touching the storage.
	WG.Add(1)
	tsdbStatus, statusErr := Storage.GetTSDBStatusForDate(date, topN)
	WG.Done()

	return tsdbStatus, statusErr
}
// GetSeriesCount returns the number of time series in the storage.
func GetSeriesCount() (uint64, error) {
WG.Add(1)
@@ -162,9 +171,8 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
w.Header().Set("Content-Type", "application/json")
snapshotPath, err := Storage.CreateSnapshot()
if err != nil {
msg := fmt.Sprintf("cannot create snapshot: %s", err)
logger.Errorf("%s", msg)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg)
err = fmt.Errorf("cannot create snapshot: %s", err)
jsonResponseError(w, err)
return true
}
if prometheusCompatibleResponse {
@@ -177,9 +185,8 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
w.Header().Set("Content-Type", "application/json")
snapshots, err := Storage.ListSnapshots()
if err != nil {
msg := fmt.Sprintf("cannot list snapshots: %s", err)
logger.Errorf("%s", msg)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg)
err = fmt.Errorf("cannot list snapshots: %s", err)
jsonResponseError(w, err)
return true
}
fmt.Fprintf(w, `{"status":"ok","snapshots":[`)
@@ -195,9 +202,8 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
w.Header().Set("Content-Type", "application/json")
snapshotName := r.FormValue("snapshot")
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
msg := fmt.Sprintf("cannot delete snapshot %q: %s", snapshotName, err)
logger.Errorf("%s", msg)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg)
err = fmt.Errorf("cannot delete snapshot %q: %s", snapshotName, err)
jsonResponseError(w, err)
return true
}
fmt.Fprintf(w, `{"status":"ok"}`)
@@ -206,16 +212,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
w.Header().Set("Content-Type", "application/json")
snapshots, err := Storage.ListSnapshots()
if err != nil {
msg := fmt.Sprintf("cannot list snapshots: %s", err)
logger.Errorf("%s", msg)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg)
err = fmt.Errorf("cannot list snapshots: %s", err)
jsonResponseError(w, err)
return true
}
for _, snapshotName := range snapshots {
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
msg := fmt.Sprintf("cannot delete snapshot %q: %s", snapshotName, err)
logger.Errorf("%s", msg)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg)
err = fmt.Errorf("cannot delete snapshot %q: %s", snapshotName, err)
jsonResponseError(w, err)
return true
}
}
@@ -252,6 +256,10 @@ func registerStorageMetrics() {
return &sm.IndexDBMetrics
}
metrics.NewGauge(fmt.Sprintf(`vm_free_disk_space_bytes{path=%q}`, *DataPath), func() float64 {
return float64(fs.MustGetFreeSpace(*DataPath))
})
metrics.NewGauge(`vm_active_merges{type="storage/big"}`, func() float64 {
return float64(tm().ActiveBigMerges)
})
@@ -374,6 +382,10 @@ func registerStorageMetrics() {
return float64(idbm().SizeBytes)
})
metrics.NewGauge(`vm_deduplicated_samples_total{type="merge"}`, func() float64 {
return float64(m().DedupsDuringMerge)
})
metrics.NewGauge(`vm_rows_ignored_total{reason="big_timestamp"}`, func() float64 {
return float64(m().TooBigTimestampRows)
})
@@ -563,3 +575,9 @@ func registerStorageMetrics() {
return float64(m().MetricNameCacheCollisions)
})
}
// jsonResponseError logs err and sends it to w as an error response
// with the http.StatusInternalServerError status code.
//
// The response body has the form {"status":"error","msg":"..."},
// where the message is err formatted with the %q verb.
func jsonResponseError(w http.ResponseWriter, err error) {
	const errorBodyFormat = `{"status":"error","msg":%q}`

	logger.Errorf("%s", err)

	// The status code must be written before the body.
	w.WriteHeader(http.StatusInternalServerError)
	fmt.Fprintf(w, errorBodyFormat, err)
}

Some files were not shown because too many files have changed in this diff Show More