Compare commits

...

878 Commits

Author SHA1 Message Date
Aliaksandr Valialkin
8c568b13b2 docs/CHANGELOG.md: cut v1.53.0 2021-02-03 03:42:31 +02:00
Aliaksandr Valialkin
7388479a07 deployment/docker: update base alpine image from v3.13.0 to v3.13.1
See release notes for v3.13.1 - https://www.alpinelinux.org/posts/Alpine-3.13.1-released.html
2021-02-03 03:40:28 +02:00
Aliaksandr Valialkin
157c02622b app/vmselect: add ability to set Graphite-compatible filter via {__graphite__="foo.*.bar"} syntax 2021-02-03 01:21:54 +02:00
Aliaksandr Valialkin
4068f8d590 lib/promscrape: add vm_promscrape_service_discovery_duration_seconds metric 2021-02-02 16:15:25 +02:00
Aliaksandr Valialkin
bd11fd8f1d lib/promscrape: add vm_promscrape_scrape_retries_total, vm_promscrape_discovery_retries_total and vm_promscrape_discovery_requests_total metrics 2021-02-01 20:06:27 +02:00
Aliaksandr Valialkin
b577cdd855 docs: increase heading sizes in vmagent, vmauth, vmbackup and vmrestore docs, so they match the heading sizes in VictoriaMetrics docs 2021-02-01 19:44:00 +02:00
Aliaksandr Valialkin
b39d5ef656 vendor: make vendor-update 2021-02-01 19:39:10 +02:00
Aliaksandr Valialkin
8164cd8932 docs/vmctl.md: update build instructions after the migration from github.com/VictoriaMetrics/vmctl to github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl 2021-02-01 19:39:08 +02:00
Aliaksandr Valialkin
b43b498fd8 app/vmselect: add ability to pass extra_label=<label>=<value> query arg to Prometheus Querying API
This enforces the `{label="value"}` label filter on the query.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1021
2021-02-01 18:04:17 +02:00
Aliaksandr Valialkin
5d87dbfd65 docs: document ability to query Graphite datasource from vmalert 2021-02-01 15:26:33 +02:00
Nikolay
195341a7cf Graphite vmalert wip (#112)
* init implementation for graphite alerts

* adds graphite support for vmalert

* small fix

* changes vmalert graphite api with type

* updates tests

* small fix

* fixes graphite parse

* Fixes graphite from time
2021-02-01 15:05:32 +02:00
Aliaksandr Valialkin
f0087f0dbb lib/flagutil: typo fix in comment to ArrayInt.GetOptionalArgOrDefault() func 2021-02-01 14:35:39 +02:00
Aliaksandr Valialkin
a4ae945a79 app/victoria-metrics: fix tests after 8749c2dd92 2021-02-01 14:34:11 +02:00
Aliaksandr Valialkin
b2aa80e74b app/vmagent: add -remoteWrite.roundDigits command-line option for limiting the number of digits after the point for stored values
This commit also adds --vm-round-digits command-line option to vmctl tool.
2021-02-01 14:27:09 +02:00
Aliaksandr Valialkin
29a7067827 app/vmctl: fix make check-all warnings 2021-02-01 01:31:25 +02:00
Aliaksandr Valialkin
d5c180e680 app/vmctl: move vmctl code from github.com/VictoriaMetrics/vmctl
It is better developing vmctl tool in VictoriaMetrics repository, so it could be released
together with the rest of vmutils tools such as vmalert, vmagent, vmbackup, vmrestore and vmauth.
2021-02-01 01:10:20 +02:00
Aliaksandr Valialkin
2a7b1cc668 docs/Cluster-VictoriaMetrics.md: mention about -search.denyPartialResponse command-line flag and deny_partial_response query arg 2021-01-27 14:07:00 +02:00
Aliaksandr Valialkin
929f09b90d docs/CHANGELOG.md: typo fixes 2021-01-27 01:18:48 +02:00
Aliaksandr Valialkin
d6347a3e56 lib/logger: initialize timezone by UTC in order to fix failing tests 2021-01-27 00:59:12 +02:00
Aliaksandr Valialkin
fc5b26d856 lib/promscrape: export vm_promscrape_scrapes_failed_per_url_total and vm_promscrape_scrapes_skipped_by_sample_limit_per_url_total metrics
These metrics could be useful for determining improperly working scrape targets.
Note that these metrics are exported only for failing scrape targets. They aren't exposed for normally working targets.
2021-01-27 00:39:26 +02:00
Aliaksandr Valialkin
de3c662e8a all: consistently use timers from timerpool 2021-01-27 00:39:26 +02:00
Aliaksandr Valialkin
3149ac7a7e lib/fs: properly initialize cleaner for pageCache bitmaps
Previously it wasn't working because the timer was fired only once
2021-01-27 00:39:26 +02:00
Aliaksandr Valialkin
419ad74269 app/vmagent: add -remoteWrite.rateLimit command-line flag for limiting data rate to remote storage
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1035
2021-01-27 00:39:26 +02:00
Aliaksandr Valialkin
3fe848cdd7 lib/logger: add -loggerTimezone command-line flag for adjusting timezone for timestamps in log messages 2021-01-26 22:51:54 +02:00
Aliaksandr Valialkin
5481906db6 docs/CHANGELOG.md: mention about https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1027 2021-01-26 16:37:36 +02:00
weng zhao
cc3e69e963 vmalert: add option datasource.queryStep to allow user to address the inconsistency between grafana dashboards(query_range with step 15s usually) and ALERTS (#1027)
Co-authored-by: zhao.weng <zhao.weng@shopee.com>
2021-01-26 08:12:04 +00:00
Aliaksandr Valialkin
8cea3c3cc4 lib/promscrape: retry scrape and service discovery requests when the remote server closes http keep-alive connection 2021-01-22 13:22:33 +02:00
Aliaksandr Valialkin
c164a8d231 app/vmselect/promql: improve documentation for -search.maxPointsPerTimeseries command-line flag
This should reduce incorrect usage and assumptions for this flag.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1020
2021-01-22 13:00:10 +02:00
Aliaksandr Valialkin
3caac3d12c docs/CHANGELOG.md: mention about the fix with too big HTTP reconnection rate to targets
This has been fixed in 0a45220b0a
2021-01-22 12:09:16 +02:00
Aliaksandr Valialkin
054fe1c198 deployment/docker: update Go builder from v1.15.6 to v1.15.7
See https://groups.google.com/g/golang-nuts/c/ufLjEY_AJ0I/m/smSHpGXiDQAJ for details
2021-01-21 18:39:49 +02:00
Aliaksandr Valialkin
0a45220b0a vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.11 to v1.0.12 2021-01-21 12:00:21 +02:00
Aliaksandr Valialkin
8749c2dd92 app/vmselect: add -search.maxStepForPointsAdjustment command-line flag, which can be used for disabling adjustment for points returned from /api/v1/query_range handler if they have timestamps closer than -search.latencyOffset to the current time 2021-01-19 22:56:32 +02:00
Aliaksandr Valialkin
011c5da785 app/vmselect/graphite: extract getCanonicalPath() function from loop body inside getCanonicalPaths() 2021-01-18 17:30:26 +02:00
Aliaksandr Valialkin
fcbefc15d0 LICENSE: bump the last year from 2020 to 2021 2021-01-16 13:00:16 +02:00
Aliaksandr Valialkin
485d43ef21 deployment/docker: upgrade alpine base Docker image from v3.12.3 to v3.13.0
See release notes for v3.13.0 - https://www.alpinelinux.org/posts/Alpine-3.13.0-released.html
2021-01-15 22:50:40 +02:00
faceair
b638c1eed5 lib/mergeset: add missing shouldCacheBlock (#1019) 2021-01-15 11:46:01 +02:00
Aliaksandr Valialkin
cc379f95c2 Makefile: add release-victoria-metrics-arm64 build rule 2021-01-13 18:13:18 +02:00
Aliaksandr Valialkin
689d769b4d Makefile: release vmutils for amd64 and arm64
Follow-up for 0d03855787
2021-01-13 18:04:37 +02:00
Robert Edström
0d03855787 Arch consistent filenames (#1015)
* Include individual binary checksums for vmutils

* Consistent archive/binary artefacts between arm64/amd64 for vmutils

* architecture in archive, checksums
* not in binaries
2021-01-13 17:31:08 +02:00
Aliaksandr Valialkin
75f7c51cab docs/vmagent.md: follow-up for 184a659c5f 2021-01-13 13:53:14 +02:00
mancubus77
184a659c5f Doco vmagent fix (#1014)
* Update section with remote_write.url for clustered version

* fix typo

Co-authored-by: mancubus <dont@write.me>
2021-01-13 13:50:37 +02:00
Aliaksandr Valialkin
7ce87ebcb2 docs/CHANGELOG.md: cut v1.52.0 2021-01-13 12:58:51 +02:00
Aliaksandr Valialkin
1051d8aa2d app/vmselect/promql: add ability to pass multiple labels to sort_by_label and sort_by_label_desc functions
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/992
2021-01-13 12:44:51 +02:00
Aliaksandr Valialkin
689cf88eb2 vendor: make vendor-update 2021-01-13 12:19:39 +02:00
Aliaksandr Valialkin
bdd0a1cdb2 lib/backup: increase backup chunk size from 128MB to 1GB
This should reduce costs for object storage API calls by 8x. See https://cloud.google.com/storage/pricing#operations-pricing
2021-01-13 12:16:35 +02:00
Aliaksandr Valialkin
acf1a2c72b app/vmselect/promql: properly parse escaped multibyte utf8 code sequences in metric names and labels names
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/990
2021-01-13 10:59:42 +02:00
Aliaksandr Valialkin
89315d719d docs/CHANGELOG.md: document updated extra_label query arg behavior
Follow-up for dc9d7aedd5
2021-01-13 00:58:20 +02:00
Nikolay
dc9d7aedd5 adds extra_label to all import apis (#1007)
* adds extra_label to all import apis,
changes priority for extra_label - now it has priority over original labels

* Update README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>

* Update README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>

* adds extra labels to vmagent  import api
changes order for adding labels, now they are added after user values

* adds tests for extra_label

* import fix

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2021-01-13 00:52:50 +02:00
Aliaksandr Valialkin
7373986f9e docs/CHANGELOG.md: mention that the minimum supported TLS version now is v1.2
Follow-up for 7bf5d48315
2021-01-13 00:44:39 +02:00
Nikolay
7bf5d48315 bumps minimal tls version (#1012) 2021-01-13 00:35:47 +02:00
Aliaksandr Valialkin
3e451ccdda docs/Single-server-VictoriaMetrics.md: typo fix 2021-01-12 22:02:55 +02:00
Aliaksandr Valialkin
fe3444b124 deployment/docker: upgrade base image for Docker packages from Alpine 3.12.1 to Alpine 3.12.3 in order to fix potential security issues
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1010
2021-01-12 21:57:01 +02:00
Robert Edström
77be066ee8 add release-vmutils-arm64,release-vmutils-amd64 make targets (#1011) 2021-01-12 21:50:04 +02:00
Aliaksandr Valialkin
1837f2f7d3 app/vmselect/promql: add tfirst_over_time(m[d]) and tlast_over_time(m[d]) MetricsQL functions for returning timestamps for the first and the last samples in m over d 2021-01-12 16:12:12 +02:00
Aliaksandr Valialkin
f5d52b51f1 docs/Articles.md: add https://cer6erus.medium.com/cloud-native-model-driven-telemetry-stack-on-openshift-80712621f5bc 2021-01-12 15:36:27 +02:00
Aliaksandr Valialkin
31ec79eaf6 lib/storage: inline marshalTags function and remove the code for handling duplicate tags from here
This is a follow-up commit after c8ea697db8
2021-01-12 15:13:30 +02:00
Aliaksandr Valialkin
c8ea697db8 lib/storage: de-duplicate tags in MetricName.sortTags
Leave only the last tag among tags with duplicate keys. This is needed for reliable addition of extra_labels
during data ingestion. See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1007 for details.
2021-01-12 15:03:42 +02:00
Aliaksandr Valialkin
2140ccbdcc docs/CHANGELOG.md: document bug fixes from the commit 7976c22797 2021-01-12 13:44:17 +02:00
Nikolay
7976c22797 Fixes error handling for promscrape.streamParse (#1009)
properly return error if client cannot read data,
properly suppress scraper errors
2021-01-12 13:31:47 +02:00
Aliaksandr Valialkin
2c44f9989a lib/promscrape: properly show scrape duration on /targets page
Previously it has been shown as 0.000s for any scrape duration.
2021-01-11 21:14:46 +02:00
Aliaksandr Valialkin
e61e3bf174 docs/Single-server-VictoriaMetrics.md: mention about https://github.com/aorfanos/vmalert-cli in Integrations section 2021-01-11 18:52:08 +02:00
Aliaksandr Valialkin
89611fa48c docs/CHANGELOG.md: mention about a bugfix for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/989 2021-01-11 13:11:41 +02:00
Roman Khavronenko
14f0f90507 docker-compose: provide the example list of alerting rules for vm components (#1005)
List contains examples for the alerting rules which might be executed
via `vmalert` to track the health state of VM components. It is assumed
that list will be revised and calibrated for each system individually.
2021-01-11 13:03:15 +02:00
Aliaksandr Valialkin
24ffad74c1 all: use net.Dial instead of fasthttp.Dial, because fasthttp.Dial limits the number of concurrent dials to 1000 2021-01-11 12:53:30 +02:00
Aliaksandr Valialkin
6740294ebb vendor: update github.com/VictoriaMetrics/fasthttp 2021-01-11 12:53:30 +02:00
Roman Khavronenko
2e2e4f7e21 vmalert-989: return non-empty result in template func query stub to pass validation (#1002)
On templates validation stage vmalert does not actually send queries, so for complex
chained expression validation may fail. To avoid this, we add a blank sample in response
so validation can pass successfully. Later, during the rule execution, stub will be replaced
with real `query` function.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/989
2021-01-10 02:56:11 +03:00
Aliaksandr Valialkin
9dcb18e03d app/vmstorage: disable final merge by default, since it may result in high disk IO and CPU usage without measurable benefits such as increased query performance and reduced disk space usage 2021-01-08 00:16:05 +02:00
Aliaksandr Valialkin
0477991b4d vendor: make vendor-update 2021-01-07 23:55:02 +02:00
Aliaksandr Valialkin
b1f9b39c4b docs/Single-server-VictoriaMetrics.md: sync with upstream 2021-01-07 23:37:31 +02:00
Dan Dascalescu
39b11b3ff4 Tiny typo fix (#997) 2021-01-07 23:35:46 +02:00
Roman Khavronenko
7bd420cbfe docker-compose: add blackhole receiver for alertmanager (#999)
Currently, alertmanager spams logs with `Notify attempt failed, will retry later` message
because default receiver is unreachable. The change updates default configuration with
blackhole receiver which means alertmanager will continue to accept alerts but won't make
attempts to send them anywhere.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/995
2021-01-07 23:33:53 +02:00
Nikolay
85962b459f Snap docs change (#986)
* adds snap docs,
adds release information for snap package,
adds docs notes about configuration management with snap package.

* adds release page mention

* version fix for snap, its awful

* revert version

* Apply suggestions from code review

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-12-29 11:43:09 +02:00
Aliaksandr Valialkin
f6ca776c75 README.md: mention about -search.queryStats.lastQueriesCount and -search.queryStats.minQueryDuration command-line flags in docs about query stats 2020-12-29 11:38:57 +02:00
Aliaksandr Valialkin
70df5f4975 docs/CHANGELOG.md: cut v1.51.0 2020-12-27 14:21:29 +02:00
Aliaksandr Valialkin
c86286ec1d app/vmselect/promql: do not adjust offset value provided in the query
Previously it could be modified in order to improve response cache hit ratio.
This is unneeded, since cache hit ratio should remain good because the query time range
should be already aligned to multiple of `step` values.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/976
2020-12-27 14:09:25 +02:00
Aliaksandr Valialkin
261535b32d docs/Articles.md: add a link to https://www.percona.com/blog/2020/12/23/observations-on-better-resource-usage-with-percona-monitoring-and-management-v2-12-0/ 2020-12-27 13:01:30 +02:00
Aliaksandr Valialkin
4b7105a65b app/vmselect: sync query stats handling with cluster version 2020-12-27 13:00:29 +02:00
Aliaksandr Valialkin
df0309eae0 app/vmselect/promql: simplify defer call for querystats.RegisterQuery 2020-12-27 12:06:04 +02:00
Aliaksandr Valialkin
ad4e6a9283 app/vmselect/querystats: reduce the default number of last queries to track from 100K to 20K
This should reduce memory usage in constrained environments
2020-12-25 17:40:47 +02:00
Aliaksandr Valialkin
59183f66d0 app/vmselect: refactor /api/v1/stats/top_queries
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/907
2020-12-25 16:44:29 +02:00
Aliaksandr Valialkin
fb338c50a3 app/victoria-metrics: show usage info when incorrect command-line flag is passed to executable 2020-12-25 16:42:21 +02:00
Nikolay
86630350bf Adds query stats handler (#945)
* Adds query stat handler,
for query and query_range api, victoriametrics tracks query execution time,
stats are exported at /api/v1/status/queries endpoint with topN param
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/907

* fixed query stats bugs

* improves queryStats tracker

* improves query stat

* small fix

* fix tests

* added more tests

* fixes 386 tests

* naming fixes

* adds drop for outdated records
2020-12-25 16:42:05 +02:00
Aliaksandr Valialkin
490c69c64e lib/storage: wait for pending transactions before closing and dropping the partition
This deflakes `make test-full-386` test
2020-12-25 11:45:53 +02:00
Aliaksandr Valialkin
932e53522d docs/CHANGELOG.md: mention that vmalert now properly escapes multi-line queries when passing to Grafana
A follow-up for 1de15ad490

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/890
2020-12-25 11:12:06 +02:00
Nikolay
1de15ad490 adds escape for CRLF (#984)
at external.alert.source - \n and \r symbols were url encoded, instead of direct usage.
replace it from "\n" to `\n`  allows to skip url encoding.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/890
2020-12-25 11:03:13 +02:00
Aliaksandr Valialkin
1f2944a9d0 vendor: make vendor-update 2020-12-24 17:19:41 +02:00
Aliaksandr Valialkin
cab7e936a3 lib/storage: physically remove stale parts
Previously they were removed from partition struct, but the corresponding directories weren't removed.

This is a follow-up for 46dba00756
2020-12-24 16:51:36 +02:00
Aliaksandr Valialkin
0326638c90 app/vmalert: typo fix in descriptions for notifier.basicAuth.username and notifier.basicAuth.password command-line flags 2020-12-24 12:48:59 +02:00
Aliaksandr Valialkin
4eb520a342 docs/CHANGELOG.md: mention about adding missing __meta_kubernetes_service_* labels for endpoints and endpointslices roles in kubernetes_sd_config
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/982
2020-12-24 11:33:00 +02:00
Nikolay
b21e16ad0c fixes kubernetes_sd (#983)
* fixes kubernetes_sd,
adds missing service metadata for pod ports without endpoint
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/982

* fix test
2020-12-24 11:26:14 +02:00
Aliaksandr Valialkin
820669da69 lib/promscrape: code prettifying for 8dd03ecf19 2020-12-24 10:56:10 +02:00
Nikolay
8dd03ecf19 adds proxy_url support, (#980)
* adds proxy_url support,
adds proxy_url to the dockerswarm, eureka, kubernetes and consul service discovery,
adds proxy_url to the scrape_config for targets scraping,
http based proxy is supported atm,
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/503

* fixes imports
2020-12-24 10:52:37 +02:00
Aliaksandr Valialkin
9e4ed5e591 lib/storage: do not remove parts outside the configured retention if they are currently merged
These parts are automatically removed after the merge is complete.
2020-12-24 08:51:28 +02:00
Aliaksandr Valialkin
9df60518bb docs: mention that it is possible to set multiple -notifier.tlsInsecureSkipVerify command-line flags for vmalert
See c3a92968343c2b3619f1ab935702d0e9b3a46733
2020-12-22 22:32:13 +02:00
Nikolay
c270f8f3e6 changes vmalert notifier flag, (#978)
fixes issue with notifier insecure setting, now it's possible to use notifier.tlsInsecureSkipVerify multiple times.
2020-12-22 23:23:04 +03:00
Aliaksandr Valialkin
46dba00756 lib/storage: remove stale parts as soon as they go outside the configured retention
Previously such parts could remain undeleted for long durations until they are merged with other parts.
This should help for `-retentionPeriod` values smaller than one month.
2020-12-22 19:54:31 +02:00
Aliaksandr Valialkin
de89bcddae vendor: upgrade github.com/klauspost/compress from v1.11.3 to v1.11.4 2020-12-21 08:56:02 +02:00
Artem Navoiev
0f99c1afb1 add linkedin to release announcement 2020-12-20 20:06:48 +02:00
Artem Navoiev
750daa04d1 Announcement guide 2020-12-19 21:58:03 +02:00
Aliaksandr Valialkin
e4f856e900 vendor: make vendor-update 2020-12-19 17:00:20 +02:00
Aliaksandr Valialkin
e15b20dde3 docs/CHANGELOG.md: cut v1.50.2 2020-12-19 15:32:34 +02:00
Aliaksandr Valialkin
13804bda8f docs/CHANGELOG.md: mention about bugfix for populating template variables in vmalert
See 404cbd1522
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/974
2020-12-19 14:16:04 +02:00
Roman Khavronenko
404cbd1522 vmalert-974: fix order for labels templating (#975)
The change fixes bug caused by 3adf8c5a6f.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/974
2020-12-19 14:10:59 +02:00
Aliaksandr Valialkin
88ac4dfc07 app/vmselect: properly parse negative combined offsets such as -1h2m3s
Previously such offsets were parsed as `-1h + 2m + 3s`. Now they are parsed as `-(1h + 2m + 3s)`.
2020-12-19 01:23:46 +02:00
Aliaksandr Valialkin
17c2ce18fd docs/CHANGELOG.md: there is no visible difference for CPU usage and disk IO usage on production workloads 2020-12-19 00:51:41 +02:00
Aliaksandr Valialkin
d65c03c004 lib/storage: properly determine max rows for output part when merging small parts 2020-12-18 23:14:38 +02:00
Aliaksandr Valialkin
ebf8da3730 lib/{storage,mergeset}: tune background merge process in order to reduce CPU usage and disk IO usage 2020-12-18 20:01:08 +02:00
Aliaksandr Valialkin
e6666da4e7 docs/CHANGELOG.md: mention that Docker images for vmagent, vmalert, vmauth, vmbackup and vmrestore with tags containing -cluster suffix are no longer published
See 441822c4cc for details
2020-12-18 20:01:03 +02:00
Aliaksandr Valialkin
97686ddc65 docs: alphabetically sort links to case studies 2020-12-18 12:30:43 +02:00
Aliaksandr Valialkin
43577a8237 Makefile: force running Makefile rules if there is a file matching their names 2020-12-18 12:20:01 +02:00
Aliaksandr Valialkin
8df25e12d8 docs/Articles.md: classify articles by themes 2020-12-18 12:05:21 +02:00
Aliaksandr Valialkin
d8197f4a55 add a link to https://www.percona.com/blog/2020/12/16/percona-monitoring-and-management-migration-from-prometheus-to-victoriametrics-faq/ 2020-12-18 11:44:55 +02:00
Aliaksandr Valialkin
8aa2f448a8 Upgrade github.com/valyala/gozstd from v1.8.3 to v1.9.0 2020-12-17 15:13:04 +02:00
Aliaksandr Valialkin
2dfa746c91 lib/promscrape: remove ID field from ScrapeWork struct. Use a pointer to ScrapeWork as a key in targetStatusMap
This simplifies the code a bit.
2020-12-17 14:32:56 +02:00
Aliaksandr Valialkin
9abb2d6c74 lib/protoparser/prometheus: follow-up commit after 7d38627b9f6f212ae602aea6a72f469fe3c70ba2
Document the bugfix in docs/CHANGELOG.md and add a test for the bugfix.
2020-12-16 23:40:17 +02:00
BigFish
27f0261257 lib/protoparser/prometheus/parser.go (#970)
fix parse timestamp error if there are some whitespaces after timestamp
2020-12-16 23:36:20 +02:00
Aliaksandr Valialkin
2a1550f341 docs/FAQ.md: add a link to https://valyala.medium.com/promscale-vs-victoriametrics-resource-usage-on-production-workload-91c8e3786c03 in the question about benchmarks 2020-12-16 23:15:08 +02:00
Aliaksandr Valialkin
0d2c4f252f docs/Articles.md: add a link to https://valyala.medium.com/promscale-vs-victoriametrics-resource-usage-on-production-workload-91c8e3786c03 2020-12-16 14:27:58 +02:00
Aliaksandr Valialkin
0e082b1c76 docs/Cluster-VictoriaMetrics.md: fix incorrect commands for profile collecting 2020-12-16 01:07:16 +02:00
Aliaksandr Valialkin
1b9992b42a docs/Cluster-VictoriaMetrics.md: add Profiling section 2020-12-16 01:00:44 +02:00
Aliaksandr Valialkin
795e32be4a docs/CHANGELOG.md: cut v1.50.1 release 2020-12-15 21:11:39 +02:00
Aliaksandr Valialkin
4215182e61 docs/Release-Guide.md: actualize release docs 2020-12-15 21:10:43 +02:00
Aliaksandr Valialkin
e8f645bf52 docs/CHANGELOG.md: mention about bugfix, which properly removes unregistered targets from /targets page
See bugfix at a4c7fcb5e1
2020-12-15 21:06:59 +02:00
Aliaksandr Valialkin
a4c7fcb5e1 lib/promscrape: properly remove deleted target from /targets page
Previously `sw` variable wasn't captured correctly by the started goroutine.
2020-12-15 20:57:09 +02:00
Aliaksandr Valialkin
aa56b9217e app/vmagent: add vmagent_remotewrite_blocks_sent_total and vmagent_remotewrite_bytes_sent_total metrics per each -remoteWrite.url 2020-12-15 20:39:57 +02:00
Aliaksandr Valialkin
b10ad44692 docs/vmagent.md: typo fix: pearsed->parsed 2020-12-15 19:03:01 +02:00
Aliaksandr Valialkin
1eabbc0e27 docs/vmagent.md: mention that sample_limit option has no sense when stream parsing is enabled 2020-12-15 18:43:41 +02:00
Aliaksandr Valialkin
a13a443bf7 docs/CHANGELOG.md: cut v1.50.0 release 2020-12-15 14:44:42 +02:00
Aliaksandr Valialkin
b9913e151a .github/workflows/main.yml: fall back to go get instead of go install for installing aux tools
It is unclear why `go install` doesn't work in Github Actions. Needs additional investigation.
The following error is returned now:

cannot find package "golang.org/x/lint/golint" in any of:
	/opt/hostedtoolcache/go/1.15.5/x64/src/golang.org/x/lint/golint (from $GOROOT)
	/home/runner/go/src/golang.org/x/lint/golint (from $GOPATH)
2020-12-15 14:17:52 +02:00
Aliaksandr Valialkin
b730fc2667 lib/promscrape: properly handle scrape errors when stream parsing is enabled
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/967
2020-12-15 14:08:28 +02:00
Aliaksandr Valialkin
11fa458e39 app/vmselect/promql: return expected increase() result for the first point on the graph with value not exceeding 100 2020-12-15 13:40:46 +02:00
Aliaksandr Valialkin
149511f5e9 Do not set GO111MODULE=off during go install, since this doesn't work in Go1.14 and Go1.15 2020-12-15 13:13:46 +02:00
Aliaksandr Valialkin
2813d0b1e0 docs/CHANGELOG.md: mention that vmagent now accepts multiple -remoteWrite.sendTimeout and -remoteWrite.tlsInsecureSkipVerify command-line flags 2020-12-15 12:58:35 +02:00
Nikolay
95c9b630cc adds new Array Flags (#965)
* adds ArrayDuration and ArrayBool flags,
makes sendTimeout and tlsInsecure configurable per remoteWrite url

* added backward compatibility testcases for ArrayDuration and ArrayBool

* fixes bool flag

* fixes test cases
2020-12-15 12:51:12 +02:00
Aliaksandr Valialkin
60fcac4878 lib/promscrape: add bootstrap styles to /targets html page 2020-12-15 12:37:56 +02:00
Aliaksandr Valialkin
5af2a9ca0e lib/promscrape: formatting fixes for /targets page 2020-12-15 11:59:04 +02:00
Aliaksandr Valialkin
020917949b lib/promscrape: formatting fixes for /targets page 2020-12-15 11:24:18 +02:00
Aliaksandr Valialkin
4e48067133 .github/workflows/main.yml: set GO111MODULE=off when installing auxiliary tools via go install 2020-12-15 01:03:11 +02:00
Aliaksandr Valialkin
ae3675d3d0 docs/CHANGELOG.md: mention about adding query, first and value functions to alert templates
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/539
2020-12-14 20:17:11 +02:00
Roman Khavronenko
6247884057 vmalert: add function "query", "first" and "value" to alert templates functions (#960)
The commit adds a support for template function `query`,
`first` and `value`. The function `query` executes
a MetricsQL query for active alerts. In vmalert we
update templates on every evaluation for active alerts
to keep them up to date. With `query` func it may become
a perf issue since it will fire a query on every execution.
We should keep it in mind for now.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/539
2020-12-14 20:11:45 +02:00
Aliaksandr Valialkin
0b2726c3be all: use go install instead of go get for installing auxiliary tools
This is a preparation for Go 1.16, which deprecates `go get` for installing binaries.
See https://tip.golang.org/doc/go1.16#go-command :

  go install, with or without a version suffix (as described above), is now the recommended way
  to build and install packages in module mode. go get should be used with the -d flag to adjust
  the current module's dependencies without building packages, and use of go get to build and install
  packages is deprecated. In a future release, the -d flag will always be enabled.
2020-12-14 20:07:50 +02:00
Aliaksandr Valialkin
5d426dfe0a docs/CHANGELOG.md: mention bugfix for proper recovering from incorrectly stored persistent queue
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/964
2020-12-14 19:28:47 +02:00
Aliaksandr Valialkin
d006b41eff lib/persistentqueue: verify that ReaderOffset doesn't exceed WriterOffset when opening the persistent queue
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/964
2020-12-14 19:25:25 +02:00
Aliaksandr Valialkin
ae972429c7 lib/promscrape: add missing whitespace between duration and ago word at /targets page 2020-12-14 14:19:58 +02:00
Aliaksandr Valialkin
f8e7f433cf app/victoria-metrics: prettify / page output 2020-12-14 14:07:58 +02:00
Aliaksandr Valialkin
069c9ade52 app/{vmagent,vminsert}: follow-up for ce8c2dd1f1: return /targets page in HTML when requested via web browser 2020-12-14 14:06:00 +02:00
Nikolay
ce8c2dd1f1 Changes targets api (#961)
* changes /targets api
adds html response if requester accepts text/html,
adds quick template for /targets api,
fixes pathPrefix for / requests

* changes namings

* renamed targets file

* Update app/victoria-metrics/main.go

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>

* adds trimspace to qtpl,
moves content-type for targets response closer to writer

* fixes bug with prefix

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-12-14 13:36:48 +02:00
Aliaksandr Valialkin
5ebfc275e6 app/victoria-metrics: automatically reset response cache when samples with timestamps older than now - search.cacheTimestampOffset are ingested 2020-12-14 13:08:28 +02:00
Aliaksandr Valialkin
f93247e82d docs/MetricsQL.md: clarify that limitk(k, q) returns an arbitrary set of k time series with each call
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/951
2020-12-12 14:17:37 +02:00
Aliaksandr Valialkin
c4c90ab2b1 vendor: make vendor-update 2020-12-11 23:31:00 +02:00
Aliaksandr Valialkin
ae10ff8ccd .github/ISSUE_TEMPLATE/bug_report.md: add a link to upgrade procedure 2020-12-11 22:09:35 +02:00
Aliaksandr Valialkin
4862edfef3 docs/FAQ.md: use less confusing links in the chapter explaining why VictoriaMetrics doesn't support Prometheus remote_read API 2020-12-11 21:23:20 +02:00
Aliaksandr Valialkin
9d42546a27 docs: consistently use links to https://victoriametrics.github.io for documentation references 2020-12-11 21:08:18 +02:00
Aliaksandr Valialkin
710f8ce5aa docs/Single-server-VictoriaMetrics.md: clarify docs in Relabeling section 2020-12-11 18:23:52 +02:00
Aliaksandr Valialkin
081aa4ad68 docs/CHANGELOG.md: mention https://github.com/VictoriaMetrics/VictoriaMetrics/issues/955 2020-12-11 17:48:26 +02:00
Aliaksandr Valialkin
5f9d88a3cb lib/promscrape/discovery/consul: reduce load on Consul API server by increasing timeout for blocking requests from 50 seconds to 9 minutes
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574
2020-12-11 17:24:13 +02:00
Aliaksandr Valialkin
ba8ac08739 app/vmselect/graphite: properly handle wildcards and charsets inside curly braces
For example, `foo{bar*,[a-f]a*b}` should match `foobar`, `foobar123`, `foofab`, etc.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/952
2020-12-11 17:24:12 +02:00
Aliaksandr Valialkin
e7d8d84396 docs/Single-server-VictoriaMetrics.md: sync with upstream README.md via make docs-sync 2020-12-11 12:08:35 +02:00
faceair
30445ed5e9 docs/CaseStudies.md: add case study for zhihu (#956) 2020-12-11 12:07:30 +02:00
Aliaksandr Valialkin
82afcb6d0d docs/Single-server-VictoriaMetrics.md: clarify that the recommended value for -dedup.minScrapeInterval is scrape_interval from Prometheus configs 2020-12-09 12:16:04 +02:00
Aliaksandr Valialkin
3ca1ed0fde docs/CHANGELOG.md: mention about memory leak fix in vmagent when big number of targets is discovered via service discovery 2020-12-09 10:35:26 +02:00
Aliaksandr Valialkin
b13680a67e docs/Single-server-VictoriaMetrics.md: sync with upstream README.md via make docs-sync 2020-12-09 10:27:11 +02:00
Akira Kurogane
0066a02293 Wording suggestion for lack of native format spec (#948)
This diff is just to suggest wording to let people know there is no future-compatible guaranteed way to make their own native format files for import yet.
2020-12-09 10:25:56 +02:00
Aliaksandr Valialkin
fd9fd191b9 lib/promscrape/discovery/consul: properly pass Datacenter filter to Consul API server
Previously it has been passed as `sdc` query arg, while it should be passed as `dc` query arg.
See https://www.consul.io/api-docs/health#list-nodes-for-service for details.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574#issuecomment-740454170
2020-12-08 21:52:42 +02:00
Aliaksandr Valialkin
4146fc4668 all: properly handle CPU limits set on the host system/container
This can reduce memory usage on systems with enabled CPU limits.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/946
2020-12-08 21:07:29 +02:00
Aliaksandr Valialkin
364f30a6e7 lib/promscrape: store ScrapeWork items by pointer in the slice returned from get*ScrapeWork()
This should prevent from possible 'memory leaks' when a pointer to ScrapeWork item stored in the slice
could prevent from releasing memory occupied by all the ScrapeWork items stored in the slice when they
are no longer used.

See the related commit e205975716 and the related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-12-08 17:50:05 +02:00
Aliaksandr Valialkin
1906f841c9 app/vmselect/promql: do not reduce lookbehind window for any_rollup_func(m) to -search.maxStalenessInterval. It should equal to step value passed to /api/v1/query_range as most users expect 2020-12-08 15:16:17 +02:00
kreedom
26df320be5 Create CODE_OF_CONDUCT_RU.md 2020-12-08 14:59:01 +02:00
Aliaksandr Valialkin
b6b1b06d70 app/{vmalert,vmagent}: skip empty values in -remoteWrite.label and -label lists 2020-12-08 14:55:13 +02:00
kreedom
5454668709 Update CODE_OF_CONDUCT.md 2020-12-08 14:48:49 +02:00
Aliaksandr Valialkin
c8133cbb16 .github/ISSUE_TEMPLATE/bug_report.md: mention that it is recommended reading troubleshooting docs before reporting the bug 2020-12-08 14:35:53 +02:00
Aliaksandr Valialkin
30deb2b548 .github/ISSUE_TEMPLATE/bug_report.md: recommend updating to the latest release before reporting the bug 2020-12-08 14:33:42 +02:00
Aliaksandr Valialkin
08b71d2067 lib/promscrape: re-use strings for labels stored in ScrapeWork
This should reduce memory usage when working with big number of scrape targets.
2020-12-08 12:22:59 +02:00
Aliaksandr Valialkin
0f1b969aa6 lib/promscrape: export vm_promscrape_scrapers_{started|stopped}_total metrics for monitoring target churn rate 2020-12-08 11:57:52 +02:00
Aliaksandr Valialkin
c7ac7c1807 lib/promscrape: store targetStatus entries in targetStatusMap by pointer instead of by value
This guarantees that GC frees memory occupied by targetStatus after it is unregistered from targetStatusMap.
2020-12-08 11:50:48 +02:00
Aliaksandr Valialkin
05813259dc lib/promscrape: export vm_promscrape_active_scrapers{type="<sd_type>"} metric for tracking the number of active scrapers per each service discovery type 2020-12-08 01:54:23 +02:00
Aliaksandr Valialkin
9c1c9d8e76 lib/promscrape: do not enable strict config parsing when -promscrape.config.dryRun command-line flag is passed
Strict parsing for -promscrape.config can be enabled by passing `-promscrape.config.strictParse` command-line flag.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/944
2020-12-07 13:18:50 +02:00
Aliaksandr Valialkin
007dbf273d app/vmselect/graphite: remove duplicate name tag from /tags/autoComplete/tags handler
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/942
2020-12-07 01:08:52 +02:00
Aliaksandr Valialkin
82972a8f2a lib/promscrape: mention in scrape error message that scrape errors can be disabled by -promscrape.suppressScrapeErrors command-line flag 2020-12-06 23:27:58 +02:00
Roman Khavronenko
83c0c241a7 dashboard: release to grafana.com (#940) 2020-12-06 13:34:19 +02:00
Aliaksandr Valialkin
299a35948c lib/promscrape: clarify error message on failed connection to scrape target when -enableTCP6 command-line flag isn't set 2020-12-06 13:18:39 +02:00
Aliaksandr Valialkin
b0e4b234cb lib/protoparser/influx: allow multiple whitespace chars between measurement, fields and timestamp in Influx line protocol 2020-12-06 12:01:27 +02:00
Roman Khavronenko
6f0038209c dashboard: Prometheus compatibility fix for Storage full ETA panel (#938) 2020-12-06 01:20:07 +02:00
Aliaksandr Valialkin
ae1db8fa08 docs/CHANGELOG.md: cut v1.49.0 2020-12-05 13:49:04 +02:00
Aliaksandr Valialkin
0e46e8df8d vendor: make vendor-update 2020-12-05 12:46:54 +02:00
Aliaksandr Valialkin
d305cc2017 deployment/docker: update Go builder from v1.15.5 to v1.15.6
This fixes issues found in Go since v1.15.5 - see https://github.com/golang/go/issues?q=milestone%3AGo1.15.6+label%3ACherryPickApproved
2020-12-05 12:37:34 +02:00
Aliaksandr Valialkin
e2e8ef86d9 app/vmselect/promql: add count_eq_over_time(m[d], N) and count_ne_over_time(m[d], N) for calculating the number of samples in m over d that are equal / not equal to N 2020-12-05 12:30:46 +02:00
Aliaksandr Valialkin
52915c8f7e lib/promscrape/discoveryutils: remove limit on the number of concurrently running blocking queries
Too low limit could result in unexpected errors when performing big number of blocking queries.
2020-12-05 12:15:52 +02:00
Aliaksandr Valialkin
eb27dbde13 lib/flagutil: make golangci-lint happy by using strings.TrimPrefix instead of manual prefix removal via strings.HasPrefix 2020-12-03 22:07:57 +02:00
Aliaksandr Valialkin
9d787f9edd all: do not print usage info for all the flags when incorrect command-line flag is passed
This should improve usability for VictoriaMetrics apps that have big number of command-line flags,
i.e. all the apps.
2020-12-03 21:47:37 +02:00
Aliaksandr Valialkin
66379cc69f app/vmselect/promql: add label_uppercase(q, label1, ... labelN) and label_lowercase(q, label1, ... labelN) functions
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/936
2020-12-03 21:47:36 +02:00
Aliaksandr Valialkin
d0e1589ea9 vendor: make vendor-update 2020-12-03 20:16:30 +02:00
Aliaksandr Valialkin
de0643fab5 lib/promscrape/discovery/consul: log the time needed for stopping Consul service watcher 2020-12-03 20:14:55 +02:00
Aliaksandr Valialkin
9cd8eb92f1 lib/promscrape/discovery/consul: make sure that block response contains X-Consul-Index header 2020-12-03 20:05:23 +02:00
Aliaksandr Valialkin
5009b25a03 lib/promscrape: code cleanup after c6dee6c52d
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574
2020-12-03 19:50:53 +02:00
Nikolay
c6dee6c52d Changes consul discovery api (#921)
* adds consul watch api,
it must reduce load on consul service with blocking wait requests,
changed discoveryClient api with fetchResponseMeta callback.

* small fix

* fix after master merge

* adds watch client at discovery utils

* fixes consul watcher,
changes namings,
fixes data race

* small typo fix

* sanity fix

* fix naming and service node update
2020-12-03 19:47:40 +02:00
Aliaksandr Valialkin
a7fc84b390 docs/Single-server-VictoriaMetrics.md: update features chapter according to the latest developments 2020-12-03 13:01:19 +02:00
Aliaksandr Valialkin
2f777d996d README.md: remove duplicate provide word 2020-12-03 09:47:34 +02:00
Aliaksandr Valialkin
44a34a0f5f app/vmselect/promql: make fmt 2020-12-02 21:33:35 +02:00
Aliaksandr Valialkin
4910bac46b docs/FAQ.md: add a link to https://valyala.medium.com/prometheus-vs-victoriametrics-benchmark-on-node-exporter-metrics-4ca29c75590f in performance comparisons section 2020-12-02 21:25:52 +02:00
Aliaksandr Valialkin
1982505c2b app/vmselect/promql: return nan from minute(m) when m equals to nan
This aligns VictoriaMetrics behaviour with Prometheus behaviour.

The issue has been spotted in https://promlabs.com/promql-compliance-test-results/2020-12-01/victoriametrics/
2020-12-02 20:16:58 +02:00
Aliaksandr Valialkin
9d87496b50 app/vmselect/promql: do not return 0 value from sum_over_time(m[d]) when there are no samples on the given d window.
This aligns the behaviour of `sum_over_time()` with other `_over_time()` functions and with Prometheus behavior.
2020-12-02 13:12:50 +02:00
Aliaksandr Valialkin
91a4c279cc app/vmselect: return metric values from time() cmp_op metric query when cmp_op comparison is true
This aligns MetricsQL behavior to Prometheus' one.

The issue has been identified at https://promlabs.com/promql-compliance-test-results/2020-12-01/victoriametrics/
2020-12-02 12:09:34 +02:00
Aliaksandr Valialkin
7590b8477b vendor: upgrade github.com/VictoriaMetrics/metricsql from v0.7.2 to v0.7.3
This fixes parsing of hex numbers in MetricsQL such as 0x3b

The bug has been detected at https://promlabs.com/promql-compliance-test-results/2020-12-01/victoriametrics/
2020-12-02 08:10:43 +02:00
Aliaksandr Valialkin
b1fd390e16 docs/Articles.md: add a link to https://victoriametrics.medium.com/how-to-monitor-go-applications-with-victoriametrics-c04703110870 2020-12-02 07:21:20 +02:00
Aliaksandr Valialkin
5bf14991a3 docs/Articles.md: add a link to an article on how Percona PMM has been migrated from Prometheus to VictoriaMetrics 2020-12-02 07:21:19 +02:00
Aliaksandr Valialkin
700bda8e2e app/vmselect/promql: return nan from a >bool b if a is nan in the same way as Prometheus does 2020-12-02 00:28:26 +02:00
Aliaksandr Valialkin
efdc3c71af app/vmselect/searchutils: return elapsed time in Deadline.String() output
This should improve debuggability for error messages containing Deadline.String() output
2020-12-01 00:15:18 +02:00
Aliaksandr Valialkin
ca091bade3 app/vmbackup/snapshot: add missing status code check for the returned response when working with snapshot API
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/929
2020-11-30 14:49:39 +02:00
Aliaksandr Valialkin
b35b3dc043 app/vmbackup/snapshot: log url and response body on failed JSON response parsing
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/929
2020-11-29 12:16:16 +02:00
Nikolay
0463cb5550 fixes checksum calculation (#928)
* fixes checksum calculation,
'for' rule param wasn't marshaled properly during checksum calculation

* fixes error
2020-11-29 09:48:42 +02:00
Aliaksandr Valialkin
357f886f97 Revert "lib/storage: add missing (AccountID, ProjectID) in MetricName.String() test"
This reverts commit f3e80eb70e493620e42a0cc22a62c9af75076c77, since it isn't needed for single-node version of VictoriaMetrics
2020-11-29 01:27:25 +02:00
Aliaksandr Valialkin
ace969d595 lib/storage: add missing (AccountID, ProjectID) in MetricName.String() test 2020-11-29 01:26:04 +02:00
Aliaksandr Valialkin
32869e4c0f lib/promscrape: fix failing tests after a906b3862f 2020-11-29 01:26:03 +02:00
Aliaksandr Valialkin
a906b3862f lib/protoparser/prometheus: properly parse OpenMetrics timestamps
OpenMetrics timestamps are floating-point numbers, that represent Unix timestamp in seconds.
This differs from Prometheus exposition format, where timestamps are integer numbers representing Unix timestamp in milliseconds.
2020-11-27 14:54:29 +02:00
Aliaksandr Valialkin
eedb79ead8 sync with README.md from single-node version 2020-11-27 13:22:41 +02:00
Karsonito
ae457828bc update carbonapi link (#927) 2020-11-27 13:20:53 +02:00
Aliaksandr Valialkin
51652f638f docs/Articles.md: add https://valyala.medium.com/prometheus-vs-victoriametrics-benchmark-on-node-exporter-metrics-4ca29c75590f 2020-11-27 10:24:50 +02:00
Aliaksandr Valialkin
3a32789352 lib/promscrape: reduce memory allocations when unpacking gzipped responses received from scrape targets 2020-11-26 18:32:06 +02:00
Aliaksandr Valialkin
2cea4d403f all: typo fix: thouthand->thousand 2020-11-26 13:33:46 +02:00
Aliaksandr Valialkin
3dffc6099e vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.8 to v1.0.9 2020-11-26 13:27:12 +02:00
Aliaksandr Valialkin
b0a5c382ee lib/promscrape: release http response non-200 status code 2020-11-26 13:25:17 +02:00
Aliaksandr Valialkin
1de1774de6 vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.7 to v1.0.8 2020-11-26 12:13:03 +02:00
John Belmonte
067188501f dashboard: incorporate dedup rate into storage ETA (#920)
* dashboard: incorporate dedup rate into storage ETA

address #916

* exclude dedups during query and simplify
2020-11-26 10:27:54 +02:00
Aliaksandr Valialkin
4cb6bcd2d7 docs/CHANGELOG.md: cut v1.48.0 release 2020-11-26 02:05:57 +02:00
Aliaksandr Valialkin
6b1317b6a4 docs/CHANGELOG.md: add a link to Netflix Eureka - https://github.com/Netflix/eureka 2020-11-26 01:36:20 +02:00
Aliaksandr Valialkin
b7fcdb528d app/{vmagent,victoria-metrics}: add -dryRun option and make more clear handling for -promscrape.config.dryRun 2020-11-25 22:59:13 +02:00
Aliaksandr Valialkin
dabbf930d8 app/vmagent: do not enable -promscrape.config.strictParse when -dryRun command-line flag is set
Users can specify -promscrape.config.strictParse if -promscrape.config shouldn't contain unknown config entries
2020-11-25 22:26:25 +02:00
Aliaksandr Valialkin
1c669a69a8 lib/mergeset: tune the number of rawItemsBlocks to merge at once
512 blocks give higher ingestion performance and slightly lower memory usage
2020-11-25 21:52:52 +02:00
Aliaksandr Valialkin
7119f294f3 lib/mergeset: help GC by removing references to slices in inmemoryBlock.Reset 2020-11-25 21:19:43 +02:00
Aliaksandr Valialkin
8a057e705a lib/storage: log metric name plus all its labels when the metric timestamp is outside the configured retention
This should simplify debugging when the source of the metric with unexpected timestamp must be found.
2020-11-25 14:41:37 +02:00
Aliaksandr Valialkin
b65236530c lib/storage: typo fix in error message: allowd->allowed 2020-11-25 14:15:42 +02:00
Aliaksandr Valialkin
ae04378424 lib/protoparser/prometheus: properly parse "infinity" values in OpenMetrics format
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/924
2020-11-24 19:03:38 +02:00
Aliaksandr Valialkin
bf95fbfc1d lib/logger: disable rate limiting for error and warn logs by default 2020-11-24 12:42:25 +02:00
Aliaksandr Valialkin
78d2715d04 all: spelling fix: superflouos->superfluous. This is a follow-up for 0acdab3ab9 2020-11-24 12:42:22 +02:00
Aliaksandr Valialkin
d0ffb49ee2 docs/CHANGELOG.md: mention that /tags/delSeries handler is supported after f0c207fae2 2020-11-24 12:34:56 +02:00
Aliaksandr Valialkin
b7f4fc6e0d lib/protoparser/prometheus: properly parse metrics with exemplars
Exemplars have been introduced in OpenMetrics - see https://github.com/OpenObservability/OpenMetrics/blob/master/OpenMetrics.md#exemplars-1
Previously VictoriaMetrics couldn't parse the following metric

    foo{bar="baz"} 123 # exemplar here

This commit fixes this. Note that VictoriaMetrics ignores the exemplar for now.
2020-11-24 12:34:56 +02:00
Aliaksandr Valialkin
d48363534a docs/Articles.md: add recent articles about VictoriaMetrics 2020-11-24 12:34:56 +02:00
BigFish
0acdab3ab9 Update main.go (#922)
fix spelling mistake
2020-11-23 17:33:17 +02:00
Aliaksandr Valialkin
7e8dcf9ddc app/vmbackup: cosmetic fixes 2020-11-23 17:10:04 +02:00
Aliaksandr Valialkin
aa90b93778 lib/promscrape: expose __meta_ec2_ipv6_addresses label for ec2_sd_config like Prometheus will do in the next release 2020-11-23 16:56:42 +02:00
Aliaksandr Valialkin
de523c81b9 lib/promscrape: add filters option to dockerswarm_sd_config like Prometheus did in v2.23.0 2020-11-23 16:27:40 +02:00
Aliaksandr Valialkin
a724dde90a app/vmselect: protect /tags/delSeries with -deleteAuthKey in the same way as /api/v1/admin/tsdb/delete_series 2020-11-23 15:35:59 +02:00
Aliaksandr Valialkin
fb8e56d8a2 docs/Cluster-VictoriaMetrics.md: sync with cluster branch 2020-11-23 15:32:56 +02:00
Aliaksandr Valialkin
f0c207fae2 app/vmselect: add /tags/delSeries handler from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#removing-series-from-the-tagdb
2020-11-23 15:27:21 +02:00
Aliaksandr Valialkin
d3794eb994 app/{vminsert,vmselect}: move /tags/tagSeries and /tags/tagMultiSeries api from vminsert to vmselect
This is needed for consistency, since all the `/tags*` api handlers are located in vmselect.
2020-11-23 12:33:19 +02:00
Aliaksandr Valialkin
f765985947 lib/fs: replace fs.OpenReaderAt with fs.MustOpenReaderAt
All the callers for fs.OpenReaderAt expect that the file will be opened.
So it is better to log fatal error inside fs.MustOpenReaderAt instead of leaving this to the caller.
2020-11-23 09:57:21 +02:00
Aliaksandr Valialkin
e614a14b21 docs: sync with cluster branch 2020-11-23 00:42:04 +02:00
Aliaksandr Valialkin
9d160f9048 lib/promscrape: hint that -enableTCP6 command-line flag can be used for connecting to IPv6 addresses 2020-11-21 14:39:00 +02:00
Aliaksandr Valialkin
d7932775cc lib/promscrape/discovery/eureka: follow-up after eec76718e9 2020-11-20 14:00:12 +02:00
Nikolay
eec76718e9 Adds eureka service discovery (#913)
* Adds eureka service discovery
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/851
Netflix service discovery for AWS

* Apply suggestions from code review

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-20 13:38:12 +02:00
John Belmonte
093a891762 MetricsQL docs: parameter consistency (#915)
* MetricsQL docs: parameter consistency

if I understand correctly:
  * `fun(q)` - fun takes instant vector
  * `fun(m[d])` - fun takes range vector

* Update docs/MetricsQL.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-20 11:42:22 +02:00
Aliaksandr Valialkin
c03e4ef9d6 vendor: make vendor-update 2020-11-19 19:21:12 +02:00
Aliaksandr Valialkin
de7f315231 docs/CHANGELOG.md: mention that slow query log now contains remote client address 2020-11-19 12:41:17 +02:00
Aliaksandr Valialkin
97a0c80904 lib/logger: follow-up for 09105ff49c 2020-11-19 12:37:00 +02:00
Nikolay
09105ff49c Adds log suppression per caller (#908)
* Adds log suppression per caller
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/905

* fixes style and report message
2020-11-19 12:17:23 +02:00
Aliaksandr Valialkin
2859a452d4 app/vmselect: add remoteAddr to slow query log in order to improve debuggability
This will simplify identifying the client that sends slow queries to VictoriaMetrics.
2020-11-18 20:38:32 +02:00
Aliaksandr Valialkin
170e2f54ab docs/CHANGELOG.md: mention about snap install victoriametrics 2020-11-18 19:49:54 +02:00
Aliaksandr Valialkin
8b116b619a docs/CHANGELOG.md: sync with cluster branch 2020-11-18 19:46:05 +02:00
Aliaksandr Valialkin
6e6d62284c docs: make snap install victoriametrics more prominent in docs 2020-11-18 19:44:46 +02:00
S.F
a02a12f639 Fix restart and code review (#912)
On start the daemon may write an empty line.
Log as warning non managed log level.

Thanks Andrew .F. for pointers
2020-11-18 19:30:25 +02:00
Nikolay
f818ab497b Fixes snap script (#909) 2020-11-18 17:46:31 +03:00
Aliaksandr Valialkin
b73802372a docs/Single-server-VictoriaMetrics.md: an attempt to fix markdown formatting in Graphite Tags API section 2020-11-18 14:41:03 +02:00
Aliaksandr Valialkin
2f05f90888 docs: lowercase adidas trademark according to their request 2020-11-18 13:47:35 +02:00
Aliaksandr Valialkin
7e4bcbd853 docs/Cluster-VictoriaMetrics.md: adjust RAM sizing recommendations for vmstorage nodes
It is recommended to have at least 50% of free RAM on vmstorage nodes in order to handle possible
RAM usage spikes during rolling upgrade for vmstorage nodes when time series
are re-routed from temporarily unavailable node to the remaining active nodes.
2020-11-18 13:04:43 +02:00
Aliaksandr Valialkin
a11659013f docs/Single-server-VictoriaMetrics.md: make consistent section title sizes 2020-11-18 12:35:52 +02:00
Aliaksandr Valialkin
a6b2b2c005 lib/logger: add -loggerWarnsPerSecondLimit command-line flag for rate limiting of WARN log messages
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/905
2020-11-18 03:43:37 +02:00
Nikolay
c2afa3fdd7 adds snap package for victoria-metrics (#904)
* adds snap package for victoria-metrics

* Update README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-18 02:00:06 +02:00
Aliaksandr Valialkin
d4cc934c77 README.md: sync with docs/Single-server-VictoriaMetrics.md 2020-11-18 01:38:45 +02:00
Aliaksandr Valialkin
870270c75e docs/Single-server-VictoriaMetrics.md: mention that /internal/force_flush endpoint is mostly needed for testing and debugging 2020-11-18 01:37:51 +02:00
S.F
7addbfc831 fix multi instance logging name, add restore, fix rcctl (#902) 2020-11-16 23:18:13 +02:00
Aliaksandr Valialkin
1c477bc2fc docs/CHANGELOG.md: cut v1.47.0 release 2020-11-16 21:00:06 +02:00
Aliaksandr Valialkin
d57214244d Makefile: add -d flag to go get in vendor-update target
This should skip unnecessary build step for the updated packages
2020-11-16 20:53:25 +02:00
Aliaksandr Valialkin
84b986b2fc vendor: make vendor-update 2020-11-16 20:53:17 +02:00
Aliaksandr Valialkin
1052effb6d docs/Cluster-VictoriaMetrics.md: make docs-sync after 57dc152e9d 2020-11-16 20:20:31 +02:00
Aliaksandr Valialkin
266788be14 app/vmselect: use storage.NewSearchQuery() instead of constructing storage.SearchQuery in-place
This should prevent from bugs when AccountID and ProjectID aren't set in storage.SearchQuery.
2020-11-16 18:24:00 +02:00
Aliaksandr Valialkin
cf18df367d app/vmselect/netstorage: apply Graphite filter after substituting __name__ with name 2020-11-16 15:52:16 +02:00
Aliaksandr Valialkin
72ab3f7230 docs/Cluster-VictoriaMetrics.md: sync with cluster branch 2020-11-16 15:35:37 +02:00
Aliaksandr Valialkin
30a922f383 docs/CHANGELOG.md: mention about Graphite Tags API implementation 2020-11-16 15:34:20 +02:00
Aliaksandr Valialkin
2c67232565 app/vmselect/graphite: add /tags/autoComplete/values handler from Graphite Tags API 2020-11-16 15:29:35 +02:00
Aliaksandr Valialkin
86f99c6b55 app/vmselect/graphite: add /tags/autoComplete/tags handler from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support
2020-11-16 14:50:05 +02:00
Aliaksandr Valialkin
3c1434118e app/vmselect/prometheus: return __name__ label if match[] query to /api/v1/labels matches at least a single time series 2020-11-16 13:54:34 +02:00
Aliaksandr Valialkin
27a417bcd3 app/vmselect/prometheus: improve performance for /api/v1/labels and /api/v1/label/<labelName>/values on time ranges exceeding one day when match[] query arg is set 2020-11-16 13:51:59 +02:00
Aliaksandr Valialkin
6fa806f1ca app/vmselect/prometheus: fix deadlock in /api/v1/series on a time range exceeding one day 2020-11-16 13:30:47 +02:00
Aliaksandr Valialkin
f5500251d9 docs/Cluster-VictoriaMetrics.md: sync with cluster branch 2020-11-16 13:21:37 +02:00
Aliaksandr Valialkin
5d6d2ef3a6 docs/CHANGELOG.md: mention about improved performance for /api/v1/series on a time range exceeding one day 2020-11-16 13:21:13 +02:00
Aliaksandr Valialkin
0208d8c103 lib/storage: add a test for Storage.SearchMetricNames 2020-11-16 13:15:16 +02:00
Aliaksandr Valialkin
465923b181 app/vmselect/graphite: add /tags/findSeries handler from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#exploring-tags
2020-11-16 12:53:13 +02:00
Aliaksandr Valialkin
a1f3795b78 docs/Cluster-VictoriaMetrics.md: sync with cluster branch 2020-11-16 04:10:39 +02:00
Aliaksandr Valialkin
414cd39659 app/vmselect/graphite: apply filter then limit 2020-11-16 04:09:14 +02:00
Aliaksandr Valialkin
d100341394 app/vmselect/graphite: add /tags/<tag_name> handler for Graphite Tags API 2020-11-16 03:42:25 +02:00
Aliaksandr Valialkin
6251762787 app/vmselect/graphite: add /tags handler from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#exploring-tags
2020-11-16 03:00:01 +02:00
Aliaksandr Valialkin
48d033a198 app/vminsert: add /tags/tagSeries and /tags/tagMultiSeries handlers from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#adding-series-to-the-tagdb
2020-11-16 02:39:58 +02:00
Aliaksandr Valialkin
4aaee33860 lib/storage: do not show artificially created label for reverse Graphite labels at /api/v1/labels page 2020-11-16 00:44:35 +02:00
Aliaksandr Valialkin
6c0d36e4a9 app/vmselect: propagate errors from vmstorage to response to the client if -search.denyPartialResponse command-line flag is set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/891

This commit also adds `"isPartial":{true|false}` field to `/api/v1/*` responses. `"isPartial":true` is set when the response
is based on a partial data because some of vmstorage nodes weren't available during query processing.
2020-11-14 12:47:48 +02:00
Aliaksandr Valialkin
ef9a8989fd docs/Single-server-VictoriaMetrics.md: document /internal/force_flush endpoint 2020-11-13 18:43:10 +02:00
Aliaksandr Valialkin
5d27642106 docs/Single-server-VictoriaMetrics.md: explain why recently inserted data may be unavailable for querying for a few seconds 2020-11-13 18:33:59 +02:00
Aliaksandr Valialkin
0deabbbb4a lib/protoparser/promremotewrite: log the time spent on unsuccessful data read from the network
This should help with debugging `connection timed out` errors.
2020-11-13 17:49:12 +02:00
Aliaksandr Valialkin
67b41c080d docs/CHANGELOG.md: mentioned that Go builder has been updated from v1.15.4 to v1.15.5
See 3fa9ab4a49 for details.
2020-11-13 16:22:12 +02:00
Vasily
6fcbd17bdd Add omitempty for DisableCompression and DisableKeepAlive fields in ScrapeConfig (#796)
* Add omitempty for DisableCompression and DisableKeepAlive fields in ScrapeConfig

* Add omitempty annotation to all the default/optional values

* Fix annotations after review
2020-11-13 16:19:05 +02:00
Aliaksandr Valialkin
9ce5c0c33f docs/Single-server-VictoriaMetrics.md: sync with single-node README.md 2020-11-13 16:03:21 +02:00
Anton Markelov
c5daf8a27b Add note about maxUniqueTimeseries for export (#898) 2020-11-13 15:31:07 +02:00
Aliaksandr Valialkin
d9d01f976b app/vmselect/promql: remove spikes from increase() and delta() results on time series with sparse irregular data points
Do not take into account sparse data point value if the next point is located too far from the current point.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/894
2020-11-13 15:23:44 +02:00
Aliaksandr Valialkin
1f19c167a4 app/vmselect/promql: assume that time series value doesn't change during gaps when calculating increase() and delta()
This should remove unexpected spikes at the end of gaps.
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/894
2020-11-13 14:59:24 +02:00
Aliaksandr Valialkin
cdf1e6684b lib/protoparser/opentsdbhttp: increment errors counter on unmarshal errors
This is a follow-up for 149c0c4a6d
2020-11-13 13:23:17 +02:00
Aliaksandr Valialkin
28ea993872 vendor: make vendor-update 2020-11-13 13:09:09 +02:00
Aliaksandr Valialkin
149c0c4a6d lib/protoparser: propagate callback error to the caller of ParseStream for every supported data ingestion protocols
The caller of ParseStream then can generate HTTP 503 responses for non-nil errors occurred in callbacks when processing incoming requests.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/896
2020-11-13 13:05:24 +02:00
Aliaksandr Valialkin
4f8a3af061 lib/protoparser/promremotewrite: synchronously process Prometheus remote_write requests
There is no reason in processing these requests asynchronously in the face of https://github.com/VictoriaMetrics/VictoriaMetrics/issues/896
Synchronous processing code is easier to read and understand than the previous async code
2020-11-13 12:17:25 +02:00
Aliaksandr Valialkin
57a4af98fa lib/protoparser/promremotewrite: forward errors, which can occur during data ingestion, to the caller of ParseStream, so it could properly return HTTP 503 status code on non-nil error
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/896
2020-11-13 11:01:07 +02:00
Aliaksandr Valialkin
3fa9ab4a49 deployment/docker: update Go builder from v1.15.4 to v1.15.5
This fixes the following possible issues in Go - https://github.com/golang/go/issues?q=milestone%3AGo1.15.5+label%3ACherryPickApproved
2020-11-13 11:01:06 +02:00
Aliaksandr Valialkin
47a038401b all: consistently return text-based HTTP responses with charset=utf-8
This is a follow-up for https://github.com/VictoriaMetrics/VictoriaMetrics/pull/897
2020-11-13 10:35:41 +02:00
faceair
077f8cbe1c add charset on targets response (#897) 2020-11-13 10:17:37 +02:00
Aliaksandr Valialkin
4057305148 docs/vmagent.md: added a link to https://valyala.medium.com/how-to-use-relabeling-in-prometheus-and-victoriametrics-8b90fc22c4b2 into Relabeling section 2020-11-12 12:27:06 +02:00
Aliaksandr Valialkin
bb06b98202 docs/vmagent.md: typo fix 2020-11-11 16:04:46 +02:00
Aliaksandr Valialkin
4adb96161a docs/vmagent.md: add Configuration update section 2020-11-11 16:01:15 +02:00
Aliaksandr Valialkin
4c8e01b312 docs/Single-server-VictoriaMetrics.md: document -search.treatDotsAsIsInRegexps command-line option 2020-11-11 14:59:06 +02:00
immerrr again
51c529a2b6 app/vmstorage: add "/internal/force_flush" endpoint (#893) 2020-11-11 14:40:27 +02:00
Aliaksandr Valialkin
1437d6db0c docs/Single-server-VictoriaMetrics.md: small clarifications in VictoriaMetrics features 2020-11-11 13:47:45 +02:00
Aliaksandr Valialkin
e60c0d0bae docs/Single-server-VictoriaMetrics.md: update the link to enterprise features 2020-11-11 13:42:11 +02:00
Aliaksandr Valialkin
462913ed2f docs/Single-server-VictoriaMetrics.md: mention that /api/v1/status/tsdb handler accepts topN and date query args 2020-11-11 13:38:00 +02:00
Aliaksandr Valialkin
1e69c151eb docs/Cluster-VictoriaMetrics.md: mention about optional topN and date query args for /api/v1/status/tsdb handler 2020-11-11 13:35:38 +02:00
Aliaksandr Valialkin
348edd92fe app/vmselect: add -search.treatDotsAsIsInRegexps command-line flag for automatic escaping of dots in regexp label filters 2020-11-11 12:39:07 +02:00
Aliaksandr Valialkin
352485b0de docs/Single-server-VictoriaMetrics.md: clarify which directories can be removed when recovering from data corruption 2020-11-11 12:39:07 +02:00
Aliaksandr Valialkin
9e40eec7d8 docs/Single-server-VictoriaMetrics.md: add a hint that case studies can be read by clicking on the corresponding link 2020-11-11 12:39:07 +02:00
Aliaksandr Valialkin
e205975716 lib/promscrape: make a copy of ScrapeWork from discovered []ScrapeWork slice instead of referring to an item in this slice
This should prevent from holding previously discovered []ScrapeWork slices when a part of discovered targets changes over time.
This should reduce memory usage for the case when big number of discovered scrape targets changes over time.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-10 16:13:05 +02:00
Aliaksandr Valialkin
6e668fd480 lib/promscrape: pre-allocate slice for discovered targets based on previously discovered targets
This should reduce load on GC a bit when discovering big number of scrape targets
2020-11-10 15:56:51 +02:00
Aliaksandr Valialkin
47390d8947 app/vmselect/promql: do not return data points in the end of the selected time range for time series ending in the middle of the selected time range
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/887
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/845
2020-11-10 14:51:44 +02:00
Aliaksandr Valialkin
ba4a2c8bca app/vmselect: typo fix in a description for -search.minStalenessInterval: mimimum->minimum 2020-11-10 01:18:08 +02:00
Aliaksandr Valialkin
0d7a3f4eb3 docs/CHANGELOG.md: mention about explicit setting of extra labels in alert entities (see 3adf8c5a6f)
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870
2020-11-10 00:40:51 +02:00
Aliaksandr Valialkin
fc499ab501 Move CHANGELOG.md to docs/CHANGELOG.md 2020-11-10 00:36:32 +02:00
Roman Khavronenko
3adf8c5a6f vmalert: explicitly set extra labels to alert entities (#886)
The previous implementation treated extra labels (global and rule labels) as
separate label set to returned time series labels. Hence, time series always contained
only original labels and alert ID was generated from sorted labels key-values.
Extra labels didn't affect the generated ID and were applied on the following actions:
- templating for Summary and Annotations;
- persisting state via remote write;
- restoring state via remote read.

Such behaviour caused difficulties on restore procedure because extra labels had to be dropped
before checking the alert ID, but that not always worked. Consider the case when expression
returns the following time series `up{job="foo"}` and rule has extra label `job=bar`.
This would mean that restored alert ID will be always different to the real time series because
of collision.

To solve the situation extra labels are now always applied beforehand and `vmalert` doesn't
store original labels anymore. However, this could result into a new error situation.
Consider the case when expression returns two time series `up{job="foo"}` and `up{job="baz"}`,
while rule has extra label `job=bar`. In such case, applying extra labels will result into
two identical time series and `vmalert` will return error:
 `result contains metrics with the same labelset after applying rule labels`

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870
2020-11-10 00:27:32 +02:00
Denis Fondras
0d1855f661 Update OpenBSD port (#888)
* Update OpenBSD port

* Delete PLIST.orig

Co-authored-by: Charlie Root <root@o3.lab.ledeuns.net>
2020-11-10 00:24:22 +02:00
Aliaksandr Valialkin
bcd139362b lib/promscrape: add -promscrape.dropOriginalLabels command-line flag for reducing memory usage when discovering big number of scrape targets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-10 00:19:57 +02:00
Aliaksandr Valialkin
6c24c5caa3 lib/promscrape: further reduce memory usage for per-scrape target labels by making a copy of actually used labels
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-09 10:54:42 +02:00
Aliaksandr Valialkin
ef6ab3d2c9 docs/Single-server-VictoriaMetrics.md: typo fix 2020-11-08 13:40:25 +02:00
Aliaksandr Valialkin
41813eb87a CHANGELOG.md: cut v1.46.0 2020-11-07 17:52:45 +02:00
Artem Navoiev
4e391a5e39 [deployment] add vmalert + alertmanager to docker compose (#885) 2020-11-07 17:00:23 +02:00
Aliaksandr Valialkin
bb3b513bdd docs/CHANGELOG.md: make docs-sync 2020-11-07 16:30:58 +02:00
Aliaksandr Valialkin
83df20b5b5 lib/promscrape: clean references to label name and label value strings after applying per-target relabeling
This should reduce memory usage when per-target relabeling creates big number of temporary labels
with long names and/or values.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-07 16:19:59 +02:00
Aliaksandr Valialkin
9e83335ca9 lib/promscrape/discovery/kubernetes: go fmt 2020-11-07 13:03:49 +02:00
Aliaksandr Valialkin
5407eed2f6 lib/promscrape/discovery/kubernetes: reduce memory usage for labels when discovering big number of scrape targets by using string concatenation instead of fmt.Sprintf
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-07 13:03:08 +02:00
Aliaksandr Valialkin
188325f0fc lib/promscrape: eliminate data race in stream parse mode
Previously `-promscrape.streamParse` mode could result in garbage labels for the scraped metrics because of data race.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825#issuecomment-723198247
2020-11-07 12:45:17 +02:00
Aliaksandr Valialkin
55e98e265e docs/CHANGELOG.md: add CHANGELOG header 2020-11-07 01:15:00 +02:00
Aliaksandr Valialkin
dbbc160a40 deployment/docker: update Go builder from v1.15.3 to v1.15.4
This fixes issues found in Go - see https://github.com/golang/go/issues?q=milestone%3AGo1.15.4+label%3ACherryPickApproved
2020-11-07 00:49:42 +02:00
Aliaksandr Valialkin
9c0e2d2a6e vendor: make vendor-update 2020-11-06 13:58:12 +02:00
Aliaksandr Valialkin
82ce930e59 deployment/docker: update root image from alpine:3.12 to alpine:3.12.1 2020-11-06 13:54:36 +02:00
Aliaksandr Valialkin
dd6bfa50e9 app/vmselect/promql: code cleanup after 43823addea 2020-11-06 01:30:50 +02:00
n4mine
43823addea app/vmselect/promql: fix the case when the parameter of maxValue(), minValue() starts with NaN; it caused {top,bottom}k_{max,min} to return inappropriate results (#883) 2020-11-06 01:29:24 +02:00
Aliaksandr Valialkin
5943f49f60 docs/Articles.md: add a link to https://valyala.medium.com/how-to-use-relabeling-in-prometheus-and-victoriametrics-8b90fc22c4b2 2020-11-05 19:17:59 +02:00
Aliaksandr Valialkin
9deda5107b docs/MetricsQL.md: fix a link to with templates page 2020-11-05 16:12:32 +02:00
Aliaksandr Valialkin
07f7245aeb docs/Articles.md: move third-party articles to the top 2020-11-05 15:03:16 +02:00
Aliaksandr Valialkin
944c5ea331 docs: add CHANGELOG.md 2020-11-05 14:58:22 +02:00
Aliaksandr Valialkin
de81472724 CHANGELOG.md: add a link to issue related to /ready endpoint
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/875
2020-11-05 14:56:41 +02:00
Aliaksandr Valialkin
f733b0ac9d CHANGELOG.md: mention about /ready endpoint in vmagent 2020-11-05 14:54:51 +02:00
Aliaksandr Valialkin
368b69b4c4 app/vmselect: properly handle errors in GetLabelsOnTimeRange and GetLabelValuesOnTimeRange 2020-11-05 01:38:38 +02:00
Aliaksandr Valialkin
1cb78ba1a0 lib/storage: remove data race when updating rowsDeleted 2020-11-05 01:12:21 +02:00
Aliaksandr Valialkin
b378cd6ed8 app/vmselect: optimize querying for /api/v1/labels and /api/v1/label/<name>/values when start and end args are set 2020-11-05 01:01:33 +02:00
Aliaksandr Valialkin
381ad564a2 docs/vmagent.md: update after 4c808d58bf 2020-11-04 20:31:43 +02:00
Nikolay
4c808d58bf Adds ready probe (#874)
* adds leading forward slash check for scrapeURL path
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/835

* adds ready probe for scrape config initialization,
it should prevent metrics loss during vmagent rolling update,
/ready api will return 425 http code, if some scrape config still waits for initialization.

* updates docs

* Update app/vmagent/README.md

* renames var

* Update app/vmagent/README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-04 20:29:18 +02:00
Aliaksandr Valialkin
c4e8c34d0e docs/CaseStudies.md: add case study for Idealo.de 2020-11-04 20:18:33 +02:00
Aliaksandr Valialkin
b2042a1c30 lib/promscrape: docs update after e4182dd896
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878
2020-11-04 17:12:30 +02:00
Aliaksandr Valialkin
caeb74f068 app/vmselect: reduce memory usage when query touches big number of time series 2020-11-04 17:04:04 +02:00
Aliaksandr Valialkin
ae91a6883c lib/{storage,mergeset}: clean cached index blocks and inmemory blocks more aggressively
Previously such blocks were cleaned after they weren't accessed during 10 minutes.
Now they are cleaned after one minute of missing access. This should reduce memory usage in general case.
2020-11-04 17:04:04 +02:00
Nikolay
e4182dd896 reduces memory usage for vmagent, (#880)
* reduces memory usage for vmagent,
limits count of droppedTarget, that can be stored for /api/v1/targets page up to 999 items,
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878

* Update app/vmagent/README.md

* Update app/vmagent/README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-04 17:03:43 +02:00
Aliaksandr Valialkin
b9e5172aa2 Revert "lib/promscrape: add -promscrape.dropOriginalLabels command-line flag for reducing memory usage when discovering big number of scrape targets"
This reverts commit 5a9743211f.
2020-11-04 11:45:02 +02:00
Aliaksandr Valialkin
600f225cff Revert "docs/vmagent.md: mention about -promscrape.dropOriginalLabels"
This reverts commit bd81f926a4.
2020-11-04 11:44:57 +02:00
Aliaksandr Valialkin
bd81f926a4 docs/vmagent.md: mention about -promscrape.dropOriginalLabels 2020-11-04 11:16:33 +02:00
Aliaksandr Valialkin
5a9743211f lib/promscrape: add -promscrape.dropOriginalLabels command-line flag for reducing memory usage when discovering big number of scrape targets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878
2020-11-04 11:08:57 +02:00
Aliaksandr Valialkin
ca8b5745b5 lib/promscrape: reduce memory allocations in promLabelsString() function
This should help with reducing memory usage in https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878
2020-11-04 10:38:44 +02:00
Aliaksandr Valialkin
f3f62ab04e lib/storage: do not report about the need of free disk space if parts cannot be merged due to too big write amplification 2020-11-03 15:32:02 +02:00
Aliaksandr Valialkin
e0a91ef163 app/vmagent/remotewrite: drop packets only on 409 status code, since there are other valid 4xx status codes, which shouldn't result in packet drop 2020-11-03 14:25:10 +02:00
Aliaksandr Valialkin
c87fb9191e lib/storage: remove unneeded fmt.Sprintf 2020-11-03 14:20:31 +02:00
John Belmonte
51e661ecfe add short_version label to vm_app_version metric (#877)
* add short_version label to vm_app_version metric

use case:  Version panel of Grafana dashboard should use a live query, but currently it uses a template query which becomes stale.  Grafana is not able to perform regex substitution on labels.

* Update metrics.go

* fix compile
2020-11-03 14:10:42 +02:00
Aliaksandr Valialkin
cd071357d8 docs/MetricsQL.md: add missing whitespace 2020-11-02 23:49:56 +02:00
Aliaksandr Valialkin
61579680bb vendor: make vendor-update 2020-11-02 22:03:38 +02:00
Aliaksandr Valialkin
fe289331dd lib/storage: remove obsolete code 2020-11-02 19:11:59 +02:00
Aliaksandr Valialkin
d396c265a6 CHANGELOG.md: cut v1.45.0 2020-11-02 02:43:12 +02:00
Aliaksandr Valialkin
31918f60b2 vendor: make vendor-update 2020-11-02 02:41:02 +02:00
Aliaksandr Valialkin
d62ec1cb01 CHANGELOG.md: add a link to https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825 2020-11-02 02:28:55 +02:00
Aliaksandr Valialkin
5e75c389e6 app/vmselect/promql: allow dropping trailing sample only for default_rollup function
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/850
2020-11-02 02:10:59 +02:00
Aliaksandr Valialkin
c0f3be824d lib/promscrape: properly handle response body after 301 redirect
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/869
2020-11-02 01:09:52 +02:00
Aliaksandr Valialkin
ca566dce39 CHANGELOG.md: mention about packets drop in vmagent like Prometheus does 2020-11-02 00:46:49 +02:00
Aliaksandr Valialkin
0b35da159c app/vmagent/remotewrite: drop packets if remote storage returns 4xx status code
This makes consistent the behaviour with Prometheus.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/873
2020-11-02 00:45:09 +02:00
Aliaksandr Valialkin
cb71af216a app/vmselect/promql: go fmt 2020-11-02 00:15:29 +02:00
Aliaksandr Valialkin
daacbc7e34 app/vmselect/promql: do not drop trailing datapoints for instant queries
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/845
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/748
2020-11-02 00:12:37 +02:00
S.F
f477cbe861 OpenBSD packaging files (#853) 2020-11-01 23:39:25 +02:00
Roman Khavronenko
50d44d5932 dashboard: add Storage full ETA panel (#858)
* dashboard: add `Storage full ETA` panel

The new panel is supposed to help estimate the time needed to run out of free
disk space.
Thx to @belm0 @hekmon

* disable legend for `Storage full ETA` panel
2020-11-01 23:37:31 +02:00
Aliaksandr Valialkin
68d004bc05 CHANGELOG.md: mention about recently added bugfixes 2020-11-01 23:35:06 +02:00
Aliaksandr Valialkin
e277c3d07b lib/promscrape: add stream parse mode for efficient scraping of targets that expose millions of metrics 2020-11-01 23:35:06 +02:00
Aliaksandr Valialkin
29e4e7f422 lib/storage: drop more samples outside the given retention during background merge
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/17
2020-11-01 23:35:06 +02:00
Aliaksandr Valialkin
b7638f04a7 app/vmagent: expose /api/v1/targets page according to https://prometheus.io/docs/prometheus/latest/querying/api/#targets
This page is exposed by vmagent and by a single-node VictoriaMetrics

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/643
2020-11-01 23:35:06 +02:00
Aliaksandr Valialkin
c539494b36 app/vmselect/promql: allow passing optional third argument to topk_* and bottomk_* functions in order to obtain sum of time series outside top/bottom K 2020-11-01 23:35:06 +02:00
Aliaksandr Valialkin
d12c4914f0 lib/storage: properly handle the case when key="__name__" is passed to MetricName.AddTag* 2020-11-01 23:35:06 +02:00
Aliaksandr Valialkin
64e2d66014 lib/storage: code cleanup after 5bfd4e6218 2020-11-01 23:35:06 +02:00
Sergey Klyuykov
4108e85efd Fix InfluxDB support on docker-compose deployment. (#872)
* Added UDP protocol support for Graphite/Influx in docker-compose deployment.

This is necessary for Proxmox VE External Metric Server support.
https://pve.proxmox.com/wiki/External_Metric_Server

* Added `influxListenAddr` in docker-compose deployment.

This is necessary for Proxmox VE External Metric Server support.
https://pve.proxmox.com/wiki/External_Metric_Server

Additionally created Grafana Dashboard for monitoring Proxmox VE hosts.
https://grafana.com/grafana/dashboards/13307
2020-11-01 23:34:39 +02:00
Roman Khavronenko
f0bdc5716e vmalert: skip automatically added labels on alerts restore (#871)
Label `alertgroup` was introduced in #611 and automatically added to generated
time series. By mistake, this new label wasn't correctly purged on restore event
and affected alert's ID uniqueness. This commit removes `alertgroup` label
in restore function.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870
2020-10-30 08:18:20 +00:00
Nikolay
67059caa12 fixes panic at scrape error body formatting, (#868)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/864
regression after body reuse improvements
2020-10-29 17:17:52 +03:00
Nikolay
de3fe22815 adds leading forward slash check for scrapeURL path (#855)
* fixes inconsistency with prometheus behaviour for scrape targets url path.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/835
2020-10-29 08:39:42 +03:00
Sergey Kulukov
055f152246 Added UDP protocol support for Graphite/Influx in docker-compose deployment.
This is necessary for Proxmox VE External Metric Server support.
https://pve.proxmox.com/wiki/External_Metric_Server
2020-10-28 20:26:55 +02:00
Roman Khavronenko
20311f6065 dashboard: clarify the purpose of Concurrent flushes on disk panel (#849)
Current description led to confusion at https://victoriametrics.slack.com/archives/CGZF1H6L9/p1603270014273800
2020-10-28 18:10:46 +00:00
kreedom
a51a7b2a20 vmbackup fix panic when no origin fs given (#859)
* use fsnil when no origin fs
2020-10-28 20:09:10 +02:00
Aliaksandr Valialkin
bca468bb55 CHANGELOG.md: mention about recently added changes 2020-10-20 14:32:14 +03:00
Aliaksandr Valialkin
0729cc36b2 lib/memory: do not print trailing zeroes in logs for -memory.allowedPercent command-line flag 2020-10-20 14:32:07 +03:00
Aliaksandr Valialkin
5bfd4e6218 app/vmstorage: support for -retentionPeriod smaller than one month
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/173
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/17
2020-10-20 14:31:44 +03:00
Aliaksandr Valialkin
920300643a docs/vmrestore.md: make docs-sync 2020-10-20 10:48:19 +03:00
kreedom
ef77120170 vmalert - add dryRun (#842)
vmalert: add `dryRun` flag for rules validation without running the service
2020-10-20 08:15:21 +01:00
Seva Poliakov
b3f3c078e5 Fix typo in vmrestore readme 2020-10-18 15:41:39 +03:00
faceair
84e3881c0b disable response compression on websocket (#841) 2020-10-17 13:32:34 +03:00
Aliaksandr Valialkin
2ed069c3bc docs/MetricsQL.md: small clarifications 2020-10-17 12:01:43 +03:00
Aliaksandr Valialkin
28353e48ca app/vmselect/promql: an attempt to improve heuristics for dropping trailing data points in time series
Now trailing data points are additionally dropped for time series with a single raw sample

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/748
2020-10-17 10:44:34 +03:00
Aliaksandr Valialkin
01987f8c77 lib/storage: small code adjustments after d2960a20e0
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/781
2020-10-17 01:16:54 +03:00
faceair
d2960a20e0 evaluate the execution cost of all tag filters (#824)
* evaluate the execution cost of all tag filters

* fix suffixes typo
2020-10-17 00:46:55 +03:00
Aliaksandr Valialkin
d4f12e0fbb CHANGELOG.md: mention about improved openstack endpoint handling
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728
2020-10-16 23:06:33 +03:00
Nikolay Khramchikhin
e6ab69dd88 fixes openstack api endpoint with suffix trim adds openstack (#840)
api v2.0 check
2020-10-16 21:20:57 +03:00
Aliaksandr Valialkin
ed5f05024b deployment/docker: update Go builder from Go1.15.2 to Go1.15.3
This should fix potential issues related to Go runtime - see https://github.com/golang/go/issues?q=milestone%3AGo1.15.3+label%3ACherryPickApproved
2020-10-16 15:08:52 +03:00
Aliaksandr Valialkin
43aa737e23 vendor: make vendor-update 2020-10-16 15:06:27 +03:00
Aliaksandr Valialkin
46dccc1088 CHANGELOG.md: describe added optimization cases from 96cdfcba50 2020-10-16 12:59:42 +03:00
Aliaksandr Valialkin
96cdfcba50 vendor: update github.com/VictoriaMetrics/metricsql from v0.7.1 to v0.7.2
The new release of github.com/VictoriaMetrics/metricsql adds more optimizations for `foo{filters1} op bar{filters2}`:

* rollup_func(foo[d]) op bar{filters}
* transform_func(foo) op bar{filters}
* num_or_scalar op bar op baz{filters}
2020-10-16 12:53:36 +03:00
Aliaksandr Valialkin
09d60d64a9 docs: add a link to https://smarketshq.com/monitoring-kubernetes-clusters-41a4b24c19e3 article about VictoriaMetrics 2020-10-16 09:07:41 +03:00
Aliaksandr Valialkin
c37e5de66f docs/Single-server-VictoriaMetrics.md: update docs 2020-10-14 13:26:31 +03:00
Aliaksandr Valialkin
3b847d32d9 docs/CaseStudies.md: actualize numbers for Wix.com 2020-10-14 13:07:33 +03:00
Aliaksandr Valialkin
590d8d537f docs/vmalert.md: make docs-sync 2020-10-13 18:34:32 +03:00
Roman Khavronenko
bc42b5598f vmalert: update docs to highlight the state restore requirements; (#833)
Address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/830
2020-10-13 18:32:43 +03:00
Aliaksandr Valialkin
94978af9bc CHANGELOG.md: cut v1.44.0 release 2020-10-13 16:59:33 +03:00
Aliaksandr Valialkin
8e20bc7b53 docs/Cluster-VictoriaMetrics.md: clarify RAM requirements for vmstorage nodes 2020-10-13 16:47:51 +03:00
Aliaksandr Valialkin
a2b9476897 app/vmselect/promql: return a single time series at max from absent() function like Prometheus does 2020-10-13 15:56:04 +03:00
Aliaksandr Valialkin
9aa3b65766 app/vmselect/promql: improve time series staleness detection
This should prevent from double counting for time series at the time when it changes label.
The most common case is in K8S, which changes pod uid label with each new deployment.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/748
2020-10-13 12:19:57 +03:00
Aliaksandr Valialkin
d8af290947 app/vmselect/promql: fix mode_over_time calculations
Previously `mode_over_time` could return garbage due to improper shuffling of input data points.
2020-10-13 11:58:25 +03:00
Aliaksandr Valialkin
1e27420243 app/vmselect/prometheus: fix golangci-lint warning 2020-10-13 09:36:11 +03:00
Aliaksandr Valialkin
4f16a964e3 app/vmselect: add ability to export data in CSV format via /api/v1/export/csv 2020-10-12 20:08:17 +03:00
Aliaksandr Valialkin
4cc6574cea CHANGELOG.md: mention about added Docker Swarm service discovery 2020-10-12 16:17:58 +03:00
Aliaksandr Valialkin
63c4999e06 lib/promscrape: code prettifying after 9bd9f67718 2020-10-12 16:12:36 +03:00
Nikolay Khramchikhin
9bd9f67718 Adds dockerswarm sd (#818)
* adds dockerswarm service discovery

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/656

 Following roles supported: services, tasks and nodes.
 Basic, token and tls auth supported.
 Added tests for labels generation.

* added unix socket support to discovery utils

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-10-12 13:38:21 +03:00
Aliaksandr Valialkin
7f983d461a docs/MetricsQL.md: mention that VictoriaMetrics keeps metric names after applying functions which don't change time series meaning 2020-10-12 13:25:25 +03:00
Aliaksandr Valialkin
3bba6a2199 CHANGELOG.md: mention that VictoriaMetrics keeps metric names when applying functions which don't change time series meaning 2020-10-12 12:55:09 +03:00
Aliaksandr Valialkin
762c967855 app/vmselect/promql: keep metric name after applying more functions, which don't change time series meaning
Functions are:

* keep_last_value
* keep_next_value
* interpolate
* running_min
* running_max
* running_avg
* range_min
* range_max
* range_avg
* range_first
* range_last
* range_quantile
* smooth_exponential

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/674
2020-10-12 11:47:06 +03:00
Aliaksandr Valialkin
45f7cdc532 Revert "app/vmselect/promql: remove metric name after applying ceil, floor and round functions in order to be more consistent with Prometheus"
This reverts commit ac45082216.

Reason for revert: the previous behavior for VictoriaMetrics is easier to understand and use by users -
functions, which don't change the meaning of the time series shouldn't drop metric name.

Now the following functions do not drop metric names:

* ceil
* floor
* round

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/674
2020-10-12 11:40:34 +03:00
Aliaksandr Valialkin
a94825b169 Revert "app/vmselect/promql: remove metric name after applying clamp_min and clamp_max functions in order to be consistent with Prometheus"
This reverts commit bb61a4769b.

Reason for revert: the previous behavior for VictoriaMetrics is easier to understand and use by users -
functions, which don't change the meaning of the time series shouldn't drop metric name.

Now the following functions do not drop metric name:

* clamp_min
* clamp_max

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/674
2020-10-12 11:38:27 +03:00
Aliaksandr Valialkin
f7d28bddbf Revert "app/vmselect/promql: remove metric name from results of certain rollup functions in order to be consistent with Prometheus"
This reverts commit e5202a4eae.

Reason for revert: the previous behavior for VictoriaMetrics is easier to understand and use by users -
functions, which don't change the meaning of the time series shouldn't drop metric name.

Now the following functions do not drop metric name:

* max_over_time
* min_over_time
* avg_over_time
* quantile_over_time
* geomean_over_time
* mode_over_time
* holt_winters
* predict_linear

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/674
2020-10-12 11:35:18 +03:00
Aliaksandr Valialkin
2749a3c827 docs/Single-server-VictoriaMetrics.md: add missing whitespace 2020-10-09 20:56:26 +03:00
Aliaksandr Valialkin
b449607181 lib/backup: add MustStop() method for all remote filesystems 2020-10-09 15:32:19 +03:00
Aliaksandr Valialkin
cf5f2874cd lib/backup/fslocal: add FS.MustStop() method for stopping bandwidth limiter 2020-10-09 15:12:03 +03:00
Aliaksandr Valialkin
272d6976b3 CHANGELOG.md: update with recent changes 2020-10-09 14:22:05 +03:00
Aliaksandr Valialkin
68f0e00761 app/vmstorage: add vm_rows_added_to_storage_total metric, which shows the total number of rows added to storage since app start 2020-10-09 13:35:48 +03:00
Aliaksandr Valialkin
84227ea2fc app/{vminsert,vmagent}: take into account all the inserted rows before relabeling in vm_rows_inserted_total and vmagent_rows_inserted_total metrics 2020-10-09 13:29:51 +03:00
Aliaksandr Valialkin
f4e8687c88 app/vmalert: accept days, weeks and years in for: part of config like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/817
2020-10-08 20:13:15 +03:00
Aliaksandr Valialkin
561a7619a5 lib/promscrape: fix tests after 71ea4935de 2020-10-08 19:32:36 +03:00
Aliaksandr Valialkin
6105d61d11 docs/vmagent.md: clarify -promscrape.suppressDuplicateScrapeTargetErrors command-line flag usage 2020-10-08 19:24:31 +03:00
Aliaksandr Valialkin
12d2cf3a7a CHANGELOG.md: mention features from 71ea4935de 2020-10-08 19:13:54 +03:00
Aliaksandr Valialkin
71ea4935de lib/promscrape: add -promscrape.suppressDuplicateScrapeTargetErrors command-line flag in order to suppress duplicate scrape target errors
Show also original labels for duplicate targets in error message in order to simplify debugging the issue.

Now `/targets` endpoint accepts optional `show_original_labels=1` query arg, which shows original labels for each target.
This may simplify debugging for target relabeling.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/651
2020-10-08 18:58:30 +03:00
Aliaksandr Valialkin
9b0a5c1028 lib/backup/actions: improve logging to be more clear to humans 2020-10-08 14:23:07 +03:00
Aliaksandr Valialkin
d423d73251 app/vmalert: do not print description for all the flags on config errors
The description is too big to consume by human and it just distracts humans.
2020-10-08 13:35:57 +03:00
Aliaksandr Valialkin
d8546e972a vendor: make vendor-update 2020-10-08 11:52:01 +03:00
Aliaksandr Valialkin
c9fb217e4e vendor: update github.com/VictoriaMetrics/metricsql from v0.7.0 to v0.7.1 2020-10-08 11:46:51 +03:00
Aliaksandr Valialkin
bec85d5135 CHANGELOG.md: mentioned about the added optimization that adds missing filters to binary operands 2020-10-07 21:23:02 +03:00
Aliaksandr Valialkin
e9f2e2cbc9 app/vmselect/promql: add missing label filters to binary operands before query execution
This implements the optimization described at https://utcc.utoronto.ca/~cks/space/blog/sysadmin/PrometheusLabelNonOptimization

See also https://github.com/cortexproject/cortex/issues/3253
2020-10-07 21:15:09 +03:00
Aliaksandr Valialkin
5ef71974fe CHANGELOG.md: mention about -finalMergeDelay command-line flag 2020-10-07 18:52:41 +03:00
Dmitry Shihovtsev
92e5d89fc9 Fix typos in the vmalert datasource (#814)
* Fix typos in the vmalert datasource

* Fix typo in the vmalert datasource test
2020-10-07 17:59:50 +03:00
Artem Navoiev
8e6eb2cd6b update go action 2020-10-07 17:48:42 +03:00
Aliaksandr Valialkin
af90b3121c app/vmstorage: add -finalMergeDelay command-line flag for configuring the delay before final merge for per-month partitions after no new data is ingested to it 2020-10-07 17:35:44 +03:00
Aliaksandr Valialkin
e9d99021b0 docs/CaseStudies.md: actualize Wix numbers 2020-10-06 16:09:35 +03:00
Aliaksandr Valialkin
5aa269def6 CHANGELOG.md: add missing link to an issue about OpenStack service discovery - https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728 2020-10-06 15:37:36 +03:00
Aliaksandr Valialkin
d16dbfd639 app/vmagent: add a link to https://victoriametrics.github.io/vmagent.html from main page 2020-10-06 15:29:49 +03:00
Aliaksandr Valialkin
cfd720e772 app/victoria-metrics: add a link to https://victoriametrics.github.io/ from main page 2020-10-06 15:29:49 +03:00
Aliaksandr Valialkin
e10c484a8e docs/Articles.md: add https://medium.com/@VictoriaMetrics/anomaly-detection-in-victoriametrics-9528538786a7 2020-10-06 15:29:49 +03:00
Aliaksandr Valialkin
2a6fa53957 CHANGELOG.md: cut v1.43.0 release 2020-10-06 14:28:50 +03:00
Aliaksandr Valialkin
5a8553bfd2 CHANGELOG.md: add missing entries for upcoming release 2020-10-06 12:04:38 +03:00
Aliaksandr Valialkin
e19d400230 lib/protoparser/graphite: support parsing floating-point timestamp like Graphite does
Such timestamps are rounded to seconds like Carbon does.
See b0ba62a62d/lib/carbon/protocols.py (L197)
2020-10-06 11:38:29 +03:00
Aliaksandr Valialkin
90aa2a8ffd lib/promscrape/discovery/openstack: show expiration time for refreshed OpenStack token in seconds - this is easier to interpret by human 2020-10-06 11:34:09 +03:00
Aliaksandr Valialkin
cc08648699 vendor: make vendor-update 2020-10-05 23:21:41 +03:00
Aliaksandr Valialkin
129b07113e .github/workflows: switch Go version from v1.14 to v1.15 2020-10-05 22:00:51 +03:00
Aliaksandr Valialkin
aba899c298 lib/promscrape/discovery/openstack: code prettifying after cbe3cf683b
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728
2020-10-05 18:11:55 +03:00
Aliaksandr Valialkin
991fad7855 docs: make docs-sync after cbe3cf683b 2020-10-05 16:47:57 +03:00
Nikolay Khramchikhin
cbe3cf683b Adds openstack sd (#811)
* adds openstack service discovery

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728

 implemented hypervisors and instance discovery with openstack v3 api.
 Added tests for labeling and data parsing.
 Added token refresh.

* Apply suggestions from code review

* Apply suggestions from code review

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-10-05 16:45:33 +03:00
Aliaksandr Valialkin
f42194d817 lib/promrelabel: make a copy of label with new name for action: labelmap in the same way as Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/812
2020-10-05 16:19:19 +03:00
Aliaksandr Valialkin
bbeac0ba46 lib/protoparser/influx: add -influx.maxLineSize command-line flag for configuring the maximum size for a single Influx line during parsing
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/807
2020-10-05 15:19:05 +03:00
Aliaksandr Valialkin
47db9bb24a lib/decimal: add tests for negative values passed to maxUpExponent 2020-10-05 14:56:45 +03:00
Aliaksandr Valialkin
bc7d67cee2 lib/decimal: properly calibrate scale for blocks with Inf values
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/805
2020-10-05 14:52:44 +03:00
Aliaksandr Valialkin
59c26feefa app/vmselect/promql: fill gaps on graphs for range_* and running_* functions
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/806
2020-10-02 13:59:45 +03:00
Aliaksandr Valialkin
764dc2499f lib/storage: code cleanup after 10f2eedee0
Remove the code that uses metricIDs caches for the current and the previous hour during metricIDs search,
since this code became unused after implementing per-day inverted index almost a year ago.

While at it, fix a bug, which could prevent from finding time series with names containing dots (aka Graphite-like names
such as `foo.bar.baz`).
2020-10-01 19:06:23 +03:00
Aliaksandr Valialkin
10f2eedee0 lib/storage: improve cache effectiveness for time series ids matching the given filters
Previously the maximum cache lifetime has been limited by 10 seconds. Now it is extended up to a day.
This should reduce CPU usage in the following cases:

* when querying recently added data with small churn rate for time series
* when querying historical data
2020-10-01 14:38:25 +03:00
Aliaksandr Valialkin
d25dd7fdb6 docs: make docs-sync 2020-09-30 09:50:29 +03:00
Roman Khavronenko
daa2d1c065 vmalert: make maxIdleConnections configurable for datasource HTTP client (#797)
Address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/795
2020-09-30 09:49:45 +03:00
Aliaksandr Valialkin
a44e0c6153 vendor: make vendor-update 2020-09-30 08:59:20 +03:00
Aliaksandr Valialkin
a897cf2ec3 docs/Release-Guide.md: mention that CHANGELOG.md must be updated before release 2020-09-30 08:53:17 +03:00
Aliaksandr Valialkin
58465bb29b CHANGELOG.md: release v1.42.0 2020-09-30 08:45:31 +03:00
Aliaksandr Valialkin
e59de98384 CHANGELOG.md: initial commit
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/788
2020-09-30 00:12:32 +03:00
Aliaksandr Valialkin
bec9b31b81 lib/storage: allow set values higher than 1 for vm_merge_need_free_disk_space if there are multiple partitions with deferred merges due to disk space shortage 2020-09-29 22:51:43 +03:00
Aliaksandr Valialkin
44bcda81ab app/vmstorage: rename vm_{big|small}_merge_need_free_disk_space to vm_merge_need_free_disk_space
This simplifies alerting.
2020-09-29 22:44:19 +03:00
Aliaksandr Valialkin
a9db81c4ab app/vmstorage: add metrics for determining whether background merges need additional disk space to complete
These metrics are:

* vm_small_merge_need_free_disk_space
* vm_big_merge_need_free_disk_space

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/686
2020-09-29 21:48:33 +03:00
Aliaksandr Valialkin
dbf9402329 docs/Single-server-VictoriaMetrics.md: typo fix 2020-09-29 20:29:46 +03:00
Aliaksandr Valialkin
1137bdec66 docs/Single-server-VictoriaMetrics.md: typo fix: compations -> compactions 2020-09-29 20:27:05 +03:00
Aliaksandr Valialkin
127537d631 app/vmagent/remotewrite: do not show -remoteWrite.url in logs if -remoteWrite.showURL isn't set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/773
2020-09-29 19:49:12 +03:00
Aliaksandr Valialkin
f7636b0342 app/vmselect/graphite: do not substitute path and path. with path.. in /metrics/find/?format=completer output 2020-09-29 18:03:26 +03:00
Aliaksandr Valialkin
76b244cfcf lib/cgroup: do not adjust the number of detected CPU cores via /sys/devices/system/cpu/online
The adjustment increases the resulting GOMAXPROCS by 1, which looks confusing to users
as outlined at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685#issuecomment-698595309
2020-09-29 13:55:26 +03:00
Aliaksandr Valialkin
7dc67cd883 docs/{vmbackup,vmrestore}: formatting fixes 2020-09-29 13:19:07 +03:00
Aliaksandr Valialkin
efdefbc1cb docs/vmbackup.md: make docs about minio config more prominent 2020-09-29 13:16:04 +03:00
Aliaksandr Valialkin
1659135752 lib/storage: fix tests for 32-bit arches such as GOARCH=386 and GOARCH=arm 2020-09-29 13:10:22 +03:00
Aliaksandr Valialkin
9945b8c98d docs: improve readability a bit 2020-09-29 13:03:38 +03:00
Nikolay Khramchikhin
1e679f3e0d update vmbackup/vmrestore README usage (#794)
* update vmbackup/vmrestore README usage

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/381

with minio and configuration file examples.

* Apply suggestions from code review

Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>

* added backup/restore docs changes

* added example for relabelConfig flag

* Apply suggestions from code review

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>
2020-09-29 12:53:10 +03:00
Aliaksandr Valialkin
38789e4aa0 lib/storage: fix 32-bit builds for GOARCH=386 or GOARCH=arm 2020-09-29 12:40:35 +03:00
Aliaksandr Valialkin
19c0b6f3ef lib/protoparser/prometheus: sort rows before comparing them in TestParseStream, since the order for callback calls is non-deterministic 2020-09-29 12:30:04 +03:00
Aliaksandr Valialkin
7cde336b33 lib/protoparser/prometheus: fix TestParseStream after 124f78857b 2020-09-29 12:11:17 +03:00
Aliaksandr Valialkin
96ee276e6e app/vmselect/prometheus: check for errors returned from bufferedwriter.Write
This makes `make errcheck` happy
2020-09-29 11:37:01 +03:00
Aliaksandr Valialkin
6fdfc67620 app/vmselect/graphite: properly handle case when /metrics/find finds both leaf and node for the given query=prefix.*
In this case only node must be returned with stripped dot in the end of id as carbonapi does
2020-09-29 11:01:59 +03:00
Aliaksandr Valialkin
165c9c6371 .github/workflows: verify builds for vmagent, vmalert, vmbackup and vmrestore 2020-09-29 00:49:20 +03:00
Aliaksandr Valialkin
41f24cdb64 .github/workflows: verify that VictoriaMetrics can be built for GOOS=openbsd 2020-09-29 00:44:44 +03:00
Aliaksandr Valialkin
7673839228 lib/{fs,filestream}: small consistency-related updates after cc90a548b1 2020-09-29 00:42:43 +03:00
Nikolay Khramchikhin
cc90a548b1 added openbsd implementations (#790)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/785

removed fadvise for openbsd, added freespace implementation for openbsd
2020-09-29 00:29:04 +03:00
Aliaksandr Valialkin
8d5df13c7c vendor: make vendor-update 2020-09-28 21:59:58 +03:00
Aliaksandr Valialkin
7500146321 lib/protoparser: avoid copying of buffer read from the network to unmarshal buffer 2020-09-28 17:19:16 +03:00
Aliaksandr Valialkin
124f78857b app/{vminsert,vmagent}: improve data ingestion speed over a single connection
Process data obtained from a single connection on all the available CPU cores.
2020-09-28 04:13:08 +03:00
Aliaksandr Valialkin
978c6b4ba9 docs/Cluster-VictoriaMetrics.md: sync with cluster branch 2020-09-28 02:07:55 +03:00
Aliaksandr Valialkin
5cdad60a6f lib/protoparser: use 64KB read buffer instead of default 4KB buffer provided by net/http.Server
This should reduce syscall overhead when reading big amounts of data
2020-09-28 02:07:10 +03:00
Aliaksandr Valialkin
1b3efccb24 app/vmselect: stop /api/v1/export/* execution if client disconnects 2020-09-27 23:53:13 +03:00
Aliaksandr Valialkin
95688cbfc5 all: add native format for data export and import
The data can be exported via [/api/v1/export/native](https://victoriametrics.github.io/#how-to-export-data-in-native-format) handler
and imported via [/api/v1/import/native](https://victoriametrics.github.io/#how-to-import-data-in-native-format) handler.
2020-09-27 19:54:07 +03:00
Aliaksandr Valialkin
b4bf722d8f lib/protoparser: use all the available CPU cores for processing ingested data from a single /api/v1/import stream
Previously a single data ingestion stream to /api/v1/import could load only a single CPU core.
2020-09-26 04:21:32 +03:00
Aliaksandr Valialkin
c00627c103 app/vminsert: code prettifying 2020-09-26 04:13:18 +03:00
Aliaksandr Valialkin
b6a976b98d app/vmagent: reduce memory usage when importing data via /api/v1/import
Previously vmagent could use big amounts of RAM when each ingested JSON line
contained many samples.
2020-09-26 04:10:24 +03:00
Aliaksandr Valialkin
82973f8ae7 Revert "lib/storage: remove unused fetchData arg from BlockRef.MustReadBlock"
This reverts commit bab6a15ae0.

Reason for revert: the `fetchData` arg is used in cluster branch.
Leaving this arg in master branch makes smaller the diff with cluster branch.
2020-09-24 22:44:23 +03:00
Aliaksandr Valialkin
bab6a15ae0 lib/storage: remove unused fetchData arg from BlockRef.MustReadBlock
This arg became unused after 23bdc1f107
2020-09-24 20:48:40 +03:00
Aliaksandr Valialkin
23bdc1f107 app/vmselect/netstorage: do not spend CPU time on unpacking empty blocks during /api/v1/series calls 2020-09-24 20:18:10 +03:00
Aliaksandr Valialkin
24ca30bf66 lib/storage: correctly use maxBlockSize in various checks
Previously `maxBlockSize` has been multiplied by 8 in certain checks. This is unnecessary.
2020-09-24 18:12:56 +03:00
Aliaksandr Valialkin
c584aece38 app/vmselect/promql: properly limit implicitly set rollup window to -search.maxStalenessInterval
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/784
2020-09-23 23:23:59 +03:00
Aliaksandr Valialkin
2985077c35 all: consistently use "%w" formatting in fmt.Errorf for wrapped errors 2020-09-23 22:46:34 +03:00
Aliaksandr Valialkin
30c7269814 vendor: make vendor-update 2020-09-23 14:23:39 +03:00
Aliaksandr Valialkin
27500d7d4c app/vmselect/prometheus: code cleanup after 3ba507000c 2020-09-23 13:04:17 +03:00
Aliaksandr Valialkin
3ba507000c app/vmselect/prometheus: return timestamps from /api/v1/query, which match the time query arg
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/720
2020-09-23 12:58:48 +03:00
Aliaksandr Valialkin
c5ef0e6327 lib/persistentqueue: protect from multiple concurrent opening for the same persistent queue 2020-09-23 02:17:47 +03:00
Aliaksandr Valialkin
bed25e3c24 app/vmselect/netstorage: properly pre-allocate space for sbs 2020-09-22 23:49:55 +03:00
Aliaksandr Valialkin
5c42965853 lib/cgroup: attempt to obtain available CPU cores via /sys/devices/system/cpu/online
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685#issuecomment-674423728
2020-09-22 23:27:19 +03:00
Aliaksandr Valialkin
09b0f7c202 app/vmselect/netstorage: release search resources on timeout errors
Previously these resources weren't released, which could lead to resource leaks.
2020-09-22 22:57:38 +03:00
Aliaksandr Valialkin
36eb5427eb vendor: make vendor-update 2020-09-22 17:07:37 +03:00
Aliaksandr Valialkin
31ce0e29cd docs/Single-server-VictoriaMetrics.md: VictoriaMetrics properly stores Inf values after 26115891db 2020-09-22 17:02:39 +03:00
Aliaksandr Valialkin
3b1e3a03e0 app/vmselect: make sure the request doesn't wait in pending queue more than the configured timeout
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2020-09-22 01:23:19 +03:00
Aliaksandr Valialkin
a69234ed18 lib/storage: code prettifying after be5e1222f3
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/781
2020-09-22 00:36:45 +03:00
faceair
be5e1222f3 add filter to getMetricIDs (#783)
* add getMetricIDs filter

* check nil filter before use
2020-09-22 00:33:43 +03:00
Aliaksandr Valialkin
94f7d00537 docs/vmagent.md: typo fix 2020-09-21 21:49:22 +03:00
Aliaksandr Valialkin
f6f5c4118c docs: make docs-sync 2020-09-21 21:47:47 +03:00
Aliaksandr Valialkin
00b5145c69 app/vmselect/searchutils: fixed tests after 2eb72e09ab 2020-09-21 21:31:38 +03:00
Aliaksandr Valialkin
2eb72e09ab app/vmselect: use time value rounded to seconds if it isn't passed to /api/v1/query
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/720
2020-09-21 21:24:40 +03:00
Aliaksandr Valialkin
29108cc53e lib/logger: add -loggerDisableTimestamps command-line flag for disabling timestamps in logs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/778
2020-09-21 19:28:04 +03:00
Aliaksandr Valialkin
964bc7595c lib/promscrape/discovery/ec2: code prettifying after 312fead9a2
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/771
2020-09-21 18:43:34 +03:00
Nikolay Khramchikhin
312fead9a2 Add improvements to ec2_sd_discovery (#775)
* Add improvements to ec2 discovery

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/771

 role_arn support with aws sts
 instance iam_role support
 refreshing temporary tokens

* Apply suggestions from code review

Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>

* changed implementation, removed tests, clean up code

* moved endpoint builder into getEC2APIResponse

Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>
2020-09-21 16:04:15 +03:00
Aliaksandr Valialkin
1e1a27d803 app/vmalert: remove unneeded UTC() call
UTC() doesn't change the underlying timestamp, so the call isn't needed here
2020-09-21 15:55:59 +03:00
Aliaksandr Valialkin
9739283dad lib/storage: reduce CPU load for idle VictoriaMetrics by reducing the frequency for the need for background merges 2020-09-21 15:54:11 +03:00
Roman Khavronenko
5dffc7a553 vmalert: add support for datasource.lookback flag (#779)
New datasource flag `datasource.lookback` defines how far to look into
past when evaluating queries.

Address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/668
2020-09-21 15:53:49 +03:00
Roman Khavronenko
82c3bbce34 vmalert: fix the typo in error message (#782)
The error will be always nil so no sense in printing it.
2020-09-21 11:34:23 +03:00
Aliaksandr Valialkin
3e8569f456 lib/decimal: optimize maxUpExponent() by eliminating division from hot path 2020-09-19 13:50:09 +03:00
Aliaksandr Valialkin
f00e0e0103 lib/persistentqueue: sync data to file inside filestream.Writer.MustFlush 2020-09-19 12:51:41 +03:00
Aliaksandr Valialkin
26115891db lib/decimal: properly store Inf values
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/752
2020-09-18 19:07:07 +03:00
Aliaksandr Valialkin
d50165ad59 app/vmagent: increase default value for -remoteWrite.queues from 1 to 4, since it turned out that many users hit this limit 2020-09-18 14:21:54 +03:00
Aliaksandr Valialkin
63d3c88c3b vendor: update github.com/valyala/quicktemplate from v1.6.2 to v1.6.3 2020-09-18 13:10:48 +03:00
Aliaksandr Valialkin
1a9ee39b0e lib/promscrape: avoid copying response body when scraping targets.
This should reduce memory usage when scraping targets with millions of metrics.
2020-09-18 13:05:43 +03:00
Aliaksandr Valialkin
70c721c01b lib/persistentqueue: flush data to disk every second
Previously small amounts of data may be left unflushed for extended periods of time if vmagent collects small amounts of data.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/687
2020-09-18 13:05:40 +03:00
Aliaksandr Valialkin
74e3198281 vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.5 to v1.0.7 2020-09-18 12:20:29 +03:00
Aliaksandr Valialkin
98d1cd0971 app/vmselect/graphite: return proper results /metrics/find?query=foo.*.bar according to Graphite Metrics API 2020-09-18 11:00:00 +03:00
Aliaksandr Valialkin
7a134b0fd7 app/vmstorage: added -forceMergeAuthKey command-line flag for protecting /internal/force_merge endpoint 2020-09-17 14:21:53 +03:00
Aliaksandr Valialkin
1f33dd717f lib/storage: add /internal/force_merge handler for running forced compactions on historical per-month partitions
This may be useful for freeing up storage space after time series deletion.

See https://victoriametrics.github.io/#force-merge for more details.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/686
2020-09-17 12:20:40 +03:00
Aliaksandr Valialkin
8beb0da6ad lib/{mergeset,storage}: compare errors with errors.Is() 2020-09-17 03:03:02 +03:00
Aliaksandr Valialkin
067d7c1ea1 lib/{mergeset,storage}: code prettifying 2020-09-17 02:06:31 +03:00
Aliaksandr Valialkin
020bd8685e lib/storage: removed duplicate checks for empty parts during merge - another check is in the beginning of mergeParts functions 2020-09-17 01:49:03 +03:00
Aliaksandr Valialkin
f2a449983d vendor: make vendor-update 2020-09-17 01:43:19 +03:00
Aliaksandr Valialkin
8674963f6a docs/Single-server-VictoriaMetrics.md: document that /api/v1/series/count may count deleted time series
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/770
2020-09-17 01:38:17 +03:00
Aliaksandr Valialkin
ab53cb6f7b app/vmagent: substitute -remoteWrite.url with secret-url value in logs, since it may contain sensitive info such as passwords or auth tokens
Pass `-remoteWrite.showURL` command-line flag in order to see real `-remoteWrite.url` values in logs and at `/metrics` page.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/773
2020-09-16 22:36:25 +03:00
Aliaksandr Valialkin
9f79bcf64a app/vmselect: improve description for -search.maxQueryDuration 2020-09-16 21:15:41 +03:00
Aliaksandr Valialkin
39dee12ed7 lib/persistentqueue: code simplification after d455764a6f 2020-09-16 21:14:19 +03:00
Aliaksandr Valialkin
d455764a6f lib/persistentqueue: make the persistent queue more durable against unclean shutdown (kill -9, OOM, hard reset)
The strategy is:

- Periodical flushing of inmemory blocks to files, so they aren't lost on unclean shutdown.
- Periodical syncing of metadata for persisted queues, so the metadata remains in sync with the persisted data.
- Automatic adjusting of too big chunk size when opening the queue. The chunk size may be bigger than the writer offset after unclean shutdown.
- Skipping of broken chunk file if it cannot be read.
- Fsyncing finalized chunk files.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/687
2020-09-16 18:13:44 +03:00
Aliaksandr Valialkin
ffadf035fa lib/protoparser/vmimport: add more testcases for invalid timestamps and values
Updates https://github.com/VictoriaMetrics/vmctl/issues/25
2020-09-16 02:22:06 +03:00
Aliaksandr Valialkin
d8183c3124 lib/protoparser: report more errors for incorrect timestamps and/or values
Previously certain errors in timestamps and/or values could be silently skipped,
which could lead to samples with zero values stored in the database.

Updates https://github.com/VictoriaMetrics/vmctl/issues/25
2020-09-16 02:14:18 +03:00
Aliaksandr Valialkin
9bc8484ab6 lib/protoparser/graphite: return error when value or timestamp cannot be properly parsed
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/99
2020-09-16 01:35:12 +03:00
Aliaksandr Valialkin
26fa94ba8d vendor: update github.com/valyala/fastjson from v1.5.4 to v1.5.5
This should properly parse `+Inf` values when importing JSON lines via `/api/v1/import`

Updates https://github.com/VictoriaMetrics/vmctl/issues/25
2020-09-16 00:07:56 +03:00
Aliaksandr Valialkin
0bccb58e80 docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics ignores NaN and Inf values during data ingestion
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/752
2020-09-15 23:40:28 +03:00
Aliaksandr Valialkin
1fec47a289 app/vmselect/netstorage: reduce memory usage when the time range from query touches big number of samples per each time series 2020-09-15 21:08:28 +03:00
Aliaksandr Valialkin
8c3d7c1a59 app/vmselect: typo fix in -search.maxStalenessInterval description 2020-09-15 14:24:27 +03:00
Aliaksandr Valialkin
fa01169c3d lib/promscrape: add a link to troubleshooting docs to error message when duplicate scrape target with identical labels is skipped 2020-09-15 14:16:05 +03:00
Aliaksandr Valialkin
51598bd718 docs/Articles.md: add a link to https://medium.com/miro-engineering/prometheus-high-availability-and-fault-tolerance-strategy-long-term-storage-with-victoriametrics-82f6f3f0409e 2020-09-15 12:29:10 +03:00
Aliaksandr Valialkin
ba74d0c14c lib/promscrape: typo fix 2020-09-12 00:14:21 +03:00
Aliaksandr Valialkin
7d893a234c lib/promscrape: do not reset the remaining rows when pushing a part of data to remote storage during big scrapes
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/753

Thanks to @PerGon and @clmssz for help with debugging.
2020-09-11 23:39:13 +03:00
Aliaksandr Valialkin
0e533d1a9c app/vmselect/promql: support composite durations like Prometheus 2.21 does
The following durations are supported now: `1h5m35s` or `1s543ms`

See https://github.com/prometheus/prometheus/releases/tag/v2.21.0
and https://github.com/prometheus/prometheus/pull/7713
2020-09-11 23:39:13 +03:00
Aliaksandr Valialkin
0e19f35af5 lib/promscrape/discovery/dns: add __meta_dns_srv_record_target and __meta_dns_srv_record_port labels
This syncs dns service discovery with Prometheus 2.21 - see https://github.com/prometheus/prometheus/releases
and https://github.com/prometheus/prometheus/pull/7678 .
2020-09-11 23:39:13 +03:00
Roman Khavronenko
6ad6480400 vmalert: add Group name as label to generated alerts and timeseries (#761)
Solves #611
2020-09-11 20:52:56 +01:00
Roman Khavronenko
4cdffb04a4 vmalert: update groups on config reload only if changes detected (#759)
On config reload event `vmalert` reloads configuration for every group. While
it works for simple configurations, the more complex and heavy installations may
suffer from frequent config reloads.
The change introduces the `checksum` field for every group, which is set to the md5 hash
of its yaml configuration. The checksum changes on any change to the group
definition, such as rules order or annotation change. Comparing the `checksum` field
on config reload event helps to detect if group should be updated.
The groups update is now done concurrently, so reload duration will be limited by
the slowest group now.

Partially solves #691 by improving config reload speed.
2020-09-11 20:14:30 +01:00
Aliaksandr Valialkin
ca856284e4 app/vmagent: allow setting multiple identical -remoteWrite.url values
This may be useful when each url is authenticated via different `-remoteWrite.basicAuth.username`.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/755
2020-09-11 15:17:22 +03:00
Aliaksandr Valialkin
62fde80490 lib/protoparser/common: do not read request body when parsing timestamp query arg
This was preventing from reading data via /api/v1/prometheus/import .

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/750
2020-09-11 14:44:58 +03:00
Aliaksandr Valialkin
5a90a92378 lib/storage: do not store inf values, since they may lead to significant precision loss for previously stored values
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/752
2020-09-11 14:44:53 +03:00
Aliaksandr Valialkin
a2f647d142 app/vmselect/prometheus: typo fix in the description for -search.latencyOffset command-line flag 2020-09-11 14:16:46 +03:00
Aliaksandr Valialkin
f95eea60d1 lib/protoparser: accept timestamp in milliseconds instead of seconds at /api/v1/import/prometheus
This improves consistency with timestamps in Prometheus text exposition format
2020-09-11 14:04:46 +03:00
Aliaksandr Valialkin
2380e9b017 app/{vminsert,vmagent}: allow passing timestamp via timestamp query arg when ingesting data to /api/v1/import/prometheus
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/750
2020-09-11 13:27:14 +03:00
Aliaksandr Valialkin
f0005c3007 app/vmselect: move Deadline from netstorage to searchutils
This removes dependency on netstorage from searchutils.
2020-09-11 13:27:13 +03:00
Aliaksandr Valialkin
2114179e19 app/vmselect: substitute inf values at smooth_exponential with the previous values
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/757
2020-09-11 12:24:14 +03:00
Nikolay Khramchikhin
6c80ae0da8 Added endpointslices discovery to k8s api (#760)
This is similar to https://github.com/prometheus/prometheus/pull/6838 , which will be added in Prometheus v2.21.
See https://github.com/prometheus/prometheus/releases/tag/v2.21.0-rc.1

* Added endpointslices discovery to k8s api

Starting from k8s version 1.17 endpointslices is in beta;
it allows querying the k8s api for endpoints more efficiently.
It is exposed in scrape_config.yaml as a separate role for kubernetes_sd_config.
kubernetes_sd_config:
- role: endpointslices

* fixed typos, changed EndpointConditions signature - with values instead of pointers
2020-09-11 12:16:45 +03:00
Aliaksandr Valialkin
204ec415b4 app/vmselect: skip infinite values when calculating smooth_exponential
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/757
2020-09-11 11:29:58 +03:00
Aliaksandr Valialkin
8a8b5a73d3 app/vmselect/graphite: typo fix in label name for vm_request_duration_seconds metric 2020-09-11 01:58:28 +03:00
John Belmonte
c9d0905b17 fix typo on outliersk() doc (#758) 2020-09-11 00:55:53 +03:00
Aliaksandr Valialkin
f6bc608e86 app/vmselect: initial implementation of Graphite Metrics API
See https://graphite-api.readthedocs.io/en/latest/api.html#the-metrics-api
2020-09-11 00:30:01 +03:00
Aliaksandr Valialkin
3eccecd5fd vendor: make vendor-update 2020-09-10 09:49:13 +03:00
Aliaksandr Valialkin
b3dcaf0cd7 deployment/docker: update Go builder from v1.15.1 to v1.15.2
This fixes the following issues in Go runtime - see https://github.com/golang/go/issues?q=milestone%3AGo1.15.2+label%3ACherryPickApproved
2020-09-10 09:36:43 +03:00
Aliaksandr Valialkin
9d8fdff6c5 lib/storage: reuse timestamp blocks for adjacent metric blocks with identical timestamps
This should reduce disk space usage when scraping targets containing metrics with identical names
such as `node_cpu_seconds_total`, histograms, quantiles, etc.

Expose `vm_timestamps_blocks_merged_total` and `vm_timestamps_bytes_saved_total` metrics for monitoring
the effectiveness of timestamp blocks merging.
2020-09-09 23:59:32 +03:00
Aliaksandr Valialkin
d7c04db1fc docs: sync docs for vmalert, vmauth, vmbackup and vmrestore 2020-09-09 21:10:34 +03:00
Aliaksandr Valialkin
e5ed8c8d75 docs/Articles.md: add links to recently published third-party articles and talks about VictoriaMetrics 2020-09-09 20:15:27 +03:00
Aliaksandr Valialkin
9d431a4b45 docs/Single-server-VictoriaMetrics.md: typo fix 2020-09-09 01:21:45 +03:00
Aliaksandr Valialkin
4739dff6f0 docs/Single-server-VictoriaMetrics.md: typo fix 2020-09-09 00:59:37 +03:00
Aliaksandr Valialkin
11eaa37111 docs/vmagent.md: clarified the case when -remoteWrite.queues must be tuned
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/745
2020-09-08 20:15:27 +03:00
Aliaksandr Valialkin
df169b1ebd lib/httpserver: add a jitter to connection timeouts in order to protect from Thundering herd problem 2020-09-08 19:55:09 +03:00
Aliaksandr Valialkin
9d61d24142 vendor: make vendor-update 2020-09-08 15:20:01 +03:00
Aliaksandr Valialkin
62919eaf7e app/vmselect/promql: go fmt 2020-09-08 15:19:59 +03:00
Aliaksandr Valialkin
e6da63dffe app/vmselect/promql: adjust integrate() calculations to be more similar to calculations from InfluxDB: attempt #2
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/701
2020-09-08 14:35:50 +03:00
Aliaksandr Valialkin
8e85b56737 app/vmselect/promql: adjust integrate() calculations to be more similar to calculations from InfluxDB
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/701
2020-09-08 14:23:39 +03:00
Aliaksandr Valialkin
c0343a661b app/vmselect/promql: increase floating point calculations accuracy by dividing by 1e3 instead of multiplying by 1e-3 2020-09-08 14:00:47 +03:00
Aliaksandr Valialkin
1bca6160a3 docs/Single-server-VictoriaMetrics.md: make docs-sync 2020-09-07 21:58:06 +03:00
John Belmonte
ccfb7c5e29 revise /api/v1/series docs (#746)
* revise /api/v1/series docs

Further clarification for #735

  * clarify how default range differs from Prometheus API
  * avoid `start=0` suggestion when confirming delete, because it will cause a timeout in most deployments

* Update README.md
2020-09-07 21:57:34 +03:00
Nikolay Khramchikhin
8d71a60a76 Changed s3 configProfile flag default, (#749)
aws sdk has complicated logic for choosing profile name and we shouldn't set
it to `default` value. It leads to bugs and improper configuration.
Set it to empty value by default is safe. It will be automatically set to `default` by sdk.
2020-09-07 21:53:24 +03:00
Aliaksandr Valialkin
eb33a48b9b docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-09-04 03:30:05 +03:00
John Belmonte
cd7426be6e document minScrapeInterval semantics (#744)
* document `minScrapeInterval` semantics

Fixes #714.

* Update README.md

revise wording
2020-09-04 03:29:26 +03:00
Aliaksandr Valialkin
a5621b9c46 docs/Single-server-VictoriaMetrics.md: updates according to review comments at fe98ba5a60 2020-09-04 02:57:02 +03:00
Aliaksandr Valialkin
be6ae4b5e7 lib/memory: fall back to reading hierarchical memory limit in cgroups when the default limit isn't set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/699
2020-09-04 00:05:05 +03:00
Aliaksandr Valialkin
d387da142e lib/httpserver: add -http.connTimeout command-line flag for limiting the lifetime for incoming http connections
This can be useful for balancing incoming connections among multiple services.
2020-09-03 22:23:29 +03:00
Aliaksandr Valialkin
e1c2757f70 vendor: update github.com/VictoriaMetrics/metricsql from v0.4.3 to v0.5.1
The new version of the package supports binary operations on string literals:

    * "foo" + "bar"     => "foobar"
    * "foo" == "bar"    => NaN
    * "foo" == "foo"    => 1
    * "foo" >bool "bar" => 1
    * "foo" < "bar"     => NaN

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/717
2020-09-03 16:33:31 +03:00
Aliaksandr Valialkin
f4e7e5fb90 app/vmselect/promql: add count_le_over_time(m[d], le) and count_gt_over_time(m[d], gt) functions
These functions return the number of raw samples that don't exceed `le` or are bigger than `gt`.
These functions complement the already existing `share_le_over_time(m[d], le)` and `share_gt_over_time(m[d], gt)`.
2020-09-03 15:29:10 +03:00
Aliaksandr Valialkin
d5b985f086 vendor: update github.com/VictoriaMetrics/metricsql from v0.4.1 to v0.4.2
The new version of this package properly supports escaped identifiers.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/743
2020-09-03 15:01:42 +03:00
Aliaksandr Valialkin
e706e59d49 app/vmselect: unconditionally align time range boundaries to step for subqueries as Prometheus does 2020-09-03 13:29:50 +03:00
Aliaksandr Valialkin
fe98ba5a60 docs/Single-server-VictoriaMetrics.md: mention that /api/v1/series returns series for the last 5 minutes if start query arg is missing
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/735
2020-09-03 12:38:29 +03:00
Aliaksandr Valialkin
ddabc13796 app/vmagent: properly flush big blocks of data
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/741

Thanks to @IceRain00 for the investigation and initial attempt to fix the issue
at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/742
2020-09-03 12:12:39 +03:00
Aliaksandr Valialkin
7a839b461f app/vmagent: fix data race when accessing writeRequest.lastFlushTime 2020-09-03 12:12:37 +03:00
Nikolay Khramchikhin
764b3d4fda changed vmalert behaviour (#738)
* VMAlert start with empty rules dir

There are some applications (operator for instance), that generates alerts configuration at runtime
and vmalert must start correctly without rules to support this behaviour.
Later application will add rules files and send SIGHUP to vmalert,
which will trigger reading rules files and start rules execution.

Removing rules files with SIGHUP signal must stop rules execution and
vmalert will wait for new rules.

* imports sorted

* added test cases for empty rules, removed blank line

* fixed imports conflict

* updated tests
2020-09-03 11:04:42 +03:00
Aliaksandr Valialkin
b4afc6ee2f docs/Single-server-VictoriaMetrics.md: add missing link to Prometheus text exposition format 2020-09-03 01:10:11 +03:00
Aliaksandr Valialkin
5f16ceb294 app/vmalert: improvements over 3f932c2db1 2020-09-03 01:00:55 +03:00
DexterZhang
3f932c2db1 feat: spread load of rule evaluation by group when starting new groups (#724)
* feat: spread load of rule evaluation by group when starting new groups

* review: reduce the resulting diff.

* Update app/vmalert/group.go

Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>
2020-09-03 00:58:54 +03:00
Aliaksandr Valialkin
f41b36bb9a app/{vminsert,vmagent}: allow adding extra labels when importing data via Prometheus, CSV and JSON line formats
Extra labels may be added to the imported data by passing `extra_label=name=value` query args.
Multiple query args may be passed in order to add multiple extra labels.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/719
2020-09-02 19:43:21 +03:00
Aliaksandr Valialkin
038358b777 lib/promscrape: use the number of parsed rows as a basis for writeRequestCtxPool leveling
The previous basis on `cap(sw.labels)` doesn't work anymore after 7785869ccc ,
because `sw.labels` may be reset multiple times when processing big number of rows.
2020-09-02 18:46:01 +03:00
Roman Khavronenko
ed899ca9e8 Single dashboards update (#736)
* dashboard: rename var `datasource` to `ds` for consistency reason

Dasbhoards for cluster version or vmagent operate with datasource variable
named `ds`. For consistency sake we rename this variable in single node version
as well.

* dashboard: add instance variable picker

See dashboard reviews here https://grafana.com/grafana/dashboards/10229/reviews

* dashboard: limit number of buckets in histogram to 12 for vmagent dashboard

* dashboard: bump version requirement in description for single version

* dashboard: drop extra series override for single version

* dashboard: set Y-min to zero for most of panels in vmagent dashboard
2020-09-02 15:16:40 +03:00
Aliaksandr Valialkin
e9196655dd deployment/docker: update Go builder from v1.15.0 to v1.15.1 2020-09-02 15:10:15 +03:00
Aliaksandr Valialkin
821df709d3 vendor: make vendor-update 2020-09-02 15:05:16 +03:00
John Belmonte
67277abecf use Y-min 0 on Grafana dashboard graphs (#732) 2020-09-01 19:56:56 +01:00
Aliaksandr Valialkin
c2ff8de456 lib/httpserver: add -http.idleConnTimeout command-line flag for tuning the timeout for incoming idle http connections 2020-09-01 15:33:24 +03:00
Aliaksandr Valialkin
b059f194e4 lib/promscrape: fix applying sample_limit when scraping targets with big number of metrics
This has been broken at 7785869ccc
2020-09-01 11:08:13 +03:00
Aliaksandr Valialkin
7785869ccc lib/promscrape: reduce memory usage when scraping targets with millions of metrics
This should help when scraping /federate endpoints from Prometheus instances,
which scrape millions of metrics. See https://prometheus.io/docs/prometheus/latest/federation/
2020-09-01 10:57:07 +03:00
Aliaksandr Valialkin
5af777469a app/vmagent: log unsuccessful attempt number when sending data to -remoteWrite.url 2020-08-30 21:40:22 +03:00
Aliaksandr Valialkin
2149733bd2 app/vmagent: apply sane limits to -remoteWrite.queues
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/707
2020-08-30 21:25:37 +03:00
Aliaksandr Valialkin
dd20784d06 docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics accepts relative times at time, start and end query args 2020-08-28 10:13:16 +03:00
Aliaksandr Valialkin
de6970e828 docs/vmalert.md: sync with app/vmalert/README.md via make docs-update 2020-08-28 09:51:48 +03:00
Aliaksandr Valialkin
4a415620d3 docs/Articles.md: add a link to https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-filtering-and-modifying-time-series-6d40cea4bf21 2020-08-28 09:51:26 +03:00
Aliaksandr Valialkin
acbcad1ece lib/{promscrape,leveledbytebufferpool}: rename getPoolIdAndCapacity to getPoolIDAndCapacity in order to make golint happy 2020-08-28 09:49:32 +03:00
Aliaksandr Valialkin
f4c4ab811b lib/cgroup: limit the maximum GOMAXPROCS value to the number of available CPU cores
There is no sense in setting GOMAXPROCS to value higher than the number of available CPU cores.
2020-08-28 09:49:32 +03:00
Roman Khavronenko
10601bc652 vmalert: update -rule flag description to enforce quotes using (#709)
Description for `-rule` flag uses as example specific chars like asterisks
which could be interpreted wrong by different shells. To avoid this, description
now contains quoted flag values.

See also #708
2020-08-20 22:36:38 +01:00
Roman Khavronenko
f2c004d1ae lib/flagutil: avoid int overflow for arch 386 (#710)
Arch 386 is a 32-bit architecture and interprets int type for numbers as an explicit int32,
whereas on most modern CPUs int is implicitly an int64. This makes tests to fail with
`int overflow` error.
2020-08-20 22:27:37 +01:00
Aliaksandr Valialkin
efc730863b lib/promscrape: reduce memory usage when scraping targets with big number of metrics alongside targets with small number of labels
Previously targets with big number of metrics and/or labels could generate too big buffers,
which then could be re-used when scraping targets with small number of metrics.
This resulted in memory waste.

Now big buffers are used only for targets with big number of metrics / labels,
while small buffers are used for targets with small number of metrics / labels.
2020-08-16 22:29:51 +03:00
Aliaksandr Valialkin
d6967319b6 lib/leveledbytebufferpool: allocate byte buffers with capacity rounded to the upper boundary for the given bucket
This should reduce the number of resizings for the returned byte buffers.
2020-08-16 22:13:30 +03:00
Roman Khavronenko
f5f59896ec lib/decimal: rename significant decimal digits to significant figures (#698)
The previous notion was inconsistent with what `decimal.Round` does.
According to [wiki](https://en.wikipedia.org/wiki/Significant_figures) rounding
applied to all significant figures, not just decimal ones.
2020-08-16 17:21:35 +03:00
Aliaksandr Valialkin
147c35ebd4 all: allow using KB, MB, GB, KiB, MiB and GiB suffixes in command-line flag values related to byte sizes or byte rates 2020-08-16 17:05:52 +03:00
Aliaksandr Valialkin
7c0d6a8b88 lib/memory: improve log message about the memory allowed to use by VictoriaMetrics 2020-08-16 16:04:11 +03:00
Aliaksandr Valialkin
ed00eb3f33 lib/protoparser: removed unnecessary call to SetReadDeadline when reading a stream of data
The OS should return any buffered data in the stream without the need to set the read timeout.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/696
2020-08-15 15:38:08 +03:00
Aliaksandr Valialkin
7615a3ab8d vendor: upgrade github.com/valyala/gozstd from v1.7.1 to v1.8.3 2020-08-15 15:11:56 +03:00
Aliaksandr Valialkin
7be9bedaf9 vendor: downgrade github.com/valyala/gozstd from v1.8.1 to v1.7.1 until https://github.com/facebook/zstd/issues/2222 is fixed 2020-08-15 14:46:32 +03:00
Aliaksandr Valialkin
00b1659dde lib: dump compressed block contents on error during decompression
This should improve detecting root cause for https://github.com/facebook/zstd/issues/2222
2020-08-15 14:44:33 +03:00
Aliaksandr Valialkin
528e25bdde vendor: update github.com/valyala/gozstd from v1.7.0 to v1.8.1 2020-08-15 13:46:43 +03:00
Aliaksandr Valialkin
b3849a90fd lib/leveledbytebufferpool: pre-allocate byte slice with the given capacity if the pool is empty
This should reduce memory allocations and copying when the byte slice is growing.
2020-08-15 01:40:54 +03:00
Aliaksandr Valialkin
7d89fafe1a app/vmselect/promql: allow passing multiple args to aggregate functions such as avg(q1, q2, q3) 2020-08-15 01:15:09 +03:00
Aliaksandr Valialkin
cd96248480 docs/vmagent.md: mention that gaps in remote storage may appear if vmagent cannot keep up with data ingestion 2020-08-14 20:47:57 +03:00
Aliaksandr Valialkin
7554be172d lib/protoparser: move common code for detecting timeouts to ReadLinesBlockExt
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/696
2020-08-14 20:40:15 +03:00
Aliaksandr Valialkin
4beab7ad39 lib/protoparser: prevent from busy loop on repeated timeout errors when reading streams of ingested data
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/696
2020-08-14 20:14:11 +03:00
Aliaksandr Valialkin
41d23f84ed docs/Cluster-VictoriaMetrics.md: sync with upstream 2020-08-14 19:15:29 +03:00
Aliaksandr Valialkin
184670fb9b docs: update docs 2020-08-14 19:13:42 +03:00
Aliaksandr Valialkin
52791fd1c0 lib/memory: add -memory.allowedBytes command-line flag for setting absolute memory limit for VictoriaMetrics caches 2020-08-14 19:13:38 +03:00
Aliaksandr Valialkin
576da0fe46 app/{vminsert,vmagent}: improve documentation for -influxListenAddr command-line flag 2020-08-14 18:04:44 +03:00
Aliaksandr Valialkin
215967437d lib/protoparser/prometheus: typo fix in error message 2020-08-14 11:04:23 +03:00
Aliaksandr Valialkin
d1ad3adcbe vendor: make vendor-update 2020-08-14 02:29:02 +03:00
Aliaksandr Valialkin
42960feff4 vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.4 to v1.0.5 2020-08-14 02:19:36 +03:00
Aliaksandr Valialkin
07246bc31c vendor: update github.com/klauspost/compress from v1.10.10 to v1.10.11 2020-08-14 02:17:07 +03:00
Aliaksandr Valialkin
e646674b23 lib/promscrape: use a hint on body length instead of body capacity
This should reduce memory usage for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/689
2020-08-14 01:17:52 +03:00
Aliaksandr Valialkin
4628deecd1 lib/promscrape: reduce memory usage when scraping big number of targets
Thanks to @dxtrzhang for the original idea at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/688

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/689
2020-08-14 01:04:53 +03:00
Aliaksandr Valialkin
eead3ee8ec lib/promscrape: properly retry requests on the server closed connection before returning the first response byte error during service discover API calls and target scrapes 2020-08-13 22:31:52 +03:00
Aliaksandr Valialkin
c402265e88 all: support %{ENV_VAR} placeholders in yaml configs in all the vm* components
Such placeholders are substituted by the corresponding environment variable values.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/583
2020-08-13 17:15:25 +03:00
Aliaksandr Valialkin
ff495a74f6 deployment/docker: update Go builder from Go1.14.7 to Go1.15.0 2020-08-13 15:53:32 +03:00
Aliaksandr Valialkin
45962fb8c2 docs/Cluster-VictoriaMetrics.md: mention about Kubernetes operator 2020-08-12 21:15:34 +03:00
Aliaksandr Valialkin
fd6c690276 docs/Single-server-VictoriaMetrics.md: mention helm charts, k8s operator and vmctl tool in Integrations chapter 2020-08-12 21:12:23 +03:00
Aliaksandr Valialkin
e730788477 docs/Articles.md: added https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043 2020-08-12 21:03:05 +03:00
Aliaksandr Valialkin
ef7e2af8f5 app: respect CPU limits set via cgroups
Update GOMAXPROCS to limits set via cgroups. This should reduce CPU thrashing and reduce memory usage
for cases when VictoriaMetrics components run in containers with CPU limits.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685
2020-08-11 22:59:19 +03:00
Aliaksandr Valialkin
15aa6142ef lib/protoparser: clarify that the string passed to Unmarshal() function must remain available when the parsed rows are in use 2020-08-11 17:04:39 +03:00
Aliaksandr Valialkin
5492edcc6c docs/Single-server-VictoriaMetrics.md: mention that it is safe to skip multiple versions during the upgrade 2020-08-11 14:21:37 +03:00
Aliaksandr Valialkin
e969ef2639 app/vmselect: reduce memory usage when exporting time series with big number of samples via /api/v1/export if max_rows_per_line is set to non-zero value
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685
2020-08-10 20:57:36 +03:00
Aliaksandr Valialkin
c098988a18 lib/protoparser/influx: accept precision=us and precision=µ according to https://docs.influxdata.com/influxdb/v1.8/tools/api/#write-http-endpoint 2020-08-10 20:23:26 +03:00
Aliaksandr Valialkin
1bdfa29ef7 lib/promscrape: optimize per-metric hash calculations
This increases vmagent performance by up to 10% when scraping big number of metrics
2020-08-10 19:49:03 +03:00
Aliaksandr Valialkin
8adba82c02 app/vmselect/netstorage: vary batch size for data unpacking depending on the available CPU cores
This should reduce contention on the channel with unpack work for systems with high number of CPU cores
2020-08-10 15:16:42 +03:00
Aliaksandr Valialkin
8d9eb5f808 lib/storage: mention time range used in the query that led to error message
This should improve detecting slow queries with too big time ranges
2020-08-10 13:46:36 +03:00
Aliaksandr Valialkin
582c74cd93 lib/storage: mention tag filters used in the query that led to error message
This should improve detecting invalid or heavy queries that lead to errors.
2020-08-10 13:36:49 +03:00
Aliaksandr Valialkin
f3d33e23c9 app/vmstorage: improve error logging when the request times out 2020-08-10 13:23:26 +03:00
Aliaksandr Valialkin
455bf50a91 lib/promscrape: show real timestamp and real duration for the scrape on /targets page
Previously the scrape duration may be negative when calculated scrape timestamp drifts away from the real scrape timestamp
2020-08-10 12:40:25 +03:00
Aliaksandr Valialkin
2791008e19 vendor: make vendor-update 2020-08-09 15:13:55 +03:00
Aliaksandr Valialkin
a499de45cc lib/promscrape: make errcheck happy 2020-08-09 13:17:18 +03:00
Aliaksandr Valialkin
23c9e6b727 lib/promscrape: export scrape_samples_added per-target metric like Prometheus does
This metric may be useful for detecting targets with high churn rate for the exported metrics.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/683
2020-08-09 12:45:39 +03:00
Aliaksandr Valialkin
9d32fb1d9e lib/fs: use WARN instead of ERROR log level for the message when NFS directory removal temporarily fails
this is expected condition, so it is better to use WARN log level for it
2020-08-09 12:07:30 +03:00
Aliaksandr Valialkin
d4b6d22987 lib/promscrape: add a test for scrape config for blackbox exporter
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/684
2020-08-09 12:02:48 +03:00
Roman Khavronenko
0be5b09fb4 app/vmalert: extend metrics set exported by vmalert #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573

New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
 ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;

Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.

Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13

* app/vmalert: extend metrics set exported by `vmalert` #573

The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.

The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 09:41:29 +03:00
ofen
81746d14b9 401 Unauthorize HTTP error added (#681)
401 Unauthorize HTTP error added to trigger browser credentials pop-up prompt [RFC 7235 https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication]
2020-08-09 09:38:41 +03:00
Aliaksandr Valialkin
807c2b076c vendor: update github.com/VictoriaMetrics/metrics from v1.12.2 to v1.12.3 2020-08-07 13:02:51 +03:00
Aliaksandr Valialkin
84fd8af6d3 lib/storage: slow down concurrent searches when the number of concurrent inserts reaches the limit
This should improve data ingestion performance when heavy searches are executed

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/618
2020-08-07 08:49:40 +03:00
Aliaksandr Valialkin
9043a509a3 lib/storage: properly check timeouts and pace limits
Previously they were checked on every iteration for small number of iterations
2020-08-07 08:40:37 +03:00
Aliaksandr Valialkin
1ad3de5c54 deployment/docker: update Go builder from v1.14.6 to v1.14.7 2020-08-07 08:29:06 +03:00
Aliaksandr Valialkin
d60908bba4 docs/MetricsQL.md: mention that MetricsQL removes all the NaN values from results 2020-08-07 07:51:45 +03:00
Aliaksandr Valialkin
716754fae6 app/vmselect/promql: properly handle -n^m like Prometheus does
`-n^m` must be handled as `-(n^m)` instead of `(-n)^m`.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/675
2020-08-07 07:42:18 +03:00
Aliaksandr Valialkin
bb61a4769b app/vmselect/promql: remove metric name after applying clamp_min and clamp_max functions in order to be consistent with Prometheus
This improves VictoriaMetrics score at https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 23:42:37 +03:00
Aliaksandr Valialkin
ac45082216 app/vmselect/promql: remove metric name after applying ceil, floor and round functions in order to be more consistent with Prometheus
This improves VictoriaMetrics score at https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 23:34:37 +03:00
Aliaksandr Valialkin
e5202a4eae app/vmselect/promql: remove metric name from results of certain rollup functions in order to be consistent with Prometheus
Rollup functions:

  - avg_over_time
  - min_over_time
  - max_over_time
  - quantile_over_time

This improves VictoriaMetrics results at https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 23:29:13 +03:00
Aliaksandr Valialkin
68e4f40a72 app/vmselect: properly handle PromQL queries like scalar1 < metric < scalar2 like Prometheus does
This fixes some cases from https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 23:21:03 +03:00
Aliaksandr Valialkin
ada2ae69ec vendor: update github.com/VictoriaMetrics/metricsql from v0.2.10 to v0.3.0
This adds support for special integers in MetricsQL that start from 0x, 0b, 0o.
This improves compatibility with PromQL - see https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 21:45:21 +03:00
Aliaksandr Valialkin
bc8381613d app/vmselect: reduce memory allocations by pre-allocating memory for time series map and for a list of time series names 2020-08-06 19:17:58 +03:00
Aliaksandr Valialkin
8e44fba76d lib/storage: reduce the frequency (and overhead) for timeout and pace limiter checks by 4x 2020-08-06 18:45:55 +03:00
Aliaksandr Valialkin
7dbe335426 lib/pacelimiter: increase scalability for multi-CPU system 2020-08-06 18:32:59 +03:00
Aliaksandr Valialkin
3f85c06b65 app/vmselect/netstorage: reduce CPU contention when unpacking time series blocks by unpacking batches of such blocks instead of a single block
This should improve query performance on systems with big number of CPU cores (16 and more)
2020-08-06 17:50:17 +03:00
Aliaksandr Valialkin
d20c2156e4 app/vmselect/netstorage: reduce contention on unpackworkCh and timeseriesWorkCh for multi-CPU system by providing more capacity for these chans 2020-08-06 17:22:48 +03:00
Aliaksandr Valialkin
ad730d8a17 lib/storage: optimize prefetching metric names for the given metricIDs 2020-08-06 16:53:10 +03:00
Aliaksandr Valialkin
dbbdfbe7ee app/vmstorage: rename vm_cache_size_entries{type="storage/prefetchedMetricIDs"} to vm_cache_entries{type="storage/prefetchedMetricIDs"} to be consistent with other vm_cache_entries metrics 2020-08-06 16:34:24 +03:00
Aliaksandr Valialkin
639b26b40c lib/fs: export vm_nfs_pending_dirs_to_remove metric for monitoring the number of pending directories that couldn't be removed due to NFS lock 2020-08-06 15:31:34 +03:00
Aliaksandr Valialkin
8f16388428 lib/storage: limit the number of concurrent calls to storage.searchTSIDs to GOMAXPROCS*2
This should limit the maximum memory usage and reduce CPU thrashing on vmstorage
when multiple heavy queries are executed.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
2020-08-05 18:30:07 +03:00
Aliaksandr Valialkin
aaa497ff0b Perform conversion from string to []byte according to rule #6 at https://golang.org/pkg/unsafe/#Pointer 2020-08-05 11:55:58 +03:00
Aliaksandr Valialkin
ef94333808 vendor: make vendor-update 2020-08-05 11:10:10 +03:00
Aliaksandr Valialkin
c25b0c2cd5 app/vmagent: tune http client for sending data to remote storage in order to disable closing keep-alive connections
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/663
2020-08-04 21:00:29 +03:00
Aliaksandr Valialkin
5d0c37bec0 app/vmselect: use warning level instead of info level for logging slow queries that take longer than -search.logSlowQueryDuration 2020-08-04 20:25:35 +03:00
Antonin Kral
bba1442649 Add option to build 32b ARM Debian package (armhf) (#665) 2020-08-04 18:12:59 +03:00
Aliaksandr Valialkin
a9ffd233df docs/Single-server-VictoriaMetrics.md: add a chapter about data updates 2020-08-04 13:53:59 +03:00
Aliaksandr Valialkin
a034f02fb2 lib/backup: allow using ~/.aws/config without region
Use us-west-2 for determining bucket region.
2020-08-04 13:07:59 +03:00
Aliaksandr Valialkin
e6eee2bebf app/vmselect/promql: add zscore-related functions: zscore_over_time(m[d]) and zscore(q) by (...) 2020-08-03 21:52:18 +03:00
Aliaksandr Valialkin
509d12643b app/vmselect: show X-Forwarded-For contents on /api/v1/status/active_queries page
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/659
2020-07-31 20:05:18 +03:00
Aliaksandr Valialkin
5e71fab8a6 lib/storage: reduce the maximum number of concurrent merge workers to GOMAXPROCS/2
Previously the limit has been raised to GOMAXPROCS, but it turned out that this
increases query latencies since more CPUs are busy with merges.

While at it, substitute `*MergeConcurrencyLimitCh` channels with simple integer limits.
2020-07-31 17:46:56 +03:00
Aliaksandr Valialkin
d01f3c1943 all: add missing APP_NAME to vm*-GOARCH builds 2020-07-31 13:42:18 +03:00
Aliaksandr Valialkin
3f498cf2dc docs/{vmagent,vmalert}: add instruction on how to build for ARM 2020-07-31 09:25:22 +03:00
Aliaksandr Valialkin
8c8c14c127 docs/Single-server-VictoriaMetrics.md: mention that downgrade is also safe to perform 2020-07-31 09:20:40 +03:00
Aliaksandr Valialkin
44a86e1be3 vendor: update github.com/valyala/quicktemplate from v1.5.2 to v1.6.0 2020-07-30 23:39:40 +03:00
Aliaksandr Valialkin
f0c678c41b app/vmselect: do not adjust start and end query args passed to /api/v1/query_range when -search.disableCache command-line flag is set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/563
2020-07-30 23:14:37 +03:00
Aliaksandr Valialkin
e255c066cc docs/vmalert.md: sync with app/vmalert/README.md 2020-07-30 21:56:48 +03:00
Aliaksandr Valialkin
e7959094f6 lib/storage: remove prioritizing of merging small parts over merging big parts, since it doesn't work as expected
The prioritizing could lead to big merge starvation, which could end up in too big number of parts that must be merged into big parts.

Multiple big merges may be initiated after the migration from v1.39.0 or v1.39.1. It is OK - these merges should be finished soon,
which should return CPU and disk IO usage to normal levels.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/618
2020-07-30 19:57:27 +03:00
Aliaksandr Valialkin
922d9aadf2 lib/storage: properly update vm_slow_row_inserts_total metric when importing multiple data points per time series at once
Previously the `vm_slow_row_inserts_total` metric may be incremented multiple times for different data points per a single time series,
while only a single increment is needed when inserting the first data point for this time series.
2020-07-30 16:17:24 +03:00
Aliaksandr Valialkin
68716488db vendor: update github.com/valyala/quicktemplate from v1.5.1 to v1.5.2 2020-07-29 18:20:11 +03:00
Aliaksandr Valialkin
67a64c142d lib/httpserver: emit X-Forwarded-For additionally to remoteAddr in error logs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/659
2020-07-29 13:12:42 +03:00
Aliaksandr Valialkin
328b52e5ff app/vmselect/promql: return non-empty value from rate_over_sum(m[d]) even if a single data point is located in the given [d] window
Just divide the data point value by the window duration in this case.
2020-07-29 12:37:58 +03:00
Aliaksandr Valialkin
700737c181 app/vmselect/promql: remove rollupFuncArg.realPrevValue handling, since the corner case in increase() is handled in another way now
See e00cfc854d for the approach used now.
2020-07-29 12:37:58 +03:00
Aliaksandr Valialkin
2f735f112d app/vmselect/promql: fill gaps with 0 in rate_over_sum response when the last value before the selected time window isn't empty 2020-07-29 12:37:58 +03:00
Aliaksandr Valialkin
1ca0c8a29b vendor: make vendor-update 2020-07-29 09:36:08 +03:00
Aliaksandr Valialkin
d81d586b86 vendor: update github.com/VictoriaMetrics/metrics from v1.12.1 to v1.12.2 2020-07-28 22:02:29 +03:00
Aliaksandr Valialkin
0f63da3698 app/{vmagent,vminsert}: properly preserve db tag from query string passed to Influx line protocol query
Previously `db` tag from the query string wasn't added to metrics after encountering `db` tag in the Influx line

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/653
2020-07-28 21:25:19 +03:00
Aliaksandr Valialkin
62ed38c6f0 app/vmagent/remotewrite: add missing resp.Body.Close() after pushing data to remote storage
Missing body close could disable HTTP keep-alive connections.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/653
2020-07-28 21:00:15 +03:00
Aliaksandr Valialkin
79c30cf4cb app/vmselect: show query origin (aka remote_addr or client address) on the /api/v1/status/active_queries page for every query 2020-07-28 15:13:08 +03:00
Roman Khavronenko
2f1e7298ce app/vmalert: support external.label to specify global labelset for all rules #622 (#652)
`external.label` flag supposed to help to distinguish alert or recording rules
source in situations when more than one `vmalert` runs for the same datasource
or AlertManager.
2020-07-28 14:20:31 +03:00
Aliaksandr Valialkin
0da202023b app/vmselect/promql: return empty values from group() if all the time series have no values at the given timestamp
This aligns `group()` behaviour to Prometheus
2020-07-28 13:40:11 +03:00
Aliaksandr Valialkin
48d0ec1363 docs/MetricsQL.md: small fixes in the docs 2020-07-28 13:27:37 +03:00
Aliaksandr Valialkin
a1a065a47e docs/Single-server-VictoriaMetrics.md: mention that OpenTSDB data ingestion protocol is used by KairosDB 2020-07-28 13:11:07 +03:00
Aliaksandr Valialkin
0516e3f330 vendor: update github.com/VictoriaMetrics/metrics from v1.12.0 to v1.12.1 2020-07-28 00:20:43 +03:00
Sasasu
5b81bdde39 lib/storage: metaindexRow use memory more efficiently (#655)
due to memory alignment the metaindexRow structure uses 64 bytes per object.
this commit changes the order of fields, making metaindexRow use 56 bytes per
object.

Signed-off-by: Sasasu <su@sasasu.me>
2020-07-27 19:02:53 +03:00
Aliaksandr Valialkin
865610a7c8 lib/protoparser/prometheus: add a test for cassandra-exporter
Thanks to Seva
2020-07-27 18:37:11 +03:00
Aliaksandr Valialkin
cb8c6908dc app/vmagent/remotewrite: create new request on failure to send a block of data to remote storage
Previously the request body was already consumed before the retry, so this led to the following error:

    http: ContentLength=... with Body length 0
2020-07-27 17:32:46 +03:00
Aliaksandr Valialkin
894dcb7c1c app/vmselect/promql: improve further the accuracy of buckets_limit() function
The accuracy is increased by merging the smallest bucket with the smallest adjacent bucket.
2020-07-26 12:10:13 +03:00
Aliaksandr Valialkin
215eba0b82 app/vminsert: flush bufs if needed after the current row is added
Previously the data for the added row could be overwritten by the flush
before the row addition is complete.
2020-07-26 12:10:11 +03:00
Aliaksandr Valialkin
edb1eca6f1 app/vmselect/promql: avoid dropping inf bucket in buckets_limit
The `le="inf"` bucket must be preserved in order to maintain the maximum level of accuracy.
2020-07-25 17:00:36 +03:00
Aliaksandr Valialkin
97b6f5d223 app/vmselect/promql: optimize buckets_limit(k, buckets) for big number of buckets 2020-07-25 13:24:03 +03:00
Aliaksandr Valialkin
a090627059 app/vminsert: limit memory usage when ingesting data in big packets 2020-07-24 23:32:40 +03:00
Aliaksandr Valialkin
53c87ba341 deployment/docker/docker-compose.yml: update Grafana version from 7.0.3 to 7.1.1 2020-07-24 18:43:37 +03:00
Aliaksandr Valialkin
bb161497cf app/vmselect/promql: improve the accuracy of buckets_limit(k, buckets) function
Now it properly merges the bucket with the previous bucket after deletion.
2020-07-24 17:07:49 +03:00
Aliaksandr Valialkin
994fa2f3bf app/vmselect/promql: add buckets_limit(k, buckets) function, which limits the number of buckets per time series to k
This function works with both Prometheus-style and VictoriaMetrics-style buckets.
The function removes buckets with the lowest values in order to reserve the highest precision.
The function is useful for building heatmaps in Grafana from too big number of buckets.
2020-07-24 16:13:53 +03:00
Aliaksandr Valialkin
e151c5c644 app/vmselect: fix tests for rate_over_sum 2020-07-24 02:35:28 +03:00
Aliaksandr Valialkin
3107c633e3 app/vmselect/promql: typo fix after 3e557c9861 2020-07-24 02:15:58 +03:00
Aliaksandr Valialkin
3e557c9861 app/vmselect/promql: add rate_over_sum(m[d]) function to MetricsQL, which returns rate over sum of m values over d duration
Something like `sum_over_time(m[d]) / d`, but more accurate.
2020-07-24 01:17:42 +03:00
Aliaksandr Valialkin
54ef2d8112 lib/storage: slightly reduce code difference between single-node and cluster versions 2020-07-24 00:31:16 +03:00
Aliaksandr Valialkin
b1f6843bd0 app/vmselect/promql: allow setting [d] window smaller than the interval between raw points for avg_over_time
This makes `avg_over_time` behavior consistent with `sum_over_time` and `count_over_time` behaviors.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/636
2020-07-23 22:25:43 +03:00
Aliaksandr Valialkin
039c9d2441 lib/storage: respect -search.maxQueryDuration when searching for time series in inverted index
Previously the time spent on inverted index search could exceed the configured `-search.maxQueryDuration`.
This commit stops searching in inverted index on query timeout.
2020-07-23 21:21:42 +03:00
Aliaksandr Valialkin
2a45871823 lib/storage: add more fine-grained pace limiting for search 2020-07-23 19:26:08 +03:00
Aliaksandr Valialkin
461481fbdf app/vmselect/netstorage: protect from too smart compiler, which may break memory usage optimization in ProcessSearchQuery 2020-07-23 17:54:01 +03:00
Aliaksandr Valialkin
4c8b49b193 app/vminsert: export vm_relabel_metrics_dropped_total metric that shows the number of metrics dropped due to relabeling 2020-07-23 14:57:53 +03:00
Aliaksandr Valialkin
e79de9774b app/vmselect: typo fix after 34563916f7 2020-07-23 14:12:28 +03:00
Aliaksandr Valialkin
34563916f7 app/vmselect: reduce memory usage when querying big number of time series with long labels
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/646
2020-07-23 13:53:52 +03:00
Aliaksandr Valialkin
9257eee982 app/vminsert: do not call ApplyRelabeling function if relabeling is disabled
This should reduce CPU usage a bit when `-relabelConfig` isn't set
2020-07-23 13:39:44 +03:00
Aliaksandr Valialkin
6f05c4d351 lib/storage: improve prioritizing of data ingestion over querying
Prioritize also small merges over big merges.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
2020-07-23 13:23:36 +03:00
Aliaksandr Valialkin
2f612e0c67 app/vminsert: fix relabeling for metrics ingested via Influx line protocol
Previously the enabled relabeling with `-relabelConfig` command-line flag could result in missing labels
if a single Influx line protocol message contains multiple field values.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/638
2020-07-23 13:23:14 +03:00
Aliaksandr Valialkin
61c611f5ad lib/storage: properly calculate global metrics in UpdateStats() 2020-07-23 00:35:15 +03:00
Aliaksandr Valialkin
9224ede54f lib/mergeset: properly calculate global metrics in UpdateStats()
Previously these metrics could be calculated multiple times for multiple mergeset.Table instances.
2020-07-23 00:35:13 +03:00
Aliaksandr Valialkin
228d137936 lib/storage: reorder mergeBlockStreams() args in order to make them more consistent 2020-07-22 21:58:10 +03:00
Aliaksandr Valialkin
e4303d3d21 lib/storage: prevent possible race condition when all the goroutines exit Storage.AddRows, before other goroutines are blocked on searchTSIDsCond inside Storage.searchTSIDs
This condition may occur after the following sequence of events:

1) A goroutine enters the loop body when len(addRowsConcurrencyCh) == cap(addRowsConcurrencyCh) inside Storage.searchTSIDs.
2) All the goroutines return from Storage.AddRows.
3) The goroutine from step 1 blocks on searchTSIDsCond.Wait() inside the loop body.

The goroutine remains blocked until the next call to Storage.AddRows, which calls searchTSIDsCond.Signal().
This may take indefinite time.
2020-07-22 21:52:34 +03:00
Aliaksandr Valialkin
ad8d3b387d docs/Single-server-VictoriaMetrics.md: mention that it is recommended inspecting logs during troubleshooting 2020-07-22 18:21:29 +03:00
Aliaksandr Valialkin
62e76ca805 vendor: make vendor-update 2020-07-22 16:54:44 +03:00
Aliaksandr Valialkin
4f526cc816 app/vmselect/prometheus: support d, w and y suffixes for durations passed to step in /api/v1/query_range like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/641
2020-07-22 16:26:18 +03:00
Aliaksandr Valialkin
dfb113f175 app/vmselect/netstorage: reduce memory allocations when unpacking time series data by using a pool for unpackWork entries
This should slightly reduce load on GC when processing queries that touch big number of time series.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/646 according to the provided memory profile there.
2020-07-22 15:03:57 +03:00
Aliaksandr Valialkin
31ae5911a8 app/vmagent: add -remoteWrite.decimalPlaces command-line flag, which may be used for reducing disk space usage on the remote storage 2020-07-21 21:55:32 +03:00
Aliaksandr Valialkin
d3442b40b2 lib/uint64set: optimize adding items to the set via Set.AddMulti 2020-07-21 20:56:59 +03:00
Aliaksandr Valialkin
caa2952aa6 app/vmselect: take into account the time spent in wait queue before query execution as time spent on the query 2020-07-21 19:00:09 +03:00
Aliaksandr Valialkin
e00cfc854d app/vmselect/promql: skip the first value in time series passed to increase() if it exceeds by more than 10x the delta between the next value and the first value
This should prevent inflated `increase()` results for time series that start from big initial values.
Such cases may occur when a label value changes in a metric without counter reset.
2020-07-21 17:24:10 +03:00
Aliaksandr Valialkin
b9c8f6bf34 app/vmselect: log the total available memory for concurrent requests on not enough memory errors
This should simplify root cause analysis
2020-07-20 19:51:40 +03:00
Aliaksandr Valialkin
ad6290953c app/vmagent: add -remoteWrite.proxyURL command-line option
This option allows writing data to `-remoteWrite.url` via http, https or socks5 proxy.
This is similar to `proxy_url` option in `remote_write` section of Prometheus.
See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
2020-07-20 19:28:49 +03:00
Aliaksandr Valialkin
efcbb51968 docs/vmagent.md: sync with app/vmagent/README.md 2020-07-20 17:08:34 +03:00
Roman Khavronenko
ed0df37ee7 app/vmagent: mention grafana dashboard in README (#639) 2020-07-20 17:07:27 +03:00
Aliaksandr Valialkin
004d2924e2 vendor: update github.com/VictoriaMetrics/metrics from v1.11.3 to v1.12.0 2020-07-20 16:56:22 +03:00
Aliaksandr Valialkin
11be704109 app/vmagent/remotewrite: allow passing empty -remoteWrite.urlRelabelConfig entries 2020-07-20 15:49:27 +03:00
Aliaksandr Valialkin
5a4675c528 app/vmselect/prometheus: do not return time series with empty list of datapoints from /api/v1/query_range
This matches Prometheus behaviour.

This should fix https://github.com/jacksontj/promxy/issues/329
2020-07-20 15:31:21 +03:00
Aliaksandr Valialkin
ecb1b2564a app/vmselect/promql: add mode() aggregate function 2020-07-20 15:31:20 +03:00
Aliaksandr Valialkin
b35cb293f5 lib/httpserver: log remote address in error message from httpserver.Errorf
This should improve detection of the root cause of errors.
Thanks to Anant for the idea.
2020-07-20 14:11:22 +03:00
Aliaksandr Valialkin
1c641037e8 app/vmselect/promql: add mode_over_time(m[d]) function
See https://en.wikipedia.org/wiki/Mode_(statistics) and https://stackoverflow.com/questions/61134078/promql-query-to-return-the-value-from-a-range-vector-which-occurs-maximum-no-of
2020-07-17 18:28:45 +03:00
Aliaksandr Valialkin
6b5ad535ae app/vmselect/promql: optimize group(rollup(m)) calculations 2020-07-17 16:47:16 +03:00
Aliaksandr Valialkin
8949d65ad1 app/vmselect/promql: check that any() doesn't touch metric name 2020-07-17 16:23:21 +03:00
Aliaksandr Valialkin
3198fd31fa deployment/docker: update Go builder from v1.14.5 to v1.14.6
This fixes runtime issues found in Go since v1.14.5. See https://github.com/golang/go/issues?q=milestone%3AGo1.14.6+label%3ACherryPickApproved
2020-07-17 15:21:38 +03:00
Aliaksandr Valialkin
aa5d88055d app/vmselect/promql: add group() aggregate function to MetricsQL
This function has been added in Prometheus 2.20. See https://github.com/prometheus/prometheus/pull/7480
2020-07-17 15:17:55 +03:00
Aliaksandr Valialkin
df01836818 app/vmselect/promql: keep all labels for time series from any() call 2020-07-17 15:17:54 +03:00
Roman Khavronenko
dfa156e6aa vmagent: update grafana dashboard (#634)
* reference datasource variable instead of datasource name;
* change unit from `bytes` to `bits/s` for Network panel.
2020-07-17 02:11:20 +03:00
Aliaksandr Valialkin
8c14ca93fa app/vminsert/influx: properly handle the case when certain labels with empty values are removed by ApplyRelabeling() call
Previously this could lead to `out of range` panic
2020-07-17 00:07:06 +03:00
Aliaksandr Valialkin
e4e1cd1de2 app/vmselect: fix nil pointer dereference panic when unsuccessfully querying vmstorage 2020-07-16 19:15:43 +03:00
Aliaksandr Valialkin
ef6ee72108 deployment/docker: update Go builder from v1.14.4 to v1.14.5
This should fix the following issues in Go - https://github.com/golang/go/issues?q=milestone%3AGo1.14.5+label%3ACherryPickApproved
2020-07-16 18:55:09 +03:00
Aliaksandr Valialkin
ed7580ad22 app/vmalert: consistently use "%w" instead of "%s" in fmt.Errorf when wrapping errors 2020-07-15 13:56:47 +03:00
Roman Khavronenko
9eb71dda3d vmagent: add grafana dashboard (#629)
`vmagent` Grafana dashboard is supposed to provide basic observability over multiple
`vmagent` instances. Dashboard is saved in Grafana export format so it can be easily
imported. It was also integrated into docker-compose environment.
2020-07-15 13:56:06 +03:00
Aliaksandr Valialkin
328814ee60 docs/vmagent.md: make filtering rules for init container pods less confusing 2020-07-14 20:32:47 +03:00
Aliaksandr Valialkin
7398e5701b vendor: make vendor-update 2020-07-14 20:31:42 +03:00
Aliaksandr Valialkin
4e770e9120 docs/Single-server-VictoriaMetrics.md: remove Roadmap chapter, since it became outdated 2020-07-14 19:06:33 +03:00
Aliaksandr Valialkin
b442a42d8e app/vmagent/remotewrite: return proper value from tssRelabelPool.New
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/599
2020-07-14 14:29:20 +03:00
Aliaksandr Valialkin
6d77bfae4f docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-07-14 14:19:14 +03:00
Aliaksandr Valialkin
4081e2295e app/{vminsert,vmagent}: add -influxSkipMeasurement command-line flag for using field name as metric name
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/626
2020-07-14 14:17:24 +03:00
Aliaksandr Valialkin
e1107fec10 lib/storage: reset MetricName->TSID cache after marking metricIDs as deleted
This is a follow-up commit after 12b16077c4 ,
which didn't reset the `tsidCache` in all the required places.
This could result in indefinite errors like:

    missing metricName by metricID ...; this could be the case after unclean shutdown; deleting the metricID, so it could be re-created next time

Fix this by resetting the cache inside deleteMetricIDs function.
2020-07-14 14:06:32 +03:00
Aliaksandr Valialkin
25f80d320b app/vmselect/prometheus: do not adjust last points in time series with timestamps exceeding the current time
Such timestamps usually mean that the query contains `offset`.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/625
2020-07-14 12:52:16 +03:00
Aliaksandr Valialkin
cde18d1f43 lib/protoparser: properly update vm_protoparser_rows_read_total{type="promscrape"} metric 2020-07-14 12:16:35 +03:00
Seva Poliakov
457e61900d add vm_protoparser_rows_read_total metrics to promscrape (#624)
* add vm_protoparser_rows_read_total metrics to promscrape

move vm_protoparser_rows_read_total for promscrape to better place

move vm_protoparser_rows_read_total for promscrape to better place

* remove possibility of infinite loop in prometheus parser
2020-07-14 12:16:34 +03:00
Roman Khavronenko
7e347972c4 lib/flagutil: specify additional description for all Array type flags (#620)
Array type flag is now defined as `value` type in flag description when printed.
This change adds additional description to every Array type flag so it would be
clear what exact type is used:
```
  -remoteWrite.urlRelabelConfig array
        Optional path to relabel config for the corresponding -remoteWrite.url
        Supports array of values separated by comma or specified via multiple flags.
```
2020-07-13 21:56:37 +03:00
Roman Khavronenko
19dd121968 lib/persistentqueue: add vm_persistentqueue_bytes_pending metric (#619)
Metric `vm_persistentqueue_bytes_pending` is a gauge that shows current amount
of bytes in persistentqueue flushed on disk as a difference between write and read
offsets. This metric is very similar to `vmagent_remotewrite_pending_data_bytes`
except of accounting for bytes in-memory.
2020-07-13 21:54:09 +03:00
Roman Khavronenko
829ec4f9cf Extend metric vm_promscrape_targets with status label (#615)
The change to `vm_promscrape_targets` metric is supposed to improve observability
for `vmagent` so it will be possible to track how many targets are up or down
for every specific scrape group:
```
vm_promscrape_targets{type="static_configs", status="down"} 1
vm_promscrape_targets{type="static_configs", status="up"} 2
```
2020-07-13 21:52:03 +03:00
Aliaksandr Valialkin
55d83e777d app/vmselect/prometheus: minimize the diff for the change 1033dc7e2a over 619b0a25c9 2020-07-13 21:40:38 +03:00
faceair
1033dc7e2a fix empty response template (#617) 2020-07-13 21:31:19 +03:00
Aliaksandr Valialkin
619b0a25c9 docs/vmagent.md: sync with app/vmagent/README.md 2020-07-13 21:25:11 +03:00
ofen
666c795b98 Update README.md (#621)
Troubleshooting section updated to help out with duplicate targets detection
2020-07-13 21:18:54 +03:00
Aliaksandr Valialkin
a730b3f6a1 app/vmagent: fix data race when multiple -remoteWrite.urlRelabelConfig options are set
Previously multiple goroutines could access remoteWriteCtx.tss concurrently, which could lead to data race
and improper relabeling. Now each goroutine has its own copy of tss during relabeling.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/599
2020-07-10 15:16:59 +03:00
Aliaksandr Valialkin
508ad46e0e app/vmagent/remotewrite: typo fix in -remoteWrite.showURL help message 2020-07-10 14:07:08 +03:00
Aliaksandr Valialkin
e5b9f47623 vendor: update github.com/valyala/quicktemplate from v1.5.0 to v1.5.1
This should fix incorrect encoding for json strings with char codes below 0x20

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/613
2020-07-10 12:59:15 +03:00
Aliaksandr Valialkin
ca74b80f10 docs/Cluster-VictoriaMetrics.md: sync with the original README.md 2020-07-10 12:15:31 +03:00
Aliaksandr Valialkin
cba820e390 app/{vminsert,vmagent}: add ability to import data in Prometheus exposition format via /api/v1/import/prometheus 2020-07-10 12:14:07 +03:00
Aliaksandr Valialkin
6fe3c48a6e properly calculate readCalls 2020-07-10 12:00:58 +03:00
Aliaksandr Valialkin
9c350bc20d app/vmselect/promql: add missing tests for ifnot binary operation 2020-07-09 13:24:06 +03:00
Aliaksandr Valialkin
256fd9a87e app/vmselect/promql: refactor implementations for and and unless binary operations, so they are closer to or implementation 2020-07-09 13:05:55 +03:00
Aliaksandr Valialkin
2d9b3ad5b3 app/vmselect/promql/active_queries.go: simplify code a bit by inlining getNextActiveQueryID function 2020-07-09 11:18:30 +03:00
Aliaksandr Valialkin
b66c7c13ac docs: add a link to the The CMS monitoring infrastructure and applications publication from CERN 2020-07-08 20:16:43 +03:00
Aliaksandr Valialkin
3e1d7d8489 lib/promscrape: send Accept header similar to Prometheus when scraping targets
This should fix scraping Spring Boot servers, which return incorrect response
unless `Accept: text/plain` request header is set.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/608
2020-07-08 19:48:22 +03:00
Aliaksandr Valialkin
47c7ea5c60 vendor: make vendor-update 2020-07-08 19:25:38 +03:00
Aliaksandr Valialkin
4f737d1cbd docs/Cluster-VictoriaMetrics.md: mention about api/v1/status/active_queries page
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/528
2020-07-08 19:18:26 +03:00
Aliaksandr Valialkin
742da690f4 app/vmselect: add /api/v1/status/active_queries page with the list of currently running queries
This is a follow-up for https://github.com/VictoriaMetrics/VictoriaMetrics/pull/598

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/575
2020-07-08 18:55:38 +03:00
DexterZhang
99f54e44ff feat(vmselect): add current running query list, add ability for getting the running query info and killing running query for master branch (#598) 2020-07-08 18:52:55 +03:00
Aliaksandr Valialkin
cb92113632 lib/storage: limit the maximum concurrency for data ingestion to GOMAXPROCS
Previously the concurrency has been limited to GOMAXPROCS*2. This made little sense,
since every call to Storage.AddRows is bound to CPU, so the maximum ingestion bandwidth
is achieved when the number of concurrent calls to Storage.AddRows is limited to the number of CPUs,
i.e. to GOMAXPROCS.
2020-07-08 17:32:18 +03:00
Roman Khavronenko
e7557e0252 lib/protoparser: fix metric name of unmarshal errors in promremotewrite (#607)
The change fixes the typo in metric name `vm_protoparser_unmarshal_errors` to
respect the naming standard.
2020-07-08 14:18:41 +03:00
Aliaksandr Valialkin
e59b9916aa lib/protoparser/graphite: go fmt 2020-07-08 14:12:10 +03:00
Aliaksandr Valialkin
d0b694c5c8 lib/protoparser/graphite: add more tests after eb45185eef
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/610
2020-07-08 14:10:35 +03:00
Seva Poliakov
eb45185eef Fix graphite minus one timestamp (#609)
* fix graphite -1 timestamp

* format the graphite fix -1 timestamp
2020-07-08 13:59:19 +03:00
Aliaksandr Valialkin
32b9fb58b8 lib/storage: clarify out of retention period error message by mentioning -retentionPeriod command-line flag 2020-07-08 13:54:26 +03:00
Aliaksandr Valialkin
12b16077c4 lib/storage: reset MetricName->TSID cache after deleting time series
This should prevent from adding new data points to deleted time series
without the need to check for the deleted time series.

This improves ingestion performance a bit when the `deleted time series ids` aka `dmis` set
contains big number of time series.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/596

Based on the idea from @n4mine at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/604
2020-07-06 22:01:08 +03:00
Aliaksandr Valialkin
a23806f486 lib/fs: clarify description for -fs.disableMmap command-line flag 2020-07-06 14:28:34 +03:00
Aliaksandr Valialkin
6daa5f7500 lib/storage: prioritize data ingestion over heavy queries
Heavy queries could result in the lack of CPU resources for processing the current data ingestion stream.
Prevent this by delaying queries' execution until free resources are available for data ingestion.

Expose `vm_search_delays_total` metric, which may be used for alerting when there are not enough CPU resources
for data ingestion and/or for executing heavy queries.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2020-07-05 19:42:05 +03:00
Roman Khavronenko
703def4b2e app/vmalert: add retries to remotewrite (#605)
* app/vmalert: add retries to remotewrite

Remotewrite pkg now does limited number of retries if write request failed.
This is supposed to make vmalert state persisting more reliable.

New metrics were added to remotewrite in order to track rows/bytes sent/dropped.

defaultFlushInterval was increased from 1s to 5s for sanity reasons.

* fix

* wip

* wip

* wip

* fix bits alignment bug for 32-bit systems

* fix mistakenly dropped field
2020-07-05 18:46:52 +03:00
Aliaksandr Valialkin
de137aef98 app/victoria-metrics: fix tests after the commit acf828a759 2020-07-05 18:24:41 +03:00
Aliaksandr Valialkin
acf828a759 app/vmselect/prometheus: small fixes on top of 8bb762124a 2020-07-05 18:17:06 +03:00
faceair
8bb762124a fix adjust last points avoid influence earlier value (#606) 2020-07-05 17:56:54 +03:00
Aliaksandr Valialkin
ff6a0955eb lib/promscrape: use HostClient.DoDeadline instead of HostClient.Do in order to guarantee strict deadline across multiple scrape attempts 2020-07-03 21:33:22 +03:00
Aliaksandr Valialkin
8b133e40d5 lib/promscrape: prevent from too big deadline misses on scrape retries
The maximum deadline miss duration is reduced to 2x scrape_interval in the worst case.
By default it is limited to scrape_interval configured for the given scrape target.
2020-07-03 20:41:36 +03:00
Aliaksandr Valialkin
44a54b8b3d lib/promscrape: check for nil error before checking for the returned status code when scraping targets 2020-07-03 18:37:14 +03:00
Ween
d59cdbe90c [VMAlert] Fix error log when remoteWrite queue size is full (#602)
* Fix Auto metrics relabeled errors

* Finalize auto-generated Labels

* Fix Test Errors

* fix error logs when queue is full

Co-authored-by: xinyulong <xinyulong@kuaishou.com>
2020-07-03 16:49:37 +03:00
Aliaksandr Valialkin
0b2086b7a5 app/vminsert: prevent from adding and/or selecting labels with empty values
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/600
2020-07-02 23:14:11 +03:00
Aliaksandr Valialkin
8f628cd805 app/victoria-metrics: removed debug log message when -selfScrapeInterval is set 2020-07-02 20:39:41 +03:00
Aliaksandr Valialkin
91b3482894 app/vminsert: add ability to apply relabeling to all the incoming metrics if -relabelConfig command-line arg points to a file with a list of relabel_config entries
See https://victoriametrics.github.io/#relabeling
2020-07-02 20:39:28 +03:00
Aliaksandr Valialkin
e5500bfcf2 all: typo fix: exptected -> expected 2020-07-02 18:05:52 +03:00
Aliaksandr Valialkin
5d3db3ff7c app/vmselect: add interpolate function for filling gaps with linearly interpolated values
See https://stackoverflow.com/q/62565021/274937 for details
2020-07-02 14:54:21 +03:00
Aliaksandr Valialkin
4dd3de9286 lib/promscrape: add ability to set disable_compression and disable_keepalive options in scrape_config section of the config passed to -promscrape.config
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/580
2020-07-02 14:19:14 +03:00
Aliaksandr Valialkin
8da3f773ae lib/promscrape: add -promscrape.disableKeepAlive command-line flag for disabling http keep-alive connections when scraping targets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/580
2020-07-01 02:20:20 +03:00
BigFish
9d5f5b6878 fix: spelling mistakes (#594)
Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-07-01 01:35:26 +03:00
Aliaksandr Valialkin
9a2ba5b6d1 vendor: make vendor-update 2020-07-01 01:04:58 +03:00
Aliaksandr Valialkin
b277ba8121 lib/httpserver: add Unwrap method to ErrorWithStatusCode, so As and Is functions in standard errors package may properly unwrap the error inside ErrorWithStatusCode 2020-07-01 00:54:01 +03:00
Aliaksandr Valialkin
84a37098ed app/vmstorage: add -denyQueriesOutsideRetention command-line flag for denying queries outside the configured retention
VictoriaMetrics returns `503 Service Unavailable` http error for requests with time ranges outside the configured retention
if `-denyQueriesOutsideRetention` command-line flag is set.
2020-07-01 00:21:44 +03:00
Aliaksandr Valialkin
56ccfa5218 all: use errors.As instead of type assertion for detecting net.Error 2020-07-01 00:15:34 +03:00
Aliaksandr Valialkin
7c2c8b2981 all: use errors.As for inspecting errors that implement httpserver.ErrorWithStatusCode 2020-07-01 00:04:34 +03:00
Aliaksandr Valialkin
d5dddb0953 all: use %w instead of %s for wrapping errors in fmt.Errorf
This will simplify examining the returned errors such as httpserver.ErrorWithStatusCode .
See https://blog.golang.org/go1.13-errors for details.
2020-06-30 23:05:11 +03:00
Aliaksandr Valialkin
586c5be404 lib/promscrape: add missing label sorting for autogenerated metrics
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/592
2020-06-29 22:36:12 +03:00
Ween
1cd01b5359 Fix Auto metrics relabeled errors (#593)
* Fix Auto metrics relabeled errors

* Finalize auto-generated Labels

* Fix Test Errors

Co-authored-by: xinyulong <xinyulong@kuaishou.com>
2020-06-29 22:29:29 +03:00
Roman Khavronenko
88538df267 app/vmalert: support multiple notifier urls (#584) (#590)
* app/vmalert: support multiple notifier urls (#584)

User now can set multiple notifier URLs in the same fashion
as for other vmutils (e.g. vmagent). The same is correct for
TLS setting for every configured URL. Alerts sending is done
in sequential way for respecting the specified URLs order.

* app/vmalert: add basicAuth support for notifier client (#585)

The change adds possibility to set basicAuth creds for notifier
client in the same fashion as for remote write/read and datasource.
2020-06-29 22:21:03 +03:00
Aliaksandr Valialkin
63e5ee0d29 docs: sync with upstream 2020-06-29 22:09:03 +03:00
Roman Khavronenko
eba4e92994 deployment/docker: replace Prometheus with vmagent (#589)
vmagent replaces Prometheus to perform scrapes and writes
into VictoriaMetrics installation. Prometheus datasource was
dropped, but its config was reused to feed vmagent.

Change also contains simplification in dashboard propagation
to Grafana container by removing excessive json manipulation
steps.
2020-06-29 22:05:34 +03:00
Roman Khavronenko
82ecfa3b32 app/vmalert: move flags description and initialization into subpackages
The change adds no new functionality and aims to move flags definitions
to subpackages that are using them. This should improve readability
of the main function.
2020-06-28 12:26:22 +01:00
kreedom
dc4e3f0e0b app/vmalert: properly set transport for HTTP clients
Fixes issue #586
2020-06-27 08:31:54 +01:00
Aliaksandr Valialkin
8f2e88234f docs: update the info that docker images are built on top of alpine image now
A follow-up after the commit ff624c9125
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/522
2020-06-26 13:54:10 +03:00
1411 changed files with 183063 additions and 20158 deletions

View File

@@ -9,9 +9,12 @@ assignees: ''
**Describe the bug**
A clear and concise description of what the bug is.
It would be great to [upgrade](https://victoriametrics.github.io/#how-to-upgrade) to [the latest available release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
and verify whether the bug is reproducible there.
It is also recommended reading [troubleshooting docs](https://victoriametrics.github.io/#troubleshooting).
**To Reproduce**
Steps to reproduce the behavior
Steps to reproduce the behavior.
**Expected behavior**
A clear and concise description of what you expected to happen.

View File

@@ -14,17 +14,15 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@master
uses: actions/setup-go@main
with:
go-version: 1.14
go-version: 1.15
id: go
- name: Dependencies
env:
GO111MODULE: on
run: |
go get -u golang.org/x/lint/golint
go get -u github.com/kisielk/errcheck
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.27.0
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.29.0
- name: Code checkout
uses: actions/checkout@master
- name: Build
@@ -43,7 +41,23 @@ jobs:
make victoria-metrics-arm64
make vmutils
GOOS=freebsd go build -mod=vendor ./app/victoria-metrics
GOOS=freebsd go build -mod=vendor ./app/vmagent
GOOS=freebsd go build -mod=vendor ./app/vmalert
GOOS=freebsd go build -mod=vendor ./app/vmbackup
GOOS=freebsd go build -mod=vendor ./app/vmrestore
GOOS=freebsd go build -mod=vendor ./app/vmctl
GOOS=openbsd go build -mod=vendor ./app/victoria-metrics
GOOS=openbsd go build -mod=vendor ./app/vmagent
GOOS=openbsd go build -mod=vendor ./app/vmalert
GOOS=openbsd go build -mod=vendor ./app/vmbackup
GOOS=openbsd go build -mod=vendor ./app/vmrestore
GOOS=openbsd go build -mod=vendor ./app/vmctl
GOOS=darwin go build -mod=vendor ./app/victoria-metrics
GOOS=darwin go build -mod=vendor ./app/vmagent
GOOS=darwin go build -mod=vendor ./app/vmalert
GOOS=darwin go build -mod=vendor ./app/vmbackup
GOOS=darwin go build -mod=vendor ./app/vmrestore
GOOS=darwin go build -mod=vendor ./app/vmctl
- name: Publish coverage
uses: codecov/codecov-action@v1.0.6
with:

120
CODE_OF_CONDUCT_RU.md Normal file
View File

@@ -0,0 +1,120 @@
# Кодекс Поведения участника
## Наши обязательства
Мы, как участники, авторы и лидеры обязуемся сделать участие в сообществе
свободным от притеснений для всех, независимо от возраста, телосложения,
видимых или невидимых ограничений способности, этнической принадлежности,
половых признаков, гендерной идентичности и выражения, уровня опыта,
образования, социо-экономического статуса, национальности, внешности,
расы, религии, или сексуальной идентичности и ориентации.
Мы обещаем действовать и взаимодействовать таким образом, чтобы вносить вклад в открытое,
дружелюбное, многообразное, инклюзивное и здоровое сообщество.
## Наши стандарты
Примеры поведения, создающие условия для благоприятных взаимоотношений включают в себя:
* Проявление доброты и эмпатии к другим участникам проекта
* Уважение к чужой точке зрения и опыту
* Конструктивная критика и принятие конструктивной критики
* Принятие ответственности, принесение извинений тем, кто пострадал от наших ошибок
и извлечение уроков из опыта
* Ориентирование на то, что лучше подходит для сообщества, а не только для нас лично
Примеры неприемлемого поведения участников включают в себя:
* Использование выражений или изображений сексуального характера и нежелательное сексуальное внимание или домогательство в любой форме
* Троллинг, оскорбительные или уничижительные комментарии, переход на личности или затрагивание политических убеждений
* Публичное или приватное домогательство
* Публикация личной информации других лиц, например, физического или электронного адреса, без явного разрешения
* Иное поведение, которое обоснованно считать неуместным в профессиональной обстановке
## Обязанности
Лидеры сообщества отвечают за разъяснение и применение наших стандартов приемлемого
поведения и будут предпринимать соответствующие и честные меры по исправлению положения
в ответ на любое поведение, которое они сочтут неприемлемым, угрожающим, оскорбительным или вредным.
Лидеры сообщества обладают правом и обязанностью удалять, редактировать или отклонять
комментарии, коммиты, код, изменения в вики, вопросы и другой вклад, который не совпадает
с Кодексом Поведения, и предоставят причины принятого решения, когда сочтут нужным.
## Область применения
Данный Кодекс Поведения применим во всех публичных физических и цифровых пространствах сообщества,
а также когда человек официально представляет сообщество в публичных местах.
Примеры представления проекта или сообщества включают использование официальной электронной почты,
публикации в официальном аккаунте в социальных сетях,
или упоминания как представителя в онлайн или оффлайн мероприятии.
## Приведение в исполнение
О случаях домогательства, а также оскорбительного или иного неприемлемого
поведения можно сообщить ответственным лидерам сообщества с помощью письма на info@victoriametrics.com
Все жалобы будут рассмотрены и расследованы оперативно и беспристрастно.
Все лидеры сообщества обязаны уважать неприкосновенность частной жизни и личную
неприкосновенность автора сообщения.
## Руководство по исполнению
Лидеры сообщества будут следовать следующим Принципам Воздействия в Сообществе,
чтобы определить последствия для тех, кого они считают виновными в нарушении данного Кодекса Поведения:
### 1. Исправление
**Общественное влияние**: Использование недопустимой лексики или другое поведение,
считающееся непрофессиональным или нежелательным в сообществе.
**Последствия**: Личное, письменное предупреждение от лидеров сообщества,
объясняющее суть нарушения и почему такое поведение
было неуместно. Лидеры сообщества могут попросить принести публичное извинение.
### 2. Предупреждение
**Общественное влияние**: Нарушение в результате одного инцидента или серии действий.
**Последствия**: Предупреждение о последствиях в случае продолжающегося неуместного поведения.
На определенное время не допускается взаимодействие с людьми, вовлеченными в инцидент,
включая незапрошенное взаимодействие
с теми, кто обеспечивает соблюдение Кодекса. Это включает в себя избегание взаимодействия
в публичных пространствах, а также во внешних каналах,
таких как социальные сети. Нарушение этих правил влечет за собой временный или вечный бан.
### 3. Временный бан
**Общественное влияние**: Серьёзное нарушение стандартов сообщества,
включая продолжительное неуместное поведение.
**Последствия**: Временный запрет (бан) на любое взаимодействие
или публичное общение с сообществом на определенный период времени.
На этот период не допускается публичное или личное взаимодействие с людьми,
вовлеченными в инцидент, включая незапрошенное взаимодействие
с теми, кто обеспечивает соблюдение Кодекса.
Нарушение этих правил влечет за собой вечный бан.
### 4. Вечный бан
**Общественное влияние**: Демонстрация систематических нарушений стандартов сообщества,
включая продолжающееся неуместное поведение, домогательство до отдельных лиц,
или проявление агрессии либо пренебрежительного отношения к категориям лиц.
**Последствия**: Вечный запрет на любое публичное взаимодействие с сообществом.
## Атрибуция
Данный Кодекс Поведения основан на [Кодекс Поведения участника][homepage],
версии 2.0, доступной по адресу
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
Принципы Воздействия в Сообществе были вдохновлены [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
[homepage]: https://www.contributor-covenant.org
Ответы на общие вопросы о данном кодексе поведения ищите на странице FAQ:
https://www.contributor-covenant.org/faq. Переводы доступны по адресу
https://www.contributor-covenant.org/translations.

View File

@@ -175,7 +175,7 @@
END OF TERMS AND CONDITIONS
Copyright 2019-2020 VictoriaMetrics, Inc.
Copyright 2019-2021 VictoriaMetrics, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

102
Makefile
View File

@@ -10,13 +10,16 @@ endif
GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(shell date -u +'%Y%m%d-%H%M%S')-$(BUILDINFO_TAG)'
.PHONY: $(MAKECMDGOALS)
all: \
victoria-metrics-prod \
vmagent-prod \
vmalert-prod \
vmauth-prod \
vmbackup-prod \
vmrestore-prod
vmrestore-prod \
vmctl-prod
include app/*/Makefile
include deployment/*/Makefile
@@ -30,7 +33,8 @@ publish: \
publish-vmalert \
publish-vmauth \
publish-vmbackup \
publish-vmrestore
publish-vmrestore \
publish-vmctl
package: \
package-victoria-metrics \
@@ -38,31 +42,84 @@ package: \
package-vmalert \
package-vmauth \
package-vmbackup \
package-vmrestore
package-vmrestore \
package-vmctl
vmutils: \
vmagent \
vmalert \
vmauth \
vmbackup \
vmrestore
vmrestore \
vmctl
vmutils-arm64: \
vmagent-arm64 \
vmalert-arm64 \
vmauth-arm64 \
vmbackup-arm64 \
vmrestore-arm64 \
vmctl-arm64
release-snap:
snapcraft
snapcraft upload "victoriametrics_$(PKG_TAG)_multi.snap" --release beta,edge,candidate
release: \
release-victoria-metrics \
release-vmutils
release-victoria-metrics: victoria-metrics-prod
cd bin && tar czf victoria-metrics-$(PKG_TAG).tar.gz victoria-metrics-prod && \
sha256sum victoria-metrics-$(PKG_TAG).tar.gz > victoria-metrics-$(PKG_TAG)_checksums.txt
release-victoria-metrics: \
release-victoria-metrics-amd64 \
release-victoria-metrics-arm64
release-victoria-metrics-amd64:
GOARCH=amd64 $(MAKE) release-victoria-metrics-generic
release-victoria-metrics-arm64:
GOARCH=arm64 $(MAKE) release-victoria-metrics-generic
release-victoria-metrics-generic: victoria-metrics-$(GOARCH)-prod
cd bin && \
tar --transform="flags=r;s|-$(GOARCH)||" -czf victoria-metrics-$(GOARCH)-$(PKG_TAG).tar.gz \
victoria-metrics-$(GOARCH)-prod \
&& sha256sum victoria-metrics-$(GOARCH)-$(PKG_TAG).tar.gz \
victoria-metrics-$(GOARCH)-prod \
| sed s/-$(GOARCH)// > victoria-metrics-$(GOARCH)-$(PKG_TAG)_checksums.txt
release-vmutils: \
vmagent-prod \
vmalert-prod \
vmauth-prod \
vmbackup-prod \
vmrestore-prod
cd bin && tar czf vmutils-$(PKG_TAG).tar.gz vmagent-prod vmalert-prod vmauth-prod vmbackup-prod vmrestore-prod && \
sha256sum vmutils-$(PKG_TAG).tar.gz > vmutils-$(PKG_TAG)_checksums.txt
release-vmutils-amd64 \
release-vmutils-arm64
release-vmutils-amd64:
GOARCH=amd64 $(MAKE) release-vmutils-generic
release-vmutils-arm64:
GOARCH=arm64 $(MAKE) release-vmutils-generic
release-vmutils-generic: \
vmagent-$(GOARCH)-prod \
vmalert-$(GOARCH)-prod \
vmauth-$(GOARCH)-prod \
vmbackup-$(GOARCH)-prod \
vmrestore-$(GOARCH)-prod \
vmctl-$(GOARCH)-prod
cd bin && \
tar --transform="flags=r;s|-$(GOARCH)||" -czf vmutils-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOARCH)-prod \
vmalert-$(GOARCH)-prod \
vmauth-$(GOARCH)-prod \
vmbackup-$(GOARCH)-prod \
vmrestore-$(GOARCH)-prod \
vmctl-$(GOARCH)-prod \
&& sha256sum vmutils-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOARCH)-prod \
vmalert-$(GOARCH)-prod \
vmauth-$(GOARCH)-prod \
vmbackup-$(GOARCH)-prod \
vmrestore-$(GOARCH)-prod \
vmctl-$(GOARCH)-prod \
| sed s/-$(GOARCH)// > vmutils-$(GOARCH)-$(PKG_TAG)_checksums.txt
pprof-cpu:
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
@@ -80,7 +137,7 @@ lint: install-golint
golint app/...
install-golint:
which golint || GO111MODULE=off go get -u golang.org/x/lint/golint
which golint || go install golang.org/x/lint/golint
errcheck: install-errcheck
errcheck -exclude=errcheck_excludes.txt ./lib/...
@@ -92,9 +149,10 @@ errcheck: install-errcheck
errcheck -exclude=errcheck_excludes.txt ./app/vmauth/...
errcheck -exclude=errcheck_excludes.txt ./app/vmbackup/...
errcheck -exclude=errcheck_excludes.txt ./app/vmrestore/...
errcheck -exclude=errcheck_excludes.txt ./app/vmctl/...
install-errcheck:
which errcheck || GO111MODULE=off go get -u github.com/kisielk/errcheck
which errcheck || go install github.com/kisielk/errcheck
check-all: fmt vet lint errcheck golangci-lint
@@ -122,8 +180,8 @@ benchmark-pure:
GO111MODULE=on CGO_ENABLED=0 go test -mod=vendor -bench=. ./app/...
vendor-update:
GO111MODULE=on go get -u ./lib/...
GO111MODULE=on go get -u ./app/...
GO111MODULE=on go get -u -d ./lib/...
GO111MODULE=on go get -u -d ./app/...
GO111MODULE=on go mod tidy
GO111MODULE=on go mod vendor
@@ -133,18 +191,21 @@ app-local:
app-local-pure:
CGO_ENABLED=0 GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-pure$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
app-local-with-goarch:
GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-$(GOARCH)$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
quicktemplate-gen: install-qtc
qtc
install-qtc:
which qtc || GO111MODULE=off go get -u github.com/valyala/quicktemplate/qtc
which qtc || go install github.com/valyala/quicktemplate/qtc
golangci-lint: install-golangci-lint
golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m
install-golangci-lint:
which golangci-lint || GO111MODULE=off go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.29.0
docs-sync:
cp app/vmagent/README.md docs/vmagent.md
@@ -152,4 +213,5 @@ docs-sync:
cp app/vmauth/README.md docs/vmauth.md
cp app/vmbackup/README.md docs/vmbackup.md
cp app/vmrestore/README.md docs/vmrestore.md
cp app/vmctl/README.md docs/vmctl.md
cp README.md docs/Single-server-VictoriaMetrics.md

1087
README.md

File diff suppressed because it is too large Load Diff

View File

@@ -108,3 +108,10 @@ victoria-metrics-package-deb-rpm-all: \
victoria-metrics-package-deb-arm64 \
victoria-metrics-package-rpm \
victoria-metrics-package-rpm-arm64
### Packaging as snap
victoria-metrics-package-snap:
which snapcraft || snap install snapcraft
which multipass || snap install multipass
snapcraft

View File

@@ -2,19 +2,25 @@ package main
import (
"flag"
"fmt"
"io"
"net/http"
"os"
"path"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)
@@ -23,18 +29,33 @@ var (
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Remove superflouos samples from time series if they are located closer to each other than this duration. "+
"This may be useful for reducing overhead when multiple identically configured Prometheus instances write data to the same VictoriaMetrics. "+
"Deduplication is disabled if the -dedup.minScrapeInterval is 0")
dryRun = flag.Bool("dryRun", false, "Whether to check only -promscrape.config and then exit. "+
"Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse")
)
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
buildinfo.Init()
logger.Init()
if promscrape.IsDryRun() {
*dryRun = true
}
if *dryRun {
if err := promscrape.CheckConfig(); err != nil {
logger.Fatalf("error when checking -promscrape.config: %s", err)
}
logger.Infof("-promscrape.config is ok; exitting with 0 status code")
return
}
logger.Infof("starting VictoriaMetrics at %q...", *httpListenAddr)
startTime := time.Now()
storage.SetMinScrapeIntervalForDeduplication(*minScrapeInterval)
vmstorage.Init()
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
vmselect.Init()
vminsert.Init()
startSelfScraper()
@@ -64,6 +85,18 @@ func main() {
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
if r.URL.Path == "/" {
fmt.Fprintf(w, "<h2>Single-node VictoriaMetrics.</h2></br>")
fmt.Fprintf(w, "See docs at <a href='https://victoriametrics.github.io/'>https://victoriametrics.github.io/</a></br>")
fmt.Fprintf(w, "Useful endpoints: </br>")
writeAPIHelp(w, [][]string{
{"/targets", "discovered targets list"},
{"/api/v1/targets", "advanced information about discovered targets in JSON format"},
{"/metrics", "available service metrics"},
{"/api/v1/status/tsdb", "tsdb status page"},
})
return true
}
if vminsert.RequestHandler(w, r) {
return true
}
@@ -75,3 +108,21 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
}
return false
}
// writeAPIHelp writes one HTML link with a short description to w for every
// entry in pathList.
//
// Each entry must contain at least two elements: the endpoint path followed
// by its human-readable description. The endpoint path is joined with the
// prefix returned by httpserver.GetPathPrefix before it is rendered, and the
// resulting path is used both as the link target and as the link text.
func writeAPIHelp(w io.Writer, pathList [][]string) {
	prefix := httpserver.GetPathPrefix()
	for i := range pathList {
		endpoint, description := pathList[i][0], pathList[i][1]
		href := path.Join(prefix, endpoint)
		fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", href, href, description)
	}
}
// usage is installed as flag.Usage by main; it hands a short description of
// the victoria-metrics binary to flagutil.Usage.
func usage() {
	const description = `
victoria-metrics is a time series database and monitoring solution.
See the docs at https://victoriametrics.github.io/
`
	flagutil.Usage(description)
}

View File

@@ -20,6 +20,7 @@ import (
testutil "github.com/VictoriaMetrics/VictoriaMetrics/app/victoria-metrics/test"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
@@ -57,6 +58,7 @@ var (
type test struct {
Name string `json:"name"`
Data []string `json:"data"`
InsertQuery string `json:"insert_query"`
Query []string `json:"query"`
ResultMetrics []Metric `json:"result_metrics"`
ResultSeries Series `json:"result_series"`
@@ -129,7 +131,7 @@ func setUp() {
storagePath = filepath.Join(os.TempDir(), testStorageSuffix)
processFlags()
logger.Init()
vmstorage.InitWithoutMetrics()
vmstorage.InitWithoutMetrics(promql.ResetRollupResultCacheIfNeeded)
vmselect.Init()
vminsert.Init()
go httpserver.Serve(*httpListenAddr, requestHandler)
@@ -192,7 +194,7 @@ func TestWriteRead(t *testing.T) {
time.Sleep(1 * time.Second)
vmstorage.Stop()
// open storage after stop in write
vmstorage.InitWithoutMetrics()
vmstorage.InitWithoutMetrics(promql.ResetRollupResultCacheIfNeeded)
t.Run("read", testRead)
}
@@ -208,7 +210,7 @@ func testWrite(t *testing.T) {
t.Errorf("error compressing %v %s", r, err)
t.Fail()
}
httpWrite(t, testPromWriteHTTPPath, bytes.NewBuffer(data))
httpWrite(t, testPromWriteHTTPPath, test.InsertQuery, bytes.NewBuffer(data))
}
})
@@ -217,7 +219,7 @@ func testWrite(t *testing.T) {
test := x
t.Run(test.Name, func(t *testing.T) {
t.Parallel()
httpWrite(t, testWriteHTTPPath, bytes.NewBufferString(strings.Join(test.Data, "\n")))
httpWrite(t, testWriteHTTPPath, test.InsertQuery, bytes.NewBufferString(strings.Join(test.Data, "\n")))
})
}
})
@@ -245,7 +247,7 @@ func testWrite(t *testing.T) {
t.Run(test.Name, func(t *testing.T) {
t.Parallel()
logger.Infof("writing %s", test.Data)
httpWrite(t, testOpenTSDBWriteHTTPPath, bytes.NewBufferString(strings.Join(test.Data, "\n")))
httpWrite(t, testOpenTSDBWriteHTTPPath, test.InsertQuery, bytes.NewBufferString(strings.Join(test.Data, "\n")))
})
}
})
@@ -323,10 +325,10 @@ func readIn(readFor string, t *testing.T, insertTime time.Time) []test {
return tt
}
func httpWrite(t *testing.T, address string, r io.Reader) {
func httpWrite(t *testing.T, address, query string, r io.Reader) {
t.Helper()
s := newSuite(t)
resp, err := http.Post(address, "", r)
resp, err := http.Post(address+query, "", r)
s.noError(err)
s.noError(resp.Body.Close())
s.equalInt(resp.StatusCode, 204)
@@ -373,7 +375,7 @@ func checkMetricsResult(got, want []Metric) error {
want = removeIfFoundMetrics(r, want)
}
if len(want) > 0 {
return fmt.Errorf("exptected metrics %+v not found in %+v", want, got)
return fmt.Errorf("expected metrics %+v not found in %+v", want, got)
}
return nil
}

View File

@@ -85,7 +85,6 @@ func selfScraper(scrapeInterval time.Duration) {
mr.Timestamp = currentTimestamp
mr.Value = r.Value
}
logger.Infof("writing %d rows at timestamp %d", len(mrs), currentTimestamp)
vmstorage.AddRows(mrs)
}
}

View File

@@ -11,6 +11,6 @@
"status":"success",
"data":{"resultType":"matrix",
"result":[
{"metric":{"item":"y"},"values":[["{TIME_S-1m}","0.5"],["{TIME_S}","0.5"]]}
{"metric":{"item":"y"},"values":[["{TIME_S-1m}","0.5"]]}
]}}
}

View File

@@ -5,12 +5,12 @@
"empty_label_match 1 {TIME_S-1m}",
"empty_label_match;foo=bar 2 {TIME_S-1m}",
"empty_label_match;foo=baz 3 {TIME_S-1m}"],
"query": ["/api/v1/query_range?query=empty_label_match{foo=~'bar|'}&start={TIME_S}&end={TIME_S}&step=60"],
"query": ["/api/v1/query_range?query=empty_label_match{foo=~'bar|'}&start={TIME_S-1m}&end={TIME_S}&step=60"],
"result_query_range": {
"status":"success",
"data":{"resultType":"matrix",
"result":[
{"metric":{"__name__":"empty_label_match"},"values":[["{TIME_S}","1"]]},
{"metric":{"__name__":"empty_label_match","foo":"bar"},"values":[["{TIME_S}","2"]]}
{"metric":{"__name__":"empty_label_match"},"values":[["{TIME_S-1m}","1"],["{TIME_S}","1"]]},
{"metric":{"__name__":"empty_label_match","foo":"bar"},"values":[["{TIME_S-1m}","2"],["{TIME_S}","2"]]}
]}}
}

View File

@@ -16,6 +16,8 @@
["{TIME_S-120s}","3"],
["{TIME_S-60s}","2"],
["{TIME_S-30s}","1"],
["{TIME_S-20s}","1"]
["{TIME_S-20s}","1"],
["{TIME_S-10s}","1"],
["{TIME_S-0s}","1"]
]}]}}
}

View File

@@ -13,6 +13,6 @@
"data":{"resultType":"matrix",
"result":[
{"metric":{"__name__":"not_nan_as_missing_data","item":"x"},"values":[["{TIME_S-2m}","2"]]},
{"metric":{"__name__":"not_nan_as_missing_data","item":"y"},"values":[["{TIME_S-2m}","4"],["{TIME_S-1m}","3"],["{TIME_S}","3"]]}
{"metric":{"__name__":"not_nan_as_missing_data","item":"y"},"values":[["{TIME_S-2m}","4"],["{TIME_S-1m}","3"]]}
]}}
}

View File

@@ -9,6 +9,6 @@
"query": ["/api/v1/query?query=min%20by%20(item)%20(min_over_time(forms_daily_count[10m:1m]))&time={TIME_S-1m}"],
"result_query": {
"status":"success",
"data":{"resultType":"vector","result":[{"metric":{"item":"x"},"value":["{TIME_S-1m}","1"]},{"metric":{"item":"y"},"value":["{TIME_S-1m}","3"]}]}
"data":{"resultType":"vector","result":[{"metric":{"item":"x"},"value":["{TIME_S-1m}","2"]},{"metric":{"item":"y"},"value":["{TIME_S-1m}","4"]}]}
}
}

View File

@@ -0,0 +1,10 @@
{
"name": "insert_with_extra_labels",
"data": ["measurement,tag1=value1,tag2=value2 field6=1.23,field5=123 {TIME_NS}"],
"insert_query": "?extra_label=job=test&extra_label=tag2=value10",
"query": ["/api/v1/export?match={__name__!=''}"],
"result_metrics": [
{"metric":{"__name__":"measurement_field5","tag1":"value1","job": "test","tag2":"value10"},"values":[123], "timestamps": ["{TIME_MS}"]},
{"metric":{"__name__":"measurement_field6","tag1":"value1","job": "test","tag2":"value10"},"values":[1.23], "timestamps": ["{TIME_MS}"]}
]
}

View File

@@ -0,0 +1,9 @@
{
"name": "insert_with_extra_labels",
"data": ["{\"metric\": \"opentsdbhttp.foobar\", \"value\": 1001, \"timestamp\": {TIME_S}, \"tags\": {\"bar\":\"baz\", \"x\": \"y\"}}"],
"insert_query": "?extra_label=job=open-test&extra_label=x=z",
"query": ["/api/v1/export?match={__name__!=''}"],
"result_metrics": [
{"metric":{"__name__":"opentsdbhttp.foobar","bar":"baz","x":"z","job": "open-test"},"values":[1001], "timestamps": ["{TIME_MSZ}"]}
]
}

View File

@@ -0,0 +1,9 @@
{
"name": "basic_insertion_with_extra_labels",
"insert_query": "?extra_label=job=prom-test&extra_label=baz=bar",
"data": ["[{\"labels\":[{\"name\":\"__name__\",\"value\":\"prometheus.foobar\"},{\"name\":\"baz\",\"value\":\"qux\"}],\"samples\":[{\"value\":100000,\"timestamp\":\"{TIME_MS}\"}]}]"],
"query": ["/api/v1/export?match={__name__!=''}"],
"result_metrics": [
{"metric":{"__name__":"prometheus.foobar","baz":"bar","job": "prom-test"},"values":[100000], "timestamps": ["{TIME_MS}"]}
]
}

View File

@@ -59,19 +59,22 @@ run-vmagent:
$(MAKE) run-via-docker
vmagent-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-amd64 ./app/vmagent
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmagent-local-with-goarch
vmagent-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-arm ./app/vmagent
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmagent-local-with-goarch
vmagent-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-arm64 ./app/vmagent
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmagent-local-with-goarch
vmagent-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-ppc64le ./app/vmagent
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmagent-local-with-goarch
vmagent-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-386 ./app/vmagent
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmagent-local-with-goarch
vmagent-local-with-goarch:
APP_NAME=vmagent $(MAKE) app-local-with-goarch
vmagent-pure:
APP_NAME=vmagent $(MAKE) app-local-pure

View File

@@ -7,7 +7,7 @@ or any other Prometheus-compatible storage system that supports the `remote_writ
<img alt="vmagent" src="vmagent.png">
### Motivation
## Motivation
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
@@ -15,18 +15,20 @@ Also, we found that users infrastructure are snowflakes - no two are alike, a
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
### Features
## Features
* Can be used as drop-in replacement for Prometheus for scraping targets such as [node_exporter](https://github.com/prometheus/node_exporter).
See [Quick Start](#quick-start) for details.
* Can add, remove and modify labels (aka tags) via Prometheus relabeling. Can filter data before sending it to remote storage. See [these docs](#relabeling) for details.
* Accepts data via all the ingestion protocols supported by VictoriaMetrics:
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-opentsdb-compatible-agents).
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-opentsdb-compatible-agents).
* Prometheus remote write protocol via `http://<vmagent>:8429/api/v1/write`.
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data).
* Arbitrary CSV data via `http://<vmagent>:8429/api/v1/import/csv`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-csv-data).
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-json-line-format).
* Native data import protocol via `http://<vmagent>:8429/api/v1/import/native`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-native-format).
* Data in Prometheus exposition format. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-prometheus-exposition-format) for details.
* Arbitrary CSV data via `http://<vmagent>:8429/api/v1/import/csv`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-csv-data).
* Can replicate collected metrics simultaneously to multiple remote storage systems.
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
@@ -34,13 +36,13 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
### Quick Start
## Quick Start
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary amount of remote storage systems.
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary number of remote storage systems.
Example command line:
@@ -54,17 +56,33 @@ If you only need to collect Influx data, then the following is sufficient:
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags).
Pass `-help` to `vmagent` in order to see the full list of supported command-line flags with their descriptions.
### Use cases
## Configuration update
`vmagent` should be restarted in order to update config options set via command-line args.
`vmagent` supports multiple approaches for reloading configs from updated config files such as `-promscrape.config`, `-remoteWrite.relabelConfig` and `-remoteWrite.urlRelabelConfig`:
* Sending `SIGHUP` signal to `vmagent` process:
```bash
kill -SIGHUP `pidof vmagent`
```
* Sending HTTP request to `http://vmagent:8429/-/reload` endpoint.
There is also `-promscrape.configCheckInterval` command-line option, which can be used for automatic reloading configs from updated `-promscrape.config` file.
#### IoT and Edge monitoring
## Use cases
### IoT and Edge monitoring
`vmagent` can run and collect metrics in IoT and industrial networks with unreliable or scheduled connections to the remote storage.
It buffers the collected data in local files until the connection to remote storage becomes available and then sends the buffered
@@ -75,14 +93,14 @@ The maximum buffer size can be limited with `-remoteWrite.maxDiskUsagePerURL`.
See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/Makefile) for details.
#### Drop-in replacement for Prometheus
### Drop-in replacement for Prometheus
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
#### Replication and high availability
### Replication and high availability
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
If a single remote storage instance is temporarily out of service, then the collected data remains available in the other remote storage instances.
@@ -90,14 +108,14 @@ If a single remote storage instance temporarily is out of service, then the coll
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
#### Relabeling and filtering
### Relabeling and filtering
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
See [these docs](#relabeling) for details.
#### Splitting data streams among multiple systems
### Splitting data streams among multiple systems
`vmagent` supports splitting the collected data between multiple destinations with the help of `-remoteWrite.urlRelabelConfig`,
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
@@ -105,7 +123,7 @@ data among long-term remote storage, short-term remote storage and real-time ana
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
#### Prometheus remote_write proxy
### Prometheus remote_write proxy
`vmagent` may be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via `remote_write` API
at `/api/v1/write` endpoint, apply relabeling and filtering and then proxy it to other `remote_write` systems.
@@ -113,8 +131,12 @@ The `vmagent` can be configured to encrypt the incoming `remote_write` requests
Additionally, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
### remote_write for clustered version
### How to collect metrics in Prometheus format
Although `vmagent` can accept data in several supported protocols (OpenTSDB, Influx, Prometheus, Graphite) and scrape data from various targets, writes are always performed in Prometheus remote_write protocol. Therefore, for the clustered version, the `-remoteWrite.url` command-line flag should be configured as `<schema>://<vminsert-host>:8480/insert/<customer-id>/prometheus/api/v1/write`
## How to collect metrics in Prometheus format
Pass the path to `prometheus.yml` to `-promscrape.config` command-line flag. `vmagent` takes into account the following
sections from [Prometheus config file](https://prometheus.io/docs/prometheus/latest/configuration/configuration/):
@@ -134,7 +156,7 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
See [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config) for details.
* `ec2_sd_configs` - for scraping targets in Amazon EC2.
See [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) for details.
`vmagent` doesn't support `role_arn` config param yet.
`vmagent` doesn't support `profile` config param and aws credentials file yet.
* `gce_sd_configs` - for scraping targets in Google Compute Engine (GCE).
See [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) for details.
`vmagent` provides the following additional functionality for `gce_sd_config`:
@@ -146,16 +168,31 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
See [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config) for details.
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
* `openstack_sd_configs` - for scraping OpenStack targets.
See [openstack_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config) for details.
[OpenStack identity API v3](https://docs.openstack.org/api-ref/identity/v3/) is supported only.
* `dockerswarm_sd_configs` - for scraping Docker Swarm targets.
See [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config) for details.
* `eureka_sd_configs` - for scraping targets registered in [Netflix Eureka](https://github.com/Netflix/eureka).
See [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config) for details.
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
`vmagent` also supports the following additional options in `scrape_config` section:
* `disable_compression: true` - for disabling response compression on a per-job basis. By default `vmagent` requests compressed responses from scrape targets
in order to save network bandwidth.
* `disable_keepalive: true` - for disabling [HTTP keep-alive connections](https://en.wikipedia.org/wiki/HTTP_persistent_connection) on a per-job basis.
By default `vmagent` uses keep-alive connections to scrape targets in order to reduce overhead on connection re-establishing.
Note that `vmagent` doesn't support `refresh_interval` option for these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
The file pointed by `-promscrape.config` may contain `%{ENV_VAR}` placeholders, which are substituted by the corresponding `ENV_VAR` environment variable values.
### Adding labels to metrics
## Adding labels to metrics
Labels can be added to metrics via the following mechanisms:
@@ -163,7 +200,7 @@ Labels can be added to metrics via the following mechanisms:
* Via `-remoteWrite.label` command-line flag. These labels are added to all the collected metrics before sending them to `-remoteWrite.url`.
### Relabeling
## Relabeling
`vmagent` supports [Prometheus relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config).
Additionally it provides the following extra actions:
@@ -182,6 +219,7 @@ The relabeling can be defined in the following places:
Read more about relabeling in the following articles:
* [How to use Relabeling in Prometheus and VictoriaMetrics](https://valyala.medium.com/how-to-use-relabeling-in-prometheus-and-victoriametrics-8b90fc22c4b2)
* [Life of a label](https://www.robustperception.io/life-of-a-label)
* [Discarding targets and timeseries with relabeling](https://www.robustperception.io/relabelling-can-discard-targets-timeseries-and-alerts)
* [Dropping labels at scrape time](https://www.robustperception.io/dropping-metrics-at-scrape-time-with-prometheus)
@@ -189,70 +227,149 @@ Read more about relabeling in the following articles:
* [relabel_configs vs metric_relabel_configs](https://www.robustperception.io/relabel_configs-vs-metric_relabel_configs)
### Monitoring
## Monitoring
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. It is recommended setting up regular scraping of this page
either via `vmagent` itself or via Prometheus, so the exported metrics could be analyzed later.
Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/12683) for `vmagent` state overview.
If you have suggestions, improvements or found a bug - feel free to open an issue on GitHub or add a review to the dashboard.
`vmagent` also exports target statuses at `http://vmagent-host:8429/targets` page in plaintext format.
`vmagent` also exports target statuses at the following handlers:
* `http://vmagent-host:8429/targets`. This handler returns human-readable plaintext status for every active target.
This page is convenient to query from command line with `wget`, `curl` or similar tools.
It accepts optional `show_original_labels=1` query arg, which shows the original labels per each target before applying relabeling.
This information may be useful for debugging target relabeling.
* `http://vmagent-host:8429/api/v1/targets`. This handler returns data compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
* `http://vmagent-host:8429/ready`. This handler returns http 200 status code when `vmagent` finishes initialization for all service_discovery configs.
It may be useful for performing `vmagent` rolling update without scrape loss.
### Troubleshooting
## Troubleshooting
* It is recommended [setting up the official Grafana dashboard](#monitoring) in order to monitor `vmagent` state.
* It is recommended increasing the maximum number of open files in the system (`ulimit -n`) when scraping a big number of targets,
since `vmagent` establishes at least a single TCP connection per each target.
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`.
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`
and `http://vmagent-host:8429/api/v1/targets`.
* It is recommended to increase `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
and `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
* The `/api/v1/targets` page could be useful for debugging relabeling process for scrape targets.
This page contains original labels for targets dropped during relabeling (see "droppedTargets" section in the page output). By default up to `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling, then increase `-promscrape.maxDroppedTargets` command-line flag value in order to see all the dropped targets. Note that tracking each dropped target requires up to 10Kb of RAM, so big values for `-promscrape.maxDroppedTargets` may result in increased memory usage if big number of scrape targets are dropped during relabeling.
* If `vmagent` scrapes big number of targets, then `-promscrape.dropOriginalLabels` command-line option may be passed to `vmagent` in order to reduce memory usage.
This option drops `"discoveredLabels"` and `"droppedTargets"` lists at `/api/v1/targets` page, which may result in reduced debuggability for improperly configured per-target relabeling.
* If `vmagent` scrapes targets with millions of metrics per each target (for instance, when scraping [federation endpoints](https://prometheus.io/docs/prometheus/latest/federation/)),
then it is recommended enabling `stream parsing mode` in order to reduce memory usage during scraping. This mode may be enabled either globally for all the scrape targets
by passing `-promscrape.streamParse` command-line flag or on a per-scrape target basis with `stream_parse: true` option. For example:
```yml
scrape_configs:
- job_name: 'big-federate'
stream_parse: true
static_configs:
- targets:
- big-prometeus1
- big-prometeus2
honor_labels: true
metrics_path: /federate
params:
'match[]': ['{__name__!=""}']
```
Note that `sample_limit` option doesn't work if stream parsing is enabled, since the parsed data is pushed to remote storage as soon as it is parsed. So `sample_limit` option
makes no sense during stream parsing.
* It is recommended to increase `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
* If you see gaps on the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, then try increasing `-remoteWrite.queues`.
Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage, so it starts dropping the buffered data
if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`.
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen multiple ports.
Just add the following relabeling rule to `relabel_configs` section in order to filter out targets with unneeded ports:
* By default `vmagent` masks `-remoteWrite.url` with `secret-url` values in logs and at `/metrics` page because
the url may contain sensitive information such as auth tokens or passwords.
Pass `-remoteWrite.showURL` command-line flag when starting `vmagent` in order to see all the valid urls.
```yml
- action: keep_if_equal
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
```
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen on multiple ports
or they use an init container. These errors can be either fixed or suppressed with `-promscrape.suppressDuplicateScrapeTargetErrors` command-line flag.
See available options below if you prefer fixing the root cause of the error:
The following `relabel_configs` section may help determining `__meta_*` labels resulting in duplicate targets:
```yml
- action: labelmap
regex: __meta_(.*)
```
The following relabeling rule may be added to `relabel_configs` section in order to filter out pods with unneeded ports:
```yml
- action: keep_if_equal
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
```
The following relabeling rule may be added to `relabel_configs` section in order to filter out init container pods:
```yml
- action: drop
source_labels: [__meta_kubernetes_pod_container_init]
regex: true
```
### How to build from sources
## How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in `vmutils-*` archives there.
#### Development build
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmagent` from the root folder of the repository.
It builds `vmagent` binary and puts it into the `bin` folder.
#### Production build
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmagent-prod` from the root folder of the repository.
It builds `vmagent-prod` binary and puts it into the `bin` folder.
#### Building docker images
### Building docker images
Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmagent
ROOT_IMAGE=scratch make package-vmagent
```
### ARM build
### Profiling
ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://blog.cloudflare.com/arm-takes-wing/).
### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmagent-arm` or `make vmagent-arm64` from the root folder of the repository.
It builds `vmagent-arm` or `vmagent-arm64` binary respectively and puts it into the `bin` folder.
### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmagent-arm-prod` or `make vmagent-arm64-prod` from the root folder of the repository.
It builds `vmagent-arm-prod` or `vmagent-arm64-prod` binary respectively and puts it into the `bin` folder.
## Profiling
`vmagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):

View File

@@ -1,10 +1,11 @@
package common
import (
"runtime"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
)
// PushCtx is a context used for populating WriteRequest.
@@ -28,12 +29,7 @@ func (ctx *PushCtx) Reset() {
}
ctx.WriteRequest.Timeseries = ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels
for i := range labels {
label := &labels[i]
label.Name = ""
label.Value = ""
}
promrelabel.CleanLabels(ctx.Labels)
ctx.Labels = ctx.Labels[:0]
ctx.Samples = ctx.Samples[:0]
@@ -67,4 +63,4 @@ func PutPushCtx(ctx *PushCtx) {
}
var pushCtxPool sync.Pool
var pushCtxPoolCh = make(chan *PushCtx, runtime.GOMAXPROCS(-1))
var pushCtxPoolCh = make(chan *PushCtx, cgroup.AvailableCPUs())

View File

@@ -6,6 +6,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
@@ -18,12 +19,18 @@ var (
// InsertHandler processes csv data from req.
func InsertHandler(req *http.Request) error {
extraLabels, err := parserCommon.GetExtraLabels(req)
if err != nil {
return err
}
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
return parser.ParseStream(req, func(rows []parser.Row) error {
return insertRows(rows, extraLabels)
})
})
}
func insertRows(rows []parser.Row) error {
func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
@@ -44,6 +51,7 @@ func insertRows(rows []parser.Row) error {
Value: tag.Value,
})
}
labels = append(labels, extraLabels...)
samples = append(samples, prompbmarshal.Sample{
Value: r.Value,
Timestamp: r.Timestamp,

View File

@@ -4,13 +4,15 @@ import (
"flag"
"io"
"net/http"
"runtime"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
@@ -19,6 +21,7 @@ import (
// Command-line flags controlling how metric names are built from Influx line protocol rows.
//
// The help strings are shown to users in the flag usage output, so they must stay typo-free.
var (
	measurementFieldSeparator = flag.String("influxMeasurementFieldSeparator", "_", "Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol")
	skipSingleField           = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if Influx line contains only a single field")
	skipMeasurement           = flag.Bool("influxSkipMeasurement", false, "Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'")
)
var (
@@ -31,7 +34,9 @@ var (
// See https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener/
func InsertHandlerForReader(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, false, "", "", insertRows)
return parser.ParseStream(r, false, "", "", func(db string, rows []parser.Row) error {
return insertRows(db, rows, nil)
})
})
}
@@ -39,17 +44,23 @@ func InsertHandlerForReader(r io.Reader) error {
//
// See https://github.com/influxdata/influxdb/blob/4cbdc197b8117fee648d62e2e5be75c6575352f0/tsdb/README.md
func InsertHandlerForHTTP(req *http.Request) error {
extraLabels, err := parserCommon.GetExtraLabels(req)
if err != nil {
return err
}
return writeconcurrencylimiter.Do(func() error {
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
q := req.URL.Query()
precision := q.Get("precision")
// Read db tag from https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint
db := q.Get("db")
return parser.ParseStream(req.Body, isGzipped, precision, db, insertRows)
return parser.ParseStream(req.Body, isGzipped, precision, db, func(db string, rows []parser.Row) error {
return insertRows(db, rows, extraLabels)
})
})
}
func insertRows(db string, rows []parser.Row) error {
func insertRows(db string, rows []parser.Row, extraLabels []prompbmarshal.Label) error {
ctx := getPushCtx()
defer putPushCtx(ctx)
@@ -61,25 +72,30 @@ func insertRows(db string, rows []parser.Row) error {
buf := ctx.buf[:0]
for i := range rows {
r := &rows[i]
rowsTotal += len(r.Fields)
commonLabels = commonLabels[:0]
hasDBLabel := false
hasDBKey := false
for j := range r.Tags {
tag := &r.Tags[j]
if tag.Key == "db" {
hasDBLabel = true
hasDBKey = true
}
commonLabels = append(commonLabels, prompbmarshal.Label{
Name: tag.Key,
Value: tag.Value,
})
}
if len(db) > 0 && !hasDBLabel {
if len(db) > 0 && !hasDBKey {
commonLabels = append(commonLabels, prompbmarshal.Label{
Name: "db",
Value: db,
})
}
ctx.metricGroupBuf = append(ctx.metricGroupBuf[:0], r.Measurement...)
commonLabels = append(commonLabels, extraLabels...)
ctx.metricGroupBuf = ctx.metricGroupBuf[:0]
if !*skipMeasurement {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, r.Measurement...)
}
skipFieldKey := len(r.Fields) == 1 && *skipSingleField
if len(ctx.metricGroupBuf) > 0 && !skipFieldKey {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, *measurementFieldSeparator...)
@@ -107,7 +123,6 @@ func insertRows(db string, rows []parser.Row) error {
Samples: samples[len(samples)-1:],
})
}
rowsTotal += len(r.Fields)
}
ctx.buf = buf
ctx.ctx.WriteRequest.Timeseries = tssDst
@@ -131,12 +146,8 @@ type pushCtx struct {
func (ctx *pushCtx) reset() {
ctx.ctx.Reset()
commonLabels := ctx.commonLabels
for i := range commonLabels {
label := &commonLabels[i]
label.Name = ""
label.Value = ""
}
promrelabel.CleanLabels(ctx.commonLabels)
ctx.commonLabels = ctx.commonLabels[:0]
ctx.metricGroupBuf = ctx.metricGroupBuf[:0]
ctx.buf = ctx.buf[:0]
@@ -164,4 +175,4 @@ func putPushCtx(ctx *pushCtx) {
}
var pushCtxPool sync.Pool
var pushCtxPoolCh = make(chan *pushCtx, runtime.GOMAXPROCS(-1))
var pushCtxPoolCh = make(chan *pushCtx, cgroup.AvailableCPUs())

View File

@@ -6,18 +6,22 @@ import (
"net/http"
"os"
"strings"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/native"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/prometheusimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
@@ -26,6 +30,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
@@ -34,14 +39,16 @@ var (
httpListenAddr = flag.String("httpListenAddr", ":8429", "TCP address to listen for http connections. "+
"Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. "+
"Note that /targets and /metrics pages aren't available if -httpListenAddr=''")
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty")
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. "+
"This flag isn't needed when ingesting data over HTTP - just send it to `http://<vmagent>:8429/write`")
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpentTSDB metrics. "+
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
"Usually :4242 must be set. Doesn't work if empty")
opentsdbHTTPListenAddr = flag.String("opentsdbHTTPListenAddr", "", "TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmagent. The following files are checked: "+
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . See also -promscrape.config.dryRun")
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . "+
"Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse")
)
var (
@@ -56,18 +63,23 @@ func main() {
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
remotewrite.InitSecretFlags()
buildinfo.Init()
logger.Init()
if *dryRun {
if err := flag.Set("promscrape.config.strictParse", "true"); err != nil {
logger.Panicf("BUG: cannot set promscrape.config.strictParse=true: %s", err)
if promscrape.IsDryRun() {
if err := promscrape.CheckConfig(); err != nil {
logger.Fatalf("error when checking -promscrape.config: %s", err)
}
logger.Infof("-promscrape.config is ok; exitting with 0 status code")
return
}
if *dryRun {
if err := remotewrite.CheckRelabelConfigs(); err != nil {
logger.Fatalf("error when checking relabel configs: %s", err)
}
if err := promscrape.CheckConfig(); err != nil {
logger.Fatalf("error when checking Prometheus config: %s", err)
logger.Fatalf("error when checking -promscrape.config: %s", err)
}
logger.Infof("all the configs are ok; exitting with 0 status code")
return
@@ -76,6 +88,7 @@ func main() {
logger.Infof("starting vmagent at %q...", *httpListenAddr)
startTime := time.Now()
remotewrite.Init()
common.StartUnmarshalWorkers()
writeconcurrencylimiter.Init()
if len(*influxListenAddr) > 0 {
influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
@@ -123,19 +136,24 @@ func main() {
if len(*opentsdbHTTPListenAddr) > 0 {
opentsdbhttpServer.MustStop()
}
common.StopUnmarshalWorkers()
remotewrite.Stop()
logger.Infof("successfully stopped vmagent in %.3f seconds", time.Since(startTime).Seconds())
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
if r.URL.Path == "/" {
fmt.Fprintf(w, "vmagent - see docs at https://victoriametrics.github.io/vmagent.html")
return true
}
path := strings.Replace(r.URL.Path, "//", "/", -1)
switch path {
case "/api/v1/write":
prometheusWriteRequests.Inc()
if err := promremotewrite.InsertHandler(r); err != nil {
prometheusWriteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -144,7 +162,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
vmimportRequests.Inc()
if err := vmimport.InsertHandler(r); err != nil {
vmimportErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -153,7 +171,25 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
csvimportRequests.Inc()
if err := csvimport.InsertHandler(r); err != nil {
csvimportErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import/prometheus":
prometheusimportRequests.Inc()
if err := prometheusimport.InsertHandler(r); err != nil {
prometheusimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import/native":
nativeimportRequests.Inc()
if err := native.InsertHandler(r); err != nil {
nativeimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -162,7 +198,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
influxWriteRequests.Inc()
if err := influx.InsertHandlerForHTTP(r); err != nil {
influxWriteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
@@ -175,14 +211,29 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
return true
case "/targets":
promscrapeTargetsRequests.Inc()
w.Header().Set("Content-Type", "text/plain")
promscrape.WriteHumanReadableTargetsStatus(w)
promscrape.WriteHumanReadableTargetsStatus(w, r)
return true
case "/api/v1/targets":
promscrapeAPIV1TargetsRequests.Inc()
w.Header().Set("Content-Type", "application/json; charset=utf-8")
state := r.FormValue("state")
promscrape.WriteAPIV1Targets(w, state)
return true
case "/-/reload":
promscrapeConfigReloadRequests.Inc()
procutil.SelfSIGHUP()
w.WriteHeader(http.StatusOK)
return true
case "/ready":
if rdy := atomic.LoadInt32(&promscrape.PendingScrapeConfigs); rdy > 0 {
errMsg := fmt.Sprintf("waiting for scrapes to init, left: %d", rdy)
http.Error(w, errMsg, http.StatusTooEarly)
} else {
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
w.WriteHeader(http.StatusOK)
w.Write([]byte("OK"))
}
return true
}
return false
}
@@ -197,12 +248,19 @@ var (
csvimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import/csv", protocol="csvimport"}`)
csvimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import/csv", protocol="csvimport"}`)
prometheusimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import/prometheus", protocol="prometheusimport"}`)
prometheusimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import/prometheus", protocol="prometheusimport"}`)
nativeimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import/native", protocol="nativeimport"}`)
nativeimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import/native", protocol="nativeimport"}`)
influxWriteRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/write", protocol="influx"}`)
influxWriteErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/write", protocol="influx"}`)
influxQueryRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/query", protocol="influx"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
promscrapeAPIV1TargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/targets"}`)
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
)
@@ -211,10 +269,7 @@ func usage() {
const s = `
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md .
See the docs at https://victoriametrics.github.io/vmagent.html .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
flagutil.Usage(s)
}

View File

@@ -0,0 +1,85 @@
package native
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/native"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// rowsInserted is incremented by the number of values in every block handled by insertRows.
var rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="native"}`)

// rowsPerInsert tracks the number of values per block handled by insertRows.
var rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="native"}`)
// InsertHandler processes `/api/v1/import/native` request.
//
// Extra labels are obtained from req via parserCommon.GetExtraLabels and are passed
// to insertRows together with every block parsed from the request.
// The parsing runs inside writeconcurrencylimiter.Do.
//
// An error is returned if the extra labels cannot be obtained
// or if parsing or inserting fails.
//
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6
func InsertHandler(req *http.Request) error {
	extraLabels, err := parserCommon.GetExtraLabels(req)
	if err != nil {
		return err
	}
	// processBlock binds the per-request extra labels to the parser callback.
	processBlock := func(block *parser.Block) error {
		return insertRows(block, extraLabels)
	}
	return writeconcurrencylimiter.Do(func() error {
		return parser.ParseStream(req, processBlock)
	})
}
// insertRows converts block into a single time series and passes it to remotewrite.Push.
//
// The series labels are `__name__` (taken from the block's metric group), followed by
// the block's tags and then extraLabels. Every (value, timestamp) pair of the block
// becomes one sample of the series.
//
// It always returns nil; a mismatch between the number of timestamps and values
// is treated as a bug and reported via logger.Panicf.
func insertRows(block *parser.Block, extraLabels []prompbmarshal.Label) error {
	ctx := common.GetPushCtx()
	defer common.PutPushCtx(ctx)

	// The metrics are updated before the actual insert,
	// since relabeling can prevent from inserting the rows.
	n := len(block.Values)
	rowsInserted.Add(n)
	rowsPerInsert.Update(float64(n))

	// Reuse the buffers owned by ctx.
	series := ctx.WriteRequest.Timeseries[:0]
	dstLabels := ctx.Labels[:0]
	dstSamples := ctx.Samples[:0]

	metricName := &block.MetricName
	labelsStart := len(dstLabels)
	// NOTE(review): ToUnsafeString presumably avoids copying, so the labels would alias
	// the block's memory - confirm against lib/bytesutil.
	dstLabels = append(dstLabels, prompbmarshal.Label{
		Name:  "__name__",
		Value: bytesutil.ToUnsafeString(metricName.MetricGroup),
	})
	for i := range metricName.Tags {
		t := &metricName.Tags[i]
		dstLabels = append(dstLabels, prompbmarshal.Label{
			Name:  bytesutil.ToUnsafeString(t.Key),
			Value: bytesutil.ToUnsafeString(t.Value),
		})
	}
	dstLabels = append(dstLabels, extraLabels...)

	if len(block.Timestamps) != len(block.Values) {
		logger.Panicf("BUG: len(timestamps)=%d must match len(values)=%d", len(block.Timestamps), len(block.Values))
	}
	samplesStart := len(dstSamples)
	for i := range block.Values {
		dstSamples = append(dstSamples, prompbmarshal.Sample{
			Value:     block.Values[i],
			Timestamp: block.Timestamps[i],
		})
	}
	series = append(series, prompbmarshal.TimeSeries{
		Labels:  dstLabels[labelsStart:],
		Samples: dstSamples[samplesStart:],
	})

	// Store the (possibly grown) buffers back into ctx before pushing.
	ctx.WriteRequest.Timeseries = series
	ctx.Labels = dstLabels
	ctx.Samples = dstSamples
	remotewrite.Push(&ctx.WriteRequest)
	return nil
}

View File

@@ -6,6 +6,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
@@ -19,12 +20,18 @@ var (
// InsertHandler processes HTTP OpenTSDB put requests.
// See http://opentsdb.net/docs/build/html/api_http/put.html
func InsertHandler(req *http.Request) error {
extraLabels, err := parserCommon.GetExtraLabels(req)
if err != nil {
return err
}
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
return parser.ParseStream(req, func(rows []parser.Row) error {
return insertRows(rows, extraLabels)
})
})
}
func insertRows(rows []parser.Row) error {
func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
@@ -45,6 +52,7 @@ func insertRows(rows []parser.Row) error {
Value: tag.Value,
})
}
labels = append(labels, extraLabels...)
samples = append(samples, prompbmarshal.Sample{
Value: r.Value,
Timestamp: r.Timestamp,

View File

@@ -0,0 +1,76 @@
package prometheusimport
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// rowsInserted is incremented by the number of rows handled by every insertRows call.
var rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="prometheus"}`)

// rowsPerInsert tracks the number of rows handled per insertRows call.
var rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="prometheus"}`)
// InsertHandler processes `/api/v1/import/prometheus` request.
//
// Extra labels and the default timestamp are obtained from req via
// parserCommon.GetExtraLabels and parserCommon.GetTimestamp.
// The request body is treated as gzip-compressed when the `Content-Encoding`
// header equals `gzip`.
//
// An error is returned if the extra labels or the timestamp cannot be obtained
// or if parsing or inserting fails.
func InsertHandler(req *http.Request) error {
	extraLabels, err := parserCommon.GetExtraLabels(req)
	if err != nil {
		return err
	}
	defaultTimestamp, err := parserCommon.GetTimestamp(req)
	if err != nil {
		return err
	}
	// processRows binds the per-request extra labels to the parser callback.
	processRows := func(rows []parser.Row) error {
		return insertRows(rows, extraLabels)
	}
	return writeconcurrencylimiter.Do(func() error {
		gzipped := req.Header.Get("Content-Encoding") == "gzip"
		return parser.ParseStream(req.Body, defaultTimestamp, gzipped, processRows, nil)
	})
}
// insertRows converts rows into remote-write time series and pushes them via remotewrite.Push.
//
// Every row becomes a time series with exactly one sample. Its labels are `__name__`
// (set to the row metric), followed by the row tags and then extraLabels.
// Labels, samples and time series are built in the buffers of the push context obtained
// from common.GetPushCtx, which is handed back via common.PutPushCtx on return.
// The function always returns nil.
func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
	pushCtx := common.GetPushCtx()
	defer common.PutPushCtx(pushCtx)

	series := pushCtx.WriteRequest.Timeseries[:0]
	labelsBuf := pushCtx.Labels[:0]
	samplesBuf := pushCtx.Samples[:0]
	for rowIdx := range rows {
		row := &rows[rowIdx]
		// Remember where this row's labels and sample start in the shared buffers.
		labelsStart := len(labelsBuf)
		samplesStart := len(samplesBuf)

		labelsBuf = append(labelsBuf, prompbmarshal.Label{Name: "__name__", Value: row.Metric})
		for tagIdx := range row.Tags {
			labelsBuf = append(labelsBuf, prompbmarshal.Label{
				Name:  row.Tags[tagIdx].Key,
				Value: row.Tags[tagIdx].Value,
			})
		}
		labelsBuf = append(labelsBuf, extraLabels...)
		samplesBuf = append(samplesBuf, prompbmarshal.Sample{Value: row.Value, Timestamp: row.Timestamp})

		series = append(series, prompbmarshal.TimeSeries{
			Labels:  labelsBuf[labelsStart:],
			Samples: samplesBuf[samplesStart:],
		})
	}

	// Store the (possibly re-allocated) buffers back into the context before pushing.
	pushCtx.WriteRequest.Timeseries = series
	pushCtx.Labels = labelsBuf
	pushCtx.Samples = samplesBuf
	remotewrite.Push(&pushCtx.WriteRequest)

	rowCount := len(rows)
	rowsInserted.Add(rowCount)
	rowsPerInsert.Update(float64(rowCount))
	return nil
}

View File

@@ -8,6 +8,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
@@ -20,12 +21,18 @@ var (
// InsertHandler processes remote write for prometheus.
func InsertHandler(req *http.Request) error {
extraLabels, err := parserCommon.GetExtraLabels(req)
if err != nil {
return err
}
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
return parser.ParseStream(req, func(tss []prompb.TimeSeries) error {
return insertRows(tss, extraLabels)
})
})
}
func insertRows(timeseries []prompb.TimeSeries) error {
func insertRows(timeseries []prompb.TimeSeries, extraLabels []prompbmarshal.Label) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
@@ -35,6 +42,7 @@ func insertRows(timeseries []prompb.TimeSeries) error {
samples := ctx.Samples[:0]
for i := range timeseries {
ts := &timeseries[i]
rowsTotal += len(ts.Samples)
labelsLen := len(labels)
for i := range ts.Labels {
label := &ts.Labels[i]
@@ -43,6 +51,7 @@ func insertRows(timeseries []prompb.TimeSeries) error {
Value: bytesutil.ToUnsafeString(label.Value),
})
}
labels = append(labels, extraLabels...)
samplesLen := len(samples)
for i := range ts.Samples {
sample := &ts.Samples[i]
@@ -55,7 +64,6 @@ func insertRows(timeseries []prompb.TimeSeries) error {
Labels: labels[labelsLen:],
Samples: samples[samplesLen:],
})
rowsTotal += len(ts.Samples)
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels

View File

@@ -1,10 +1,13 @@
package remotewrite
import (
"bytes"
"crypto/tls"
"encoding/base64"
"flag"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strings"
"sync"
"time"
@@ -13,14 +16,19 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
"github.com/VictoriaMetrics/metrics"
)
var (
sendTimeout = flag.Duration("remoteWrite.sendTimeout", time.Minute, "Timeout for sending a single block of data to -remoteWrite.url")
rateLimit = flagutil.NewArrayInt("remoteWrite.rateLimit", "Optional rate limit in bytes per second for data sent to -remoteWrite.url. "+
"By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data "+
"is sent after temporary unavailability of the remote storage")
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", "Timeout for sending a single block of data to -remoteWrite.url")
proxyURL = flagutil.NewArray("remoteWrite.proxyURL", "Optional proxy URL for writing data to -remoteWrite.url. Supported proxies: http, https, socks5. "+
"Example: -remoteWrite.proxyURL=socks5://proxy:1234")
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
tlsInsecureSkipVerify = flagutil.NewArrayBool("remoteWrite.tlsInsecureSkipVerify", "Whether to skip tls verification when connecting to -remoteWrite.url")
tlsCertFile = flagutil.NewArray("remoteWrite.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
tlsKeyFile = flagutil.NewArray("remoteWrite.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. "+
@@ -39,24 +47,51 @@ var (
)
type client struct {
urlLabelValue string
sanitizedURL string
remoteWriteURL string
host string
requestURI string
authHeader string
fq *persistentqueue.FastQueue
hc *fasthttp.HostClient
hc *http.Client
rl rateLimiter
bytesSent *metrics.Counter
blocksSent *metrics.Counter
requestDuration *metrics.Histogram
requestsOKCount *metrics.Counter
errorsCount *metrics.Counter
packetsDropped *metrics.Counter
retriesCount *metrics.Counter
wg sync.WaitGroup
stopCh chan struct{}
}
func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQueue, concurrency int) *client {
func newClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqueue.FastQueue, concurrency int) *client {
tlsCfg, err := getTLSConfig(argIdx)
if err != nil {
logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
}
tr := &http.Transport{
Dial: statDial,
TLSClientConfig: tlsCfg,
TLSHandshakeTimeout: 5 * time.Second,
MaxConnsPerHost: 2 * concurrency,
MaxIdleConnsPerHost: 2 * concurrency,
IdleConnTimeout: time.Minute,
WriteBufferSize: 64 * 1024,
}
pURL := proxyURL.GetOptionalArg(argIdx)
if len(pURL) > 0 {
if !strings.Contains(pURL, "://") {
logger.Fatalf("cannot parse -remoteWrite.proxyURL=%q: it must start with `http://`, `https://` or `socks5://`", pURL)
}
urlProxy, err := url.Parse(pURL)
if err != nil {
logger.Fatalf("cannot parse -remoteWrite.proxyURL=%q: %s", pURL, err)
}
tr.Proxy = http.ProxyURL(urlProxy)
}
authHeader := ""
username := basicAuthUsername.GetOptionalArg(argIdx)
password := basicAuthPassword.GetOptionalArg(argIdx)
@@ -73,68 +108,30 @@ func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentq
}
authHeader = "Bearer " + token
}
readTimeout := *sendTimeout
if readTimeout <= 0 {
readTimeout = time.Minute
}
writeTimeout := readTimeout
var u fasthttp.URI
u.Update(remoteWriteURL)
scheme := string(u.Scheme())
switch scheme {
case "http", "https":
default:
logger.Fatalf("unsupported scheme in -remoteWrite.url=%q: %q. It must be http or https", remoteWriteURL, scheme)
}
host := string(u.Host())
if len(host) == 0 {
logger.Fatalf("invalid -remoteWrite.url=%q: host cannot be empty. Make sure the url looks like `http://host:port/path`", remoteWriteURL)
}
requestURI := string(u.RequestURI())
isTLS := scheme == "https"
var tlsCfg *tls.Config
if isTLS {
var err error
tlsCfg, err = getTLSConfig(argIdx)
if err != nil {
logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
}
}
if !strings.Contains(host, ":") {
if isTLS {
host += ":443"
} else {
host += ":80"
}
}
maxConns := 2 * concurrency
hc := &fasthttp.HostClient{
Addr: host,
Name: "vmagent",
Dial: statDial,
IsTLS: isTLS,
TLSConfig: tlsCfg,
MaxConns: maxConns,
MaxIdleConnDuration: 10 * readTimeout,
ReadTimeout: readTimeout,
WriteTimeout: writeTimeout,
MaxResponseBodySize: 1024 * 1024,
}
c := &client{
urlLabelValue: urlLabelValue,
sanitizedURL: sanitizedURL,
remoteWriteURL: remoteWriteURL,
host: host,
requestURI: requestURI,
authHeader: authHeader,
fq: fq,
hc: hc,
stopCh: make(chan struct{}),
hc: &http.Client{
Transport: tr,
Timeout: sendTimeout.GetOptionalArgOrDefault(argIdx, time.Minute),
},
stopCh: make(chan struct{}),
}
c.requestDuration = metrics.GetOrCreateHistogram(fmt.Sprintf(`vmagent_remotewrite_duration_seconds{url=%q}`, c.urlLabelValue))
c.requestsOKCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.urlLabelValue))
c.errorsCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_errors_total{url=%q}`, c.urlLabelValue))
c.retriesCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_retries_count_total{url=%q}`, c.urlLabelValue))
if bytesPerSec := rateLimit.GetOptionalArgOrDefault(argIdx, 0); bytesPerSec > 0 {
logger.Infof("applying %d bytes per second rate limit for -remoteWrite.url=%q", bytesPerSec, sanitizedURL)
c.rl.perSecondLimit = int64(bytesPerSec)
}
c.rl.limitReached = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remote_write_rate_limit_reached_total{url=%q}`, c.sanitizedURL))
c.bytesSent = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_bytes_sent_total{url=%q}`, c.sanitizedURL))
c.blocksSent = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_blocks_sent_total{url=%q}`, c.sanitizedURL))
c.requestDuration = metrics.GetOrCreateHistogram(fmt.Sprintf(`vmagent_remotewrite_duration_seconds{url=%q}`, c.sanitizedURL))
c.requestsOKCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.sanitizedURL))
c.errorsCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_errors_total{url=%q}`, c.sanitizedURL))
c.packetsDropped = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_packets_dropped_total{url=%q}`, c.sanitizedURL))
c.retriesCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_retries_count_total{url=%q}`, c.sanitizedURL))
for i := 0; i < concurrency; i++ {
c.wg.Add(1)
go func() {
@@ -142,27 +139,30 @@ func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentq
c.runWorker()
}()
}
logger.Infof("initialized client for -remoteWrite.url=%q", c.remoteWriteURL)
logger.Infof("initialized client for -remoteWrite.url=%q", c.sanitizedURL)
return c
}
func (c *client) MustStop() {
close(c.stopCh)
c.wg.Wait()
logger.Infof("stopped client for -remoteWrite.url=%q", c.remoteWriteURL)
logger.Infof("stopped client for -remoteWrite.url=%q", c.sanitizedURL)
}
func getTLSConfig(argIdx int) (*tls.Config, error) {
tlsConfig := &promauth.TLSConfig{
c := &promauth.TLSConfig{
CAFile: tlsCAFile.GetOptionalArg(argIdx),
CertFile: tlsCertFile.GetOptionalArg(argIdx),
KeyFile: tlsKeyFile.GetOptionalArg(argIdx),
ServerName: tlsServerName.GetOptionalArg(argIdx),
InsecureSkipVerify: *tlsInsecureSkipVerify,
InsecureSkipVerify: tlsInsecureSkipVerify.GetOptionalArg(argIdx),
}
cfg, err := promauth.NewConfig(".", nil, "", "", tlsConfig)
if c.CAFile == "" && c.CertFile == "" && c.KeyFile == "" && c.ServerName == "" && !c.InsecureSkipVerify {
return nil, nil
}
cfg, err := promauth.NewConfig(".", nil, "", "", c)
if err != nil {
return nil, fmt.Errorf("cannot populate TLS config: %s", err)
return nil, fmt.Errorf("cannot populate TLS config: %w", err)
}
tlsCfg := cfg.NewTLSConfig()
return tlsCfg, nil
@@ -193,7 +193,7 @@ func (c *client) runWorker() {
// The block has been sent successfully.
case <-time.After(graceDuration):
logger.Errorf("couldn't sent block with size %d bytes to %q in %.3f seconds during shutdown; dropping it",
len(block), c.remoteWriteURL, graceDuration.Seconds())
len(block), c.sanitizedURL, graceDuration.Seconds())
}
return
}
@@ -201,32 +201,28 @@ func (c *client) runWorker() {
}
func (c *client) sendBlock(block []byte) {
req := fasthttp.AcquireRequest()
req.SetRequestURI(c.requestURI)
req.SetHost(c.host)
req.Header.SetMethod("POST")
req.Header.Add("Content-Type", "application/x-protobuf")
req.Header.Add("Content-Encoding", "snappy")
req.Header.Add("X-Prometheus-Remote-Write-Version", "0.1.0")
c.rl.register(len(block), c.stopCh)
retryDuration := time.Second
retriesCount := 0
c.bytesSent.Add(len(block))
c.blocksSent.Inc()
again:
req, err := http.NewRequest("POST", c.remoteWriteURL, bytes.NewBuffer(block))
if err != nil {
logger.Panicf("BUG: unexected error from http.NewRequest(%q): %s", c.sanitizedURL, err)
}
h := req.Header
h.Set("User-Agent", "vmagent")
h.Set("Content-Type", "application/x-protobuf")
h.Set("Content-Encoding", "snappy")
h.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authHeader != "" {
req.Header.Set("Authorization", c.authHeader)
}
req.SetBody(block)
retryDuration := time.Second
resp := fasthttp.AcquireResponse()
again:
select {
case <-c.stopCh:
fasthttp.ReleaseRequest(req)
fasthttp.ReleaseResponse(resp)
return
default:
}
startTime := time.Now()
err := doRequestWithPossibleRetry(c.hc, req, resp)
resp, err := c.hc.Do(req)
c.requestDuration.UpdateDuration(startTime)
if err != nil {
c.errorsCount.Inc()
@@ -235,40 +231,94 @@ again:
retryDuration = time.Minute
}
logger.Errorf("couldn't send a block with size %d bytes to %q: %s; re-sending the block in %.3f seconds",
len(block), c.remoteWriteURL, err, retryDuration.Seconds())
time.Sleep(retryDuration)
c.retriesCount.Inc()
goto again
}
statusCode := resp.StatusCode()
if statusCode/100 != 2 {
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.urlLabelValue, statusCode)).Inc()
retryDuration *= 2
if retryDuration > time.Minute {
retryDuration = time.Minute
len(block), c.sanitizedURL, err, retryDuration.Seconds())
t := timerpool.Get(retryDuration)
select {
case <-c.stopCh:
timerpool.Put(t)
return
case <-t.C:
timerpool.Put(t)
}
logger.Errorf("unexpected status code received after sending a block with size %d bytes to %q: %d; response body=%q; re-sending the block in %.3f seconds",
len(block), c.remoteWriteURL, statusCode, resp.Body(), retryDuration.Seconds())
time.Sleep(retryDuration)
c.retriesCount.Inc()
goto again
}
c.requestsOKCount.Inc()
statusCode := resp.StatusCode
if statusCode/100 == 2 {
_ = resp.Body.Close()
c.requestsOKCount.Inc()
return
}
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.sanitizedURL, statusCode)).Inc()
if statusCode == 409 {
// Just drop block on 409 status code like Prometheus does.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/873
body, _ := ioutil.ReadAll(resp.Body)
_ = resp.Body.Close()
logger.Errorf("unexpected status code received when sending a block with size %d bytes to %q: #%d; dropping the block like Prometheus does; "+
"response body=%q", len(block), c.sanitizedURL, statusCode, body)
c.packetsDropped.Inc()
return
}
// The block has been successfully sent to the remote storage.
fasthttp.ReleaseResponse(resp)
fasthttp.ReleaseRequest(req)
// Unexpected status code returned
retriesCount++
retryDuration *= 2
if retryDuration > time.Minute {
retryDuration = time.Minute
}
body, err := ioutil.ReadAll(resp.Body)
_ = resp.Body.Close()
if err != nil {
logger.Errorf("cannot read response body from %q during retry #%d: %s", c.sanitizedURL, retriesCount, err)
} else {
logger.Errorf("unexpected status code received after sending a block with size %d bytes to %q during retry #%d: %d; response body=%q; "+
"re-sending the block in %.3f seconds", len(block), c.sanitizedURL, retriesCount, statusCode, body, retryDuration.Seconds())
}
t := timerpool.Get(retryDuration)
select {
case <-c.stopCh:
timerpool.Put(t)
return
case <-t.C:
timerpool.Put(t)
}
c.retriesCount.Inc()
goto again
}
func doRequestWithPossibleRetry(hc *fasthttp.HostClient, req *fasthttp.Request, resp *fasthttp.Response) error {
// There is no need in calling DoTimeout, since the timeout must be already set in hc.ReadTimeout.
err := hc.Do(req, resp)
if err == nil {
return nil
}
if err != fasthttp.ErrConnectionClosed {
return err
}
// Retry request if the server closed the keep-alive connection during the first attempt.
return hc.Do(req, resp)
// rateLimiter limits the amount of data registered via register() per second.
//
// It keeps a budget that is refilled by perSecondLimit and reduced by the size
// of every registered block.
type rateLimiter struct {
// perSecondLimit is the amount added to the budget on every refill.
// register() returns immediately without any accounting when it is <= 0.
perSecondLimit int64
// The current budget. It is increased by perSecondLimit every second.
budget int64
// The next deadline for increasing the budget by perSecondLimit
deadline time.Time
// limitReached is incremented every time register() has to wait for the deadline.
limitReached *metrics.Counter
}
// register accounts for dataLen bytes against the per-second budget, blocking
// while the budget is exhausted.
//
// It is a no-op when rl.perSecondLimit is not positive. Otherwise, while the budget
// is non-positive, it waits until rl.deadline (incrementing rl.limitReached for every wait),
// refills the budget by rl.perSecondLimit and schedules the next refill one second later.
// If stopCh is closed during a wait, register returns immediately without
// charging dataLen to the budget.
func (rl *rateLimiter) register(dataLen int, stopCh <-chan struct{}) {
	limit := rl.perSecondLimit
	if limit <= 0 {
		return
	}
	for rl.budget <= 0 {
		if d := time.Until(rl.deadline); d > 0 {
			rl.limitReached.Inc()
			t := timerpool.Get(d)
			select {
			case <-stopCh:
				timerpool.Put(t)
				return
			case <-t.C:
				timerpool.Put(t)
			}
		}
		rl.budget += limit
		// The next window must start at the current time, i.e. after the wait above.
		// Using a timestamp taken before the wait would make the window shorter than
		// one second (or already expired), so the budget would be refilled too often
		// and the configured rate would be exceeded.
		rl.deadline = time.Now().Add(time.Second)
	}
	rl.budget -= int64(dataLen)
}

View File

@@ -3,12 +3,16 @@ package remotewrite
import (
"flag"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
"github.com/VictoriaMetrics/metrics"
"github.com/golang/snappy"
)
@@ -17,7 +21,7 @@ var (
flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage. "+
"Minimum supported interval is 1 second")
maxUnpackedBlockSize = flag.Int("remoteWrite.maxBlockSize", 32*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
maxUnpackedBlockSize = flagutil.NewBytes("remoteWrite.maxBlockSize", 8*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
"It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics")
)
@@ -32,9 +36,11 @@ type pendingSeries struct {
periodicFlusherWG sync.WaitGroup
}
func newPendingSeries(pushBlock func(block []byte)) *pendingSeries {
func newPendingSeries(pushBlock func(block []byte), significantFigures, roundDigits int) *pendingSeries {
var ps pendingSeries
ps.wr.pushBlock = pushBlock
ps.wr.significantFigures = significantFigures
ps.wr.roundDigits = roundDigits
ps.stopCh = make(chan struct{})
ps.periodicFlusherWG.Add(1)
go func() {
@@ -68,7 +74,7 @@ func (ps *pendingSeries) periodicFlusher() {
case <-ps.stopCh:
mustStop = true
case <-ticker.C:
if fasttime.UnixTimestamp()-ps.wr.lastFlushTime < uint64(flushSeconds) {
if fasttime.UnixTimestamp()-atomic.LoadUint64(&ps.wr.lastFlushTime) < uint64(flushSeconds) {
continue
}
}
@@ -79,10 +85,20 @@ func (ps *pendingSeries) periodicFlusher() {
}
type writeRequest struct {
wr prompbmarshal.WriteRequest
pushBlock func(block []byte)
// Move lastFlushTime to the top of the struct in order to guarantee atomic access on 32-bit architectures.
lastFlushTime uint64
// pushBlock is called when the write request is ready to be sent.
pushBlock func(block []byte)
// How many significant figures must be left before sending the writeRequest to pushBlock.
significantFigures int
// How many decimal digits after point must be left before sending the writeRequest to pushBlock.
roundDigits int
wr prompbmarshal.WriteRequest
tss []prompbmarshal.TimeSeries
labels []prompbmarshal.Label
@@ -91,6 +107,8 @@ type writeRequest struct {
}
func (wr *writeRequest) reset() {
// Do not reset pushBlock, significantFigures and roundDigits, since they are re-used.
wr.wr.Timeseries = nil
for i := range wr.tss {
@@ -100,11 +118,7 @@ func (wr *writeRequest) reset() {
}
wr.tss = wr.tss[:0]
for i := range wr.labels {
label := &wr.labels[i]
label.Name = ""
label.Value = ""
}
promrelabel.CleanLabels(wr.labels)
wr.labels = wr.labels[:0]
wr.samples = wr.samples[:0]
@@ -113,18 +127,35 @@ func (wr *writeRequest) reset() {
func (wr *writeRequest) flush() {
wr.wr.Timeseries = wr.tss
wr.lastFlushTime = fasttime.UnixTimestamp()
wr.adjustSampleValues()
atomic.StoreUint64(&wr.lastFlushTime, fasttime.UnixTimestamp())
pushWriteRequest(&wr.wr, wr.pushBlock)
wr.reset()
}
// adjustSampleValues rounds the values of all the pending samples in place.
//
// When wr.significantFigures is positive, every value is passed through
// decimal.RoundToSignificantFigures. Then, when wr.roundDigits is below 100,
// every value is passed through decimal.RoundToDecimalDigits.
func (wr *writeRequest) adjustSampleValues() {
	if figures := wr.significantFigures; figures > 0 {
		for idx := range wr.samples {
			wr.samples[idx].Value = decimal.RoundToSignificantFigures(wr.samples[idx].Value, figures)
		}
	}
	if digits := wr.roundDigits; digits < 100 {
		for idx := range wr.samples {
			wr.samples[idx].Value = decimal.RoundToDecimalDigits(wr.samples[idx].Value, digits)
		}
	}
}
func (wr *writeRequest) push(src []prompbmarshal.TimeSeries) {
tssDst := wr.tss
for i := range src {
tssDst = append(tssDst, prompbmarshal.TimeSeries{})
dst := &tssDst[len(tssDst)-1]
wr.copyTimeSeries(dst, &src[i])
if len(wr.tss) >= maxRowsPerBlock {
wr.copyTimeSeries(&tssDst[len(tssDst)-1], &src[i])
if len(wr.samples) >= maxRowsPerBlock {
wr.tss = tssDst
wr.flush()
tssDst = wr.tss
}
@@ -164,7 +195,7 @@ func pushWriteRequest(wr *prompbmarshal.WriteRequest, pushBlock func(block []byt
}
bb := writeRequestBufPool.Get()
bb.B = prompbmarshal.MarshalWriteRequest(bb.B[:0], wr)
if len(bb.B) <= *maxUnpackedBlockSize {
if len(bb.B) <= maxUnpackedBlockSize.N {
zb := snappyBufPool.Get()
zb.B = snappy.Encode(zb.B[:cap(zb.B)], bb.B)
writeRequestBufPool.Put(bb)

View File

@@ -16,7 +16,7 @@ var (
unparsedLabelsGlobal = flagutil.NewArray("remoteWrite.label", "Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. "+
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
relabelConfigPathGlobal = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
"before sending them to -remoteWrite.url. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config for details")
"before sending them to -remoteWrite.url. See https://victoriametrics.github.io/vmagent.html#relabeling for details")
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
)
@@ -33,7 +33,7 @@ func loadRelabelConfigs() (*relabelConfigs, error) {
if *relabelConfigPathGlobal != "" {
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
if err != nil {
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", *relabelConfigPathGlobal, err)
}
rcs.global = global
}
@@ -43,9 +43,13 @@ func loadRelabelConfigs() (*relabelConfigs, error) {
}
rcs.perURL = make([][]promrelabel.ParsedRelabelConfig, len(*remoteWriteURLs))
for i, path := range *relabelConfigPaths {
if len(path) == 0 {
// Skip empty relabel config.
continue
}
prc, err := promrelabel.LoadRelabelConfigs(path)
if err != nil {
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", path, err)
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
}
rcs.perURL[i] = prc
}
@@ -59,9 +63,11 @@ type relabelConfigs struct {
// initLabelsGlobal must be called after parsing command-line flags.
func initLabelsGlobal() {
// Init labelsGlobal
labelsGlobal = nil
for _, s := range *unparsedLabelsGlobal {
if len(s) == 0 {
continue
}
n := strings.IndexByte(s, '=')
if n < 0 {
logger.Fatalf("missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
@@ -114,12 +120,7 @@ type relabelCtx struct {
}
func (rctx *relabelCtx) reset() {
labels := rctx.labels
for i := range labels {
label := &labels[i]
label.Name = ""
label.Value = ""
}
promrelabel.CleanLabels(rctx.labels)
rctx.labels = rctx.labels[:0]
}

View File

@@ -6,8 +6,8 @@ import (
"sync"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
@@ -22,14 +22,21 @@ var (
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
queues = flag.Int("remoteWrite.queues", 4, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+
"isn't enough for sending high volume of collected data to remote storage")
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
"It is hidden by default, since it can contain sensistive auth info")
maxPendingBytesPerURL = flag.Int("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
"It is hidden by default, since it can contain sensitive info such as auth key")
maxPendingBytesPerURL = flagutil.NewBytes("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
"for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. "+
"Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500000000. "+
"Disk usage is unlimited if the value is set to 0")
significantFigures = flagutil.NewArrayInt("remoteWrite.significantFigures", "The number of significant figures to leave in metric values before writing them "+
"to remote storage. See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. "+
"This option may be used for improving data compression for the stored metrics. See also -remoteWrite.roundDigits")
roundDigits = flagutil.NewArrayInt("remoteWrite.roundDigits", "Round metric values to this number of decimal digits after the point before writing them to remote storage. "+
"Examples: -remoteWrite.roundDigits=2 would round 1.236 to 1.24, while -remoteWrite.roundDigits=-1 would round 126.78 to 130. "+
"By default digits rounding is disabled. Set it to 100 for disabling it for a particular remote storage. "+
"This option may be used for improving data compression for the stored metrics")
)
var rwctxs []*remoteWriteCtx
@@ -37,6 +44,18 @@ var rwctxs []*remoteWriteCtx
// Contains the current relabelConfigs.
var allRelabelConfigs atomic.Value
// maxQueues limits the maximum value for `-remoteWrite.queues`. There is no sense in setting too high value,
// since it may lead to high memory usage due to big number of buffers.
var maxQueues = cgroup.AvailableCPUs() * 4
// InitSecretFlags registers `remoteWrite.url` as a secret flag unless -remoteWrite.showURL is set.
//
// It must be called after flag.Parse and before any logging.
func InitSecretFlags() {
	if *showRemoteWriteURL {
		// The user explicitly asked to expose the url, so there is nothing to hide.
		return
	}
	// remoteWrite.url can contain authentication codes, so hide it at `/metrics` output.
	flagutil.RegisterSecretFlag("remoteWrite.url")
}
// Init initializes remotewrite.
//
// It must be called after flag.Parse().
@@ -44,12 +63,13 @@ var allRelabelConfigs atomic.Value
// Stop must be called for graceful shutdown.
func Init() {
if len(*remoteWriteURLs) == 0 {
logger.Fatalf("at least one `-remoteWrite.url` must be set")
logger.Fatalf("at least one `-remoteWrite.url` command-line flag must be set")
}
if !*showRemoteWriteURL {
// remoteWrite.url can contain authentication codes, so hide it at `/metrics` output.
httpserver.RegisterSecretFlag("remoteWrite.url")
if *queues > maxQueues {
*queues = maxQueues
}
if *queues <= 0 {
*queues = 1
}
initLabelsGlobal()
rcs, err := loadRelabelConfigs()
@@ -69,11 +89,11 @@ func Init() {
maxInmemoryBlocks = 2
}
for i, remoteWriteURL := range *remoteWriteURLs {
urlLabelValue := fmt.Sprintf("secret-url-%d", i+1)
sanitizedURL := fmt.Sprintf("%d:secret-url", i+1)
if *showRemoteWriteURL {
urlLabelValue = remoteWriteURL
sanitizedURL = fmt.Sprintf("%d:%s", i+1, remoteWriteURL)
}
rwctx := newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, urlLabelValue)
rwctx := newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, sanitizedURL)
rwctxs = append(rwctxs, rwctx)
}
@@ -118,7 +138,7 @@ func Stop() {
// Push sends wr to remote storage systems set via `-remoteWrite.url`.
//
// Note that wr may be modified by Push due to relabeling.
// Note that wr may be modified by Push due to relabeling and rounding.
func Push(wr *prompbmarshal.WriteRequest) {
var rctx *relabelCtx
rcs := allRelabelConfigs.Load().(*relabelConfigs)
@@ -128,11 +148,20 @@ func Push(wr *prompbmarshal.WriteRequest) {
}
tss := wr.Timeseries
for len(tss) > 0 {
// Process big tss in smaller blocks in order to reduce maxmimum memory usage
// Process big tss in smaller blocks in order to reduce the maximum memory usage
samplesCount := 0
i := 0
for i < len(tss) {
samplesCount += len(tss[i].Samples)
i++
if samplesCount > maxRowsPerBlock {
break
}
}
tssBlock := tss
if len(tssBlock) > maxRowsPerBlock {
tssBlock = tss[:maxRowsPerBlock]
tss = tss[maxRowsPerBlock:]
if i < len(tss) {
tssBlock = tss[:i]
tss = tss[i:]
} else {
tss = nil
}
@@ -162,25 +191,25 @@ type remoteWriteCtx struct {
pss []*pendingSeries
pssNextIdx uint64
tss []prompbmarshal.TimeSeries
relabelMetricsDropped *metrics.Counter
}
func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int, urlLabelValue string) *remoteWriteCtx {
func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int, sanitizedURL string) *remoteWriteCtx {
h := xxhash.Sum64([]byte(remoteWriteURL))
path := fmt.Sprintf("%s/persistent-queue/%016X", *tmpDataPath, h)
fq := persistentqueue.MustOpenFastQueue(path, remoteWriteURL, maxInmemoryBlocks, *maxPendingBytesPerURL)
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_data_bytes{path=%q, url=%q}`, path, urlLabelValue), func() float64 {
path := fmt.Sprintf("%s/persistent-queue/%d_%016X", *tmpDataPath, argIdx+1, h)
fq := persistentqueue.MustOpenFastQueue(path, sanitizedURL, maxInmemoryBlocks, maxPendingBytesPerURL.N)
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_data_bytes{path=%q, url=%q}`, path, sanitizedURL), func() float64 {
return float64(fq.GetPendingBytes())
})
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_inmemory_blocks{path=%q, url=%q}`, path, urlLabelValue), func() float64 {
_ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_inmemory_blocks{path=%q, url=%q}`, path, sanitizedURL), func() float64 {
return float64(fq.GetInmemoryQueueLen())
})
c := newClient(argIdx, remoteWriteURL, urlLabelValue, fq, *queues)
c := newClient(argIdx, remoteWriteURL, sanitizedURL, fq, *queues)
sf := significantFigures.GetOptionalArgOrDefault(argIdx, 0)
rd := roundDigits.GetOptionalArgOrDefault(argIdx, 100)
pss := make([]*pendingSeries, *queues)
for i := range pss {
pss[i] = newPendingSeries(fq.MustWriteBlock)
pss[i] = newPendingSeries(fq.MustWriteBlock, sf, rd)
}
return &remoteWriteCtx{
idx: argIdx,
@@ -188,7 +217,7 @@ func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int,
c: c,
pss: pss,
relabelMetricsDropped: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q, url=%q}`, path, urlLabelValue)),
relabelMetricsDropped: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q, url=%q}`, path, sanitizedURL)),
}
}
@@ -208,15 +237,17 @@ func (rwctx *remoteWriteCtx) MustStop() {
func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
var rctx *relabelCtx
var v *[]prompbmarshal.TimeSeries
rcs := allRelabelConfigs.Load().(*relabelConfigs)
prcs := rcs.perURL[rwctx.idx]
if len(prcs) > 0 {
rctx = getRelabelCtx()
// Make a copy of tss before applying relabeling in order to prevent
// from affecting time series for other remoteWrite.url configs.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467 for details.
rwctx.tss = append(rwctx.tss[:0], tss...)
tss = rwctx.tss
rctx = getRelabelCtx()
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467
// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/599
v = tssRelabelPool.Get().(*[]prompbmarshal.TimeSeries)
tss = append(*v, tss...)
tssLen := len(tss)
tss = rctx.applyRelabeling(tss, nil, prcs)
rwctx.relabelMetricsDropped.Add(tssLen - len(tss))
@@ -225,8 +256,15 @@ func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
idx := atomic.AddUint64(&rwctx.pssNextIdx, 1) % uint64(len(pss))
pss[idx].Push(tss)
if rctx != nil {
*v = prompbmarshal.ResetTimeSeries(tss)
tssRelabelPool.Put(v)
putRelabelCtx(rctx)
// Zero rwctx.tss in order to free up GC references.
rwctx.tss = prompbmarshal.ResetTimeSeries(rwctx.tss)
}
}
// tssRelabelPool holds reusable time series buffers. Push takes a buffer
// from the pool, appends a copy of the incoming series to it before applying
// per-URL relabeling, and puts the reset buffer back afterwards.
var tssRelabelPool = &sync.Pool{
	New: func() interface{} {
		buf := make([]prompbmarshal.TimeSeries, 0)
		return &buf
	},
}

View File

@@ -1,20 +1,24 @@
package remotewrite
import (
"fmt"
"net"
"strings"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics"
)
func statDial(addr string) (conn net.Conn, err error) {
if netutil.TCP6Enabled() {
conn, err = fasthttp.DialDualStack(addr)
} else {
conn, err = fasthttp.Dial(addr)
func statDial(network, addr string) (conn net.Conn, err error) {
if !strings.HasPrefix(network, "tcp") {
return nil, fmt.Errorf("unexpected network passed to statDial: %q; it must start from `tcp`", network)
}
if !netutil.TCP6Enabled() {
network = "tcp4"
}
conn, err = net.DialTimeout(network, addr, 5*time.Second)
dialsTotal.Inc()
if err != nil {
dialErrors.Inc()

View File

@@ -6,7 +6,9 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
@@ -21,12 +23,18 @@ var (
//
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6
func InsertHandler(req *http.Request) error {
// Extract the extra labels from the request before entering the concurrency limiter,
// so a request with invalid extra labels is rejected without parsing its body.
extraLabels, err := parserCommon.GetExtraLabels(req)
if err != nil {
return err
}
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
// The callback passes the extra labels to insertRows together with every parsed batch of rows.
return parser.ParseStream(req, func(rows []parser.Row) error {
return insertRows(rows, extraLabels)
})
})
}
func insertRows(rows []parser.Row) error {
func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
@@ -36,6 +44,7 @@ func insertRows(rows []parser.Row) error {
samples := ctx.Samples[:0]
for i := range rows {
r := &rows[i]
rowsTotal += len(r.Values)
labelsLen := len(labels)
for j := range r.Tags {
tag := &r.Tags[j]
@@ -44,9 +53,12 @@ func insertRows(rows []parser.Row) error {
Value: bytesutil.ToUnsafeString(tag.Value),
})
}
labels = append(labels, extraLabels...)
values := r.Values
timestamps := r.Timestamps
_ = timestamps[len(values)-1]
if len(timestamps) != len(values) {
logger.Panicf("BUG: len(timestamps)=%d must match len(values)=%d", len(timestamps), len(values))
}
samplesLen := len(samples)
for j, value := range values {
samples = append(samples, prompbmarshal.Sample{
@@ -58,7 +70,6 @@ func insertRows(rows []parser.Row) error {
Labels: labels[labelsLen:],
Samples: samples[samplesLen:],
})
rowsTotal += len(values)
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels

View File

@@ -61,24 +61,30 @@ run-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
-datasource.url=http://localhost:8428 \
-notifier.url=http://localhost:9093 \
-notifier.url=http://127.0.0.1:9093 \
-remoteWrite.url=http://localhost:8428 \
-remoteRead.url=http://localhost:8428 \
-external.label=cluster=east-1 \
-external.label=replica=a \
-evaluationInterval=3s
vmalert-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-amd64 ./app/vmalert
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmalert-local-with-goarch
vmalert-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-arm ./app/vmalert
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmalert-local-with-goarch
vmalert-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-arm64 ./app/vmalert
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmalert-local-with-goarch
vmalert-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-ppc64le ./app/vmalert
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmalert-local-with-goarch
vmalert-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmalert-386 ./app/vmalert
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmalert-local-with-goarch
vmalert-local-with-goarch:
APP_NAME=vmalert $(MAKE) app-local-with-goarch
vmalert-pure:
APP_NAME=vmalert $(MAKE) app-local-pure

View File

@@ -6,11 +6,13 @@ rules against configured address.
### Features:
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
* VictoriaMetrics [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
* VictoriaMetrics [MetricsQL](https://victoriametrics.github.io/MetricsQL.html)
support and expressions validation;
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
support;
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
* Keeps the alerts [state on restarts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/app/vmalert#alerts-state-on-restarts);
* Graphite datasource can be used for alerting and recording rules. See [these docs](#graphite) for details.
* Lightweight without extra dependencies.
### Limitations:
@@ -20,7 +22,6 @@ may fail;
* by default, rules execution is sequential within one group, but persisting of execution results to remote
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
recording rule is reused in next one;
* there is no `query` function support in templates yet;
* `vmalert` has no UI, just an API for getting groups and rules statuses.
### QuickStart
@@ -44,10 +45,19 @@ compatible storage address for storing recording rules results and alerts state
Then configure `vmalert` accordingly:
```
./bin/vmalert -rule=alert.rules \
-datasource.url=http://localhost:8428 \
-notifier.url=http://localhost:9093
-datasource.url=http://localhost:8428 \ # PromQL compatible datasource
-notifier.url=http://localhost:9093 \ # AlertManager URL
-notifier.url=http://127.0.0.1:9093 \ # AlertManager replica URL
-remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist rules
-remoteRead.url=http://localhost:8428 \ # PromQL compatible datasource to restore alerts state from
-external.label=cluster=east-1 \ # External label to be applied for each rule
-external.label=replica=a \ # Multiple external labels may be set
-evaluationInterval=3s # Default evaluation interval if not specified in rules group
```
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
similar to Prometheus rules and configured using YAML. Configuration examples may be found
@@ -80,7 +90,7 @@ rules:
There are two types of Rules:
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
Alerting rules allows to define alert conditions via [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL)
Alerting rules allow defining alert conditions via [MetricsQL](https://victoriametrics.github.io/MetricsQL.html)
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
Recording rules allow you to precompute frequently needed or computationally expensive expressions
@@ -96,7 +106,13 @@ The syntax for alerting rule is following:
# The name of the alert. Must be a valid metric name.
alert: <string>
# The MetricsQL expression to evaluate.
# Optional type for the rule. Supported values: "graphite", "prometheus".
# By default "prometheus" rule type is used.
[ type: <string> ]
# The expression to evaluate. The expression language depends on the type value.
# By default MetricsQL expression is used. If type="graphite", then the expression
# must contain valid Graphite expression.
expr: <string>
# Alerts are considered firing once they have been returned for this long.
@@ -112,14 +128,6 @@ annotations:
[ <labelname>: <tmpl_string> ]
```
`vmalert` has no local storage and alerts state is stored in process memory. Hence, after reloading of `vmalert` process
alerts state will be lost. To avoid this situation, `vmalert` may be configured via following flags:
* `-remoteWrite.url` - URL to Victoria Metrics or VMInsert. `vmalert` will persist alerts state into the configured
address in form of timeseries with name `ALERTS` via remote-write protocol.
* `-remoteRead.url` - URL to Victoria Metrics or VMSelect. `vmalert` will try to restore alerts state from configured
address by querying `ALERTS` timeseries.
##### Recording rules
The syntax for recording rules is following:
@@ -127,7 +135,13 @@ The syntax for recording rules is following:
# The name of the time series to output to. Must be a valid metric name.
record: <string>
# The MetricsQL expression to evaluate.
# Optional type for the rule. Supported values: "graphite", "prometheus".
# By default "prometheus" rule type is used.
[ type: <string> ]
# The expression to evaluate. The expression language depends on the type value.
# By default MetricsQL expression is used. If type="graphite", then the expression
# must contain valid Graphite expression.
expr: <string>
# Labels to add or overwrite before storing the result.
@@ -138,6 +152,22 @@ labels:
For recording rules to work `-remoteWrite.url` must be specified.
#### Alerts state on restarts
`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after a restart of the `vmalert`
process the alerts state will be lost. To avoid this situation, `vmalert` should be configured via the following flags:
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or VMInsert (Cluster). `vmalert` will persist alerts state
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
These are regular time series and may be queried from VM just as any other time series.
The state is stored to the configured address on every rule evaluation.
* `-remoteRead.url` - URL to VictoriaMetrics (Single) or VMSelect (Cluster). `vmalert` will try to restore alerts state
from configured address by querying time series with name `ALERTS_FOR_STATE`.
Both flags are required for the proper state restoring. Restore process may fail if time series are missing
in configured `-remoteRead.url`, weren't updated in the last `1h` or received state doesn't match current `vmalert`
rules configuration.
#### WEB
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
@@ -149,98 +179,181 @@ Used as alert source in AlertManager.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
### Graphite
vmalert sends requests to `<-datasource.url>/render?format=json` during evaluation of alerting and recording rules
if the corresponding rule contains `type: "graphite"` config option. It is expected that the `<-datasource.url>/render`
implements [Graphite Render API](https://graphite.readthedocs.io/en/stable/render_api.html) for `format=json`.
### Configuration
The shortlist of configuration flags is the following:
```
Usage of vmalert:
-datasource.basicAuth.password string
Optional basic auth password for -datasource.url
Optional basic auth password for -datasource.url
-datasource.basicAuth.username string
Optional basic auth username for -datasource.url
-datasource.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used.
-datasource.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -datasource.url.
Optional basic auth username for -datasource.url
-datasource.lookback duration
Lookback defines how far to look into past when evaluating queries. For example, if datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
-datasource.maxIdleConnections int
Defines the number of idle (keep-alive) connections to the configured datasource. Consider setting this value equal to groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state. (default 100)
-datasource.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used
-datasource.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
-datasource.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -datasource.url
-datasource.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -datasource.url.
-datasource.tlsServerName value
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used.
Whether to skip tls verification when connecting to -datasource.url
-datasource.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
-datasource.tlsServerName string
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used
-datasource.url string
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
-dryRun -rule
Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-evaluationInterval duration
How often to evaluate the rules (default 1m0s)
How often to evaluate the rules (default 1m0s)
-external.alert.source string
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used
-external.label array
Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -external.label flags in order to add multiple label sets.
Supports array of values separated by comma or specified via multiple flags.
-external.url string
External URL is used as alert's source for sent alerts to the notifier
External URL is used as alert's source for sent alerts to the notifier
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
Address to listen for http connections (default ":8880")
Address to listen for http connections (default ":8880")
-loggerDisableTimestamps
Whether to disable writing timestamps in logs
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-loggerWarnsPerSecondLimit int
Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero value disables the rate limit
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-notifier.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used.
-notifier.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -notifier.url.
-notifier.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -notifier.url
-notifier.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -notifier.url.
-notifier.tlsServerName value
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used.
-notifier.url string
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
Auth key for /metrics. It overrides httpAuth settings
-notifier.basicAuth.password array
Optional basic auth password for -notifier.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.basicAuth.username array
Optional basic auth username for -notifier.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsCAFile array
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsCertFile array
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsInsecureSkipVerify array
Whether to skip tls verification when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsKeyFile array
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsServerName array
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
Supports array of values separated by comma or specified via multiple flags.
-notifier.url array
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
Supports array of values separated by comma or specified via multiple flags.
-pprofAuthKey string
Auth key for /debug/pprof. It overrides httpAuth settings
-remoteRead.basicAuth.password string
Optional basic auth password for -remoteRead.url
Optional basic auth password for -remoteRead.url
-remoteRead.basicAuth.username string
Optional basic auth username for -remoteRead.url
Optional basic auth username for -remoteRead.url
-remoteRead.lookback duration
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteRead.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used.
-remoteRead.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url.
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteRead.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
-remoteRead.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
-remoteRead.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteRead.url
-remoteRead.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url.
-remoteRead.tlsServerName value
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used.
Whether to skip tls verification when connecting to -remoteRead.url
-remoteRead.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
-remoteRead.tlsServerName string
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
-remoteRead.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remoteWrite.basicAuth.password string
Optional basic auth password for -remoteWrite.url
Optional basic auth password for -remoteWrite.url
-remoteWrite.basicAuth.username string
Optional basic auth username for -remoteWrite.url
Optional basic auth username for -remoteWrite.url
-remoteWrite.concurrency int
Defines number of readers that concurrently write into remote storage (default 1)
Defines number of writers for concurrent writing into remote querier (default 1)
-remoteWrite.flushInterval duration
Defines interval of flushes to remote write endpoint (default 5s)
-remoteWrite.maxBatchSize int
Defines max number of timeseries to be flushed at once (default 1000)
Defines max number of timeseries to be flushed at once (default 1000)
-remoteWrite.maxQueueSize int
Defines the max number of pending datapoints to remote write endpoint (default 100000)
-remoteWrite.tlsCAFile value
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used.
-remoteWrite.tlsCertFile value
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url.
Defines the max number of pending datapoints to remote write endpoint (default 100000)
-remoteWrite.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
-remoteWrite.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
-remoteWrite.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteWrite.url
-remoteWrite.tlsKeyFile value
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url.
-remoteWrite.tlsServerName value
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used.
Whether to skip tls verification when connecting to -remoteWrite.url
-remoteWrite.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
-remoteWrite.tlsServerName string
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
-remoteWrite.url string
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
-rule value
Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule /path/to/file. Path to a single file with alerting rules
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
Optional URL to Victoria Metrics or VMInsert where to persist alerts state and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428
-rule array
Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule="/path/to/file". Path to a single file with alerting rules
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
Supports array of values separated by comma or specified via multiple flags.
-rule.validateExpressions
Whether to validate rules expressions via MetricsQL engine (default true)
Whether to validate rules expressions via MetricsQL engine (default true)
-rule.validateTemplates
Whether to validate annotation and label templates (default true)
Whether to validate annotation and label templates (default true)
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
-version
Show VictoriaMetrics version
```
Pass `-help` to `vmalert` in order to see the full list of supported
@@ -273,3 +386,20 @@ It is recommended using
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmalert-prod` from the root folder of the repository.
It builds `vmalert-prod` binary and puts it into the `bin` folder.
#### ARM build
ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://blog.cloudflare.com/arm-takes-wing/).
#### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmalert-arm` or `make vmalert-arm64` from the root folder of the repository.
It builds `vmalert-arm` or `vmalert-arm64` binary respectively and puts it into the `bin` folder.
#### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmalert-arm-prod` or `make vmalert-arm64-prod` from the root folder of the repository.
It builds `vmalert-arm-prod` or `vmalert-arm64-prod` binary respectively and puts it into the `bin` folder.

View File

@@ -14,10 +14,12 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
// AlertingRule is basic alert entity
type AlertingRule struct {
Type datasource.Type
RuleID uint64
Name string
Expr string
@@ -25,6 +27,7 @@ type AlertingRule struct {
Labels map[string]string
Annotations map[string]string
GroupID uint64
GroupName string
// guard status fields
mu sync.RWMutex
@@ -36,19 +39,73 @@ type AlertingRule struct {
// resets on every successful Exec
// may be used as Health state
lastExecError error
metrics *alertingRuleMetrics
}
func newAlertingRule(gID uint64, cfg config.Rule) *AlertingRule {
return &AlertingRule{
// alertingRuleMetrics groups the per-rule gauges registered by newAlertingRule.
// AlertingRule.Close unregisters them by name.
type alertingRuleMetrics struct {
// errors backs the `vmalert_alerts_error` gauge: 1 if the last Exec returned an error, 0 otherwise.
errors *gauge
// pending backs the `vmalert_alerts_pending` gauge: the number of alerts in notifier.StatePending.
pending *gauge
// active backs the `vmalert_alerts_firing` gauge: the number of alerts in notifier.StateFiring.
active *gauge
}
// newAlertingRule creates an AlertingRule for the given group from cfg
// and registers the pending, firing and error gauges for it.
//
// The gauges are labeled with the alert name, the group name and the rule id.
// Their callbacks read the rule state under ar.mu.
func newAlertingRule(group *Group, cfg config.Rule) *AlertingRule {
ar := &AlertingRule{
Type: cfg.Type,
RuleID: cfg.ID,
Name: cfg.Alert,
Expr: cfg.Expr,
For: cfg.For,
For: cfg.For.Duration(),
Labels: cfg.Labels,
Annotations: cfg.Annotations,
GroupID: gID,
GroupID: group.ID(),
GroupName: group.Name,
alerts: make(map[uint64]*notifier.Alert),
metrics: &alertingRuleMetrics{},
}
// Shared label set for all the gauges of this rule.
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
// Number of alerts currently in the pending state.
ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
func() float64 {
ar.mu.Lock()
defer ar.mu.Unlock()
var num int
for _, a := range ar.alerts {
if a.State == notifier.StatePending {
num++
}
}
return float64(num)
})
// Number of alerts currently in the firing state.
ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
func() float64 {
ar.mu.Lock()
defer ar.mu.Unlock()
var num int
for _, a := range ar.alerts {
if a.State == notifier.StateFiring {
num++
}
}
return float64(num)
})
// Reports 1 if the last execution of the rule ended with an error, 0 otherwise.
ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_error{%s}`, labels),
func() float64 {
ar.mu.Lock()
defer ar.mu.Unlock()
if ar.lastExecError == nil {
return 0
}
return 1
})
return ar
}
// Close unregisters the firing, pending and error gauges
// that belong to the rule.
func (ar *AlertingRule) Close() {
	m := ar.metrics
	metrics.UnregisterMetric(m.active.name)
	metrics.UnregisterMetric(m.pending.name)
	metrics.UnregisterMetric(m.errors.name)
}
// String implements Stringer interface
@@ -65,14 +122,14 @@ func (ar *AlertingRule) ID() uint64 {
// Exec executes AlertingRule expression via the given Querier.
// Based on the Querier results AlertingRule maintains notifier.Alerts
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
qMetrics, err := q.Query(ctx, ar.Expr)
qMetrics, err := q.Query(ctx, ar.Expr, ar.Type)
ar.mu.Lock()
defer ar.mu.Unlock()
ar.lastExecError = err
ar.lastExecTime = time.Now()
if err != nil {
return nil, fmt.Errorf("failed to execute query %q: %s", ar.Expr, err)
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
}
for h, a := range ar.alerts {
@@ -82,28 +139,44 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
}
}
qFn := func(query string) ([]datasource.Metric, error) { return q.Query(ctx, query, ar.Type) }
updated := make(map[uint64]struct{})
// update list of active alerts
for _, m := range qMetrics {
// extra labels could contain templates, so we expand them first
labels, err := expandLabels(m, qFn, ar)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
for k, v := range labels {
// apply extra labels to datasource
// so the hash key will be consistent on restore
m.SetLabel(k, v)
}
h := hash(m)
if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels
// conflicting with the metric labels
return nil, fmt.Errorf("labels %v: %w", m.Labels, errDuplicate)
}
updated[h] = struct{}{}
if a, ok := ar.alerts[h]; ok {
if a.Value != m.Value {
// update Value field with latest value
a.Value = m.Value
// and re-exec template since Value can be used
// in templates
err = ar.template(a)
// in annotations
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
if err != nil {
return nil, err
}
}
continue
}
a, err := ar.newAlert(m, ar.lastExecTime)
a, err := ar.newAlert(m, ar.lastExecTime, qFn)
if err != nil {
ar.lastExecError = err
return nil, fmt.Errorf("failed to create alert: %s", err)
return nil, fmt.Errorf("failed to create alert: %w", err)
}
a.ID = h
a.State = notifier.StatePending
@@ -134,6 +207,19 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
return nil, nil
}
// expandLabels executes the templates found in the rule's extra labels
// against the labels and value of metric m and returns the expanded label set.
// q is handed to the template engine so templates can run queries.
func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) {
	// template data exposes metric labels as a plain name->value map
	byName := make(map[string]string, len(m.Labels))
	for i := range m.Labels {
		byName[m.Labels[i].Name] = m.Labels[i].Value
	}
	return notifier.ExecTemplate(q, ar.Labels, notifier.AlertTplData{
		Labels: byName,
		Value:  m.Value,
		Expr:   ar.Expr,
	})
}
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries
for _, a := range ar.alerts {
@@ -180,7 +266,7 @@ func hash(m datasource.Metric) uint64 {
return hash.Sum64()
}
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifier.Alert, error) {
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
a := &notifier.Alert{
GroupID: ar.GroupID,
Name: ar.Name,
@@ -189,6 +275,9 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifie
Start: start,
Expr: ar.Expr,
}
// label defined here to make override possible by
// time series labels.
a.Labels[alertGroupNameLabel] = ar.GroupName
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
@@ -196,31 +285,9 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifie
}
a.Labels[l.Name] = l.Value
}
return a, ar.template(a)
}
func (ar *AlertingRule) template(a *notifier.Alert) error {
// 1. template rule labels with data labels
rLabels, err := a.ExecTemplate(ar.Labels)
if err != nil {
return err
}
// 2. merge data labels and rule labels
// metric labels may be overridden by
// rule labels
for k, v := range rLabels {
a.Labels[k] = v
}
// 3. template merged labels
a.Labels, err = a.ExecTemplate(a.Labels)
if err != nil {
return err
}
a.Annotations, err = a.ExecTemplate(ar.Annotations)
return err
var err error
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
return a, err
}
// AlertAPI generates APIAlert object from alert by its id(hash)
@@ -245,6 +312,7 @@ func (ar *AlertingRule) RuleAPI() APIAlertingRule {
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", ar.ID()),
GroupID: fmt.Sprintf("%d", ar.GroupID),
Type: ar.Type.String(),
Name: ar.Name,
Expression: ar.Expr,
For: ar.For.String(),
@@ -283,15 +351,18 @@ func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
}
const (
// AlertMetricName is the metric name for synthetic alert timeseries.
// alertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS"
// AlertForStateMetricName is the metric name for 'for' state of alert.
// alertForStateMetricName is the metric name for 'for' state of alert.
alertForStateMetricName = "ALERTS_FOR_STATE"
// AlertNameLabel is the label name indicating the name of an alert.
// alertNameLabel is the label name indicating the name of an alert.
alertNameLabel = "alertname"
// AlertStateLabel is the label name indicating the state of an alert.
// alertStateLabel is the label name indicating the state of an alert.
alertStateLabel = "alertstate"
// alertGroupNameLabel defines the label name attached for generated time series.
alertGroupNameLabel = "alertgroup"
)
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
@@ -331,16 +402,25 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Exec, as well as Value field.
// Only rules with For > 0 will be restored.
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
if q == nil {
return fmt.Errorf("querier is nil")
}
// Get the last datapoint in range via MetricsQL `last_over_time`.
qFn := func(query string) ([]datasource.Metric, error) { return q.Query(ctx, query, ar.Type) }
// account for external labels in filter
var labelsFilter string
for k, v := range labels {
labelsFilter += fmt.Sprintf(",%s=%q", k, v)
}
// Get the last data point in range via MetricsQL `last_over_time`.
// We don't use plain PromQL since Prometheus doesn't support
// remote write protocol which is used for state persistence in vmalert.
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
alertForStateMetricName, ar.Name, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr)
expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr, ar.Type)
if err != nil {
return err
}
@@ -349,26 +429,22 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
labels := m.Labels
m.Labels = make([]datasource.Label, 0)
// drop all extra labels, so hash key will
// be identical to timeseries received in Exec
// be identical to time series received in Exec
for _, l := range labels {
if l.Name == alertNameLabel {
continue
}
// drop all overridden labels
if _, ok := ar.Labels[l.Name]; ok {
if l.Name == alertNameLabel || l.Name == alertGroupNameLabel {
continue
}
m.Labels = append(m.Labels, l)
}
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0))
a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0), qFn)
if err != nil {
return fmt.Errorf("failed to create alert: %s", err)
return fmt.Errorf("failed to create alert: %w", err)
}
a.ID = hash(m)
a.State = notifier.StatePending
ar.alerts[a.ID] = a
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start)
}
return nil
}

View File

@@ -2,6 +2,9 @@ package main
import (
"context"
"errors"
"reflect"
"strings"
"testing"
"time"
@@ -218,19 +221,6 @@ func TestAlertingRule_Exec(t *testing.T) {
hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("duplicate", 0),
[][]datasource.Metric{
{
// metrics with the same labelset should result in one alert
metricWithLabels(t, "name", "foo", "type", "bar"),
metricWithLabels(t, "type", "bar", "name", "foo"),
},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo", "type", "bar")): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("for-pending", time.Minute),
[][]datasource.Metric{
@@ -355,6 +345,7 @@ func TestAlertingRule_Restore(t *testing.T) {
metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
"__name__", alertForStateMetricName,
alertNameLabel, "",
alertGroupNameLabel, "groupID",
"foo", "bar",
"namespace", "baz",
),
@@ -375,7 +366,7 @@ func TestAlertingRule_Restore(t *testing.T) {
alertNameLabel, "",
"foo", "bar",
"namespace", "baz",
// following pair supposed to be dropped
// extra labels set by rule
"source", "vm",
),
},
@@ -383,6 +374,7 @@ func TestAlertingRule_Restore(t *testing.T) {
hash(metricWithLabels(t,
"foo", "bar",
"namespace", "baz",
"source", "vm",
)): {State: notifier.StatePending,
Start: time.Now().Truncate(time.Hour)},
},
@@ -419,7 +411,7 @@ func TestAlertingRule_Restore(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
fq.add(tc.metrics...)
if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
if err := tc.rule.Restore(context.TODO(), fq, time.Hour, nil); err != nil {
t.Fatalf("unexpected err: %s", err)
}
if len(tc.rule.alerts) != len(tc.expAlerts) {
@@ -441,6 +433,138 @@ func TestAlertingRule_Restore(t *testing.T) {
}
}
// TestAlertingRule_Exec_Negative covers the failure paths of AlertingRule.Exec:
// a duplicate series produced by rule labels overriding metric labels, and
// an error returned by the querier.
func TestAlertingRule_Exec_Negative(t *testing.T) {
	fq := &fakeQuerier{}
	ar := newTestAlertingRule("test", 0)
	ar.Labels = map[string]string{"job": "test"}

	// successful attempt
	fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
	_, err := ar.Exec(context.TODO(), fq, false)
	if err != nil {
		t.Fatal(err)
	}

	// label `job` will collide with rule extra label and will make both time series equal
	fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
	_, err = ar.Exec(context.TODO(), fq, false)
	if !errors.Is(err, errDuplicate) {
		// %v is used since err may be nil here
		t.Fatalf("expected to have %v error; got %v", errDuplicate, err)
	}

	// querier errors must be propagated to the caller
	fq.reset()
	expErr := "connection reset by peer"
	fq.setErr(errors.New(expErr))
	_, err = ar.Exec(context.TODO(), fq, false)
	if err == nil {
		t.Fatalf("expected to get err; got nil")
	}
	if !strings.Contains(err.Error(), expErr) {
		t.Fatalf("expected to get err %q; got %q instead", expErr, err)
	}
}
// TestAlertingRule_Template checks that Exec produces alerts whose labels and
// annotations are the result of expanding the rule's label and annotation
// templates against the returned metrics.
func TestAlertingRule_Template(t *testing.T) {
	testCases := []struct {
		rule      *AlertingRule
		metrics   []datasource.Metric
		expAlerts map[uint64]*notifier.Alert
	}{
		{
			newTestRuleWithLabels("common", "region", "east"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, 1, "instance", "foo"),
				metricWithValueAndLabels(t, 1, "instance", "bar"),
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "region", "east", "instance", "foo")): {
					Annotations: map[string]string{},
					Labels: map[string]string{
						alertGroupNameLabel: "",
						"region":            "east",
						"instance":          "foo",
					},
				},
				hash(metricWithLabels(t, "region", "east", "instance", "bar")): {
					Annotations: map[string]string{},
					Labels: map[string]string{
						alertGroupNameLabel: "",
						"region":            "east",
						"instance":          "bar",
					},
				},
			},
		},
		{
			&AlertingRule{
				Name: "override label",
				Labels: map[string]string{
					"instance": "{{ $labels.instance }}",
					"region":   "east",
				},
				Annotations: map[string]string{
					"summary":     `Too high connection number for "{{ $labels.instance }}" for region {{ $labels.region }}`,
					"description": `It is {{ $value }} connections for "{{ $labels.instance }}"`,
				},
				alerts: make(map[uint64]*notifier.Alert),
			},
			[]datasource.Metric{
				metricWithValueAndLabels(t, 2, "instance", "foo"),
				metricWithValueAndLabels(t, 10, "instance", "bar"),
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "region", "east", "instance", "foo")): {
					Labels: map[string]string{
						alertGroupNameLabel: "",
						"instance":          "foo",
						"region":            "east",
					},
					Annotations: map[string]string{
						"summary":     `Too high connection number for "foo" for region east`,
						"description": `It is 2 connections for "foo"`,
					},
				},
				hash(metricWithLabels(t, "region", "east", "instance", "bar")): {
					Labels: map[string]string{
						alertGroupNameLabel: "",
						"instance":          "bar",
						"region":            "east",
					},
					Annotations: map[string]string{
						"summary":     `Too high connection number for "bar" for region east`,
						"description": `It is 10 connections for "bar"`,
					},
				},
			},
		},
	}
	fakeGroup := Group{Name: "TestRule_Exec"}
	for _, tc := range testCases {
		t.Run(tc.rule.Name, func(t *testing.T) {
			fq := &fakeQuerier{}
			tc.rule.GroupID = fakeGroup.ID()
			fq.add(tc.metrics...)
			if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
				t.Fatalf("unexpected err: %s", err)
			}
			// unexpected extra alerts must not go unnoticed
			if len(tc.rule.alerts) != len(tc.expAlerts) {
				t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
			}
			// the loop variable is named alertID so it does not shadow the hash() helper
			for alertID, expAlert := range tc.expAlerts {
				gotAlert := tc.rule.alerts[alertID]
				if gotAlert == nil {
					t.Fatalf("alert %d is missing; labels: %v; annotations: %v",
						alertID, expAlert.Labels, expAlert.Annotations)
				}
				if !reflect.DeepEqual(expAlert.Annotations, gotAlert.Annotations) {
					t.Fatalf("expected to have annotations %#v; got %#v", expAlert.Annotations, gotAlert.Annotations)
				}
				if !reflect.DeepEqual(expAlert.Labels, gotAlert.Labels) {
					t.Fatalf("expected to have labels %#v; got %#v", expAlert.Labels, gotAlert.Labels)
				}
			}
		})
	}
}
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
r := newTestAlertingRule(name, 0)
r.Labels = make(map[string]string)

View File

@@ -1,6 +1,7 @@
package config
import (
"crypto/md5"
"fmt"
"hash/fnv"
"io/ioutil"
@@ -9,7 +10,12 @@ import (
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metricsql"
"gopkg.in/yaml.v2"
)
@@ -17,16 +23,49 @@ import (
// Group contains list of Rules grouped into
// entity with one name and evaluation interval
type Group struct {
Type datasource.Type `yaml:"type,omitempty"`
File string
Name string `yaml:"name"`
Interval time.Duration `yaml:"interval,omitempty"`
Rules []Rule `yaml:"rules"`
Concurrency int `yaml:"concurrency"`
// Checksum stores the hash of yaml definition for this group.
// May be used to detect any changes like rules re-ordering etc.
Checksum string
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// Besides decoding the group it:
//   - falls back to the prometheus datasource type when the group has none;
//   - propagates the group type to rules without their own type and
//     re-hashes the IDs of those rules;
//   - stores the md5 of the re-marshaled group in Checksum.
func (g *Group) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// the alias type has no UnmarshalYAML method, which prevents recursion
	type group Group
	if err := unmarshal((*group)(g)); err != nil {
		return err
	}
	// serialize before defaults are applied; the bytes feed the checksum below
	serialized, err := yaml.Marshal(g)
	if err != nil {
		return fmt.Errorf("failed to marshal group configuration for checksum: %w", err)
	}

	// change default value to prometheus datasource.
	if g.Type.Get() == "" {
		g.Type.Set(datasource.NewPrometheusType())
	}

	// update rules with empty type.
	for i := range g.Rules {
		rule := &g.Rules[i]
		if rule.Type.Get() != "" {
			continue
		}
		rule.Type.Set(g.Type)
		rule.ID = HashRule(*rule)
	}

	g.Checksum = fmt.Sprintf("%x", md5.Sum(serialized))
	return nil
}
// Validate check for internal Group or Rule configuration errors
func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
if g.Name == "" {
@@ -35,6 +74,7 @@ func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
if len(g.Rules) == 0 {
return fmt.Errorf("group %q can't contain no rules", g.Name)
}
uniqueRules := map[uint64]struct{}{}
for _, r := range g.Rules {
ruleName := r.Record
@@ -46,19 +86,25 @@ func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
}
uniqueRules[r.ID] = struct{}{}
if err := r.Validate(); err != nil {
return fmt.Errorf("invalid rule %q.%q: %s", g.Name, ruleName, err)
return fmt.Errorf("invalid rule %q.%q: %w", g.Name, ruleName, err)
}
if validateExpressions {
if _, err := metricsql.Parse(r.Expr); err != nil {
return fmt.Errorf("invalid expression for rule %q.%q: %s", g.Name, ruleName, err)
// its needed only for tests.
// because correct types must be inherited after unmarshalling.
exprValidator := g.Type.ValidateExpr
if r.Type.Get() != "" {
exprValidator = r.Type.ValidateExpr
}
if err := exprValidator(r.Expr); err != nil {
return fmt.Errorf("invalid expression for rule %q.%q: %w", g.Name, ruleName, err)
}
}
if validateAnnotations {
if err := notifier.ValidateTemplates(r.Annotations); err != nil {
return fmt.Errorf("invalid annotations for rule %q.%q: %s", g.Name, ruleName, err)
return fmt.Errorf("invalid annotations for rule %q.%q: %w", g.Name, ruleName, err)
}
if err := notifier.ValidateTemplates(r.Labels); err != nil {
return fmt.Errorf("invalid labels for rule %q.%q: %s", g.Name, ruleName, err)
return fmt.Errorf("invalid labels for rule %q.%q: %w", g.Name, ruleName, err)
}
}
}
@@ -69,10 +115,11 @@ func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
// recording rule or alerting rule.
type Rule struct {
ID uint64
Type datasource.Type `yaml:"type,omitempty"`
Record string `yaml:"record,omitempty"`
Alert string `yaml:"alert,omitempty"`
Expr string `yaml:"expr"`
For time.Duration `yaml:"for,omitempty"`
For PromDuration `yaml:"for"`
Labels map[string]string `yaml:"labels,omitempty"`
Annotations map[string]string `yaml:"annotations,omitempty"`
@@ -80,6 +127,42 @@ type Rule struct {
XXX map[string]interface{} `yaml:",inline"`
}
// PromDuration is Prometheus duration.
// It is stored as a whole number of milliseconds.
type PromDuration struct {
	milliseconds int64
}

// NewPromDuration returns PromDuration holding d expressed in milliseconds.
func NewPromDuration(d time.Duration) PromDuration {
	var pd PromDuration
	pd.milliseconds = d.Milliseconds()
	return pd
}
// MarshalYAML implements yaml.Marshaler interface.
// The duration is emitted in the time.Duration string form.
func (pd PromDuration) MarshalYAML() (interface{}, error) {
	d := pd.Duration()
	return d.String(), nil
}
// UnmarshalYAML implements yaml.Unmarshaler interface.
// The YAML value is read as a string and parsed with metricsql.DurationValue;
// pd is left untouched when decoding or parsing fails.
func (pd *PromDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
	var raw string
	err := unmarshal(&raw)
	if err != nil {
		return err
	}
	millis, err := metricsql.DurationValue(raw, 0)
	if err != nil {
		return err
	}
	pd.milliseconds = millis
	return nil
}
// Duration returns pd converted to time.Duration.
//
// The method has a value receiver, matching MarshalYAML: it does not mutate pd,
// and a value receiver allows calls on non-addressable values such as
// NewPromDuration(d).Duration(). Calls through *PromDuration keep working,
// since value-receiver methods belong to the pointer's method set as well.
func (pd PromDuration) Duration() time.Duration {
	return time.Duration(pd.milliseconds) * time.Millisecond
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
type rule Rule
@@ -90,8 +173,16 @@ func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
return nil
}
// Name returns the Record field for rules that have it set
// and the Alert field otherwise.
func (r *Rule) Name() string {
	name := r.Alert
	if r.Record != "" {
		name = r.Record
	}
	return name
}
// HashRule hashes significant Rule fields into
// unique hash value
// unique hash that supposed to define Rule uniqueness
func HashRule(r Rule) uint64 {
h := fnv.New64a()
h.Write([]byte(r.Expr))
@@ -102,16 +193,8 @@ func HashRule(r Rule) uint64 {
h.Write([]byte("alerting"))
h.Write([]byte(r.Alert))
}
type item struct {
key, value string
}
var kv []item
for k, v := range r.Labels {
kv = append(kv, item{key: k, value: v})
}
sort.Slice(kv, func(i, j int) bool {
return kv[i].key < kv[j].key
})
h.Write([]byte(r.Type.Get()))
kv := sortMap(r.Labels)
for _, i := range kv {
h.Write([]byte(i.key))
h.Write([]byte(i.value))
@@ -137,31 +220,38 @@ func Parse(pathPatterns []string, validateAnnotations, validateExpressions bool)
for _, pattern := range pathPatterns {
matches, err := filepath.Glob(pattern)
if err != nil {
return nil, fmt.Errorf("error reading file pattern %s: %v", pattern, err)
return nil, fmt.Errorf("error reading file pattern %s: %w", pattern, err)
}
fp = append(fp, matches...)
}
errGroup := new(utils.ErrGroup)
var groups []Group
for _, file := range fp {
uniqueGroups := map[string]struct{}{}
gr, err := parseFile(file)
if err != nil {
return nil, fmt.Errorf("failed to parse file %q: %w", file, err)
errGroup.Add(fmt.Errorf("failed to parse file %q: %w", file, err))
continue
}
for _, g := range gr {
if err := g.Validate(validateAnnotations, validateExpressions); err != nil {
return nil, fmt.Errorf("invalid group %q in file %q: %s", g.Name, file, err)
errGroup.Add(fmt.Errorf("invalid group %q in file %q: %w", g.Name, file, err))
continue
}
if _, ok := uniqueGroups[g.Name]; ok {
return nil, fmt.Errorf("group name %q duplicate in file %q", g.Name, file)
errGroup.Add(fmt.Errorf("group name %q duplicate in file %q", g.Name, file))
continue
}
uniqueGroups[g.Name] = struct{}{}
g.File = file
groups = append(groups, g)
}
}
if err := errGroup.Err(); err != nil {
return nil, err
}
if len(groups) < 1 {
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))
logger.Warnf("no groups found in %s", strings.Join(pathPatterns, ";"))
}
return groups, nil
}
@@ -171,6 +261,7 @@ func parseFile(path string) ([]Group, error) {
if err != nil {
return nil, fmt.Errorf("error reading alert rule file: %w", err)
}
data = envtemplate.Replace(data)
g := struct {
Groups []Group `yaml:"groups"`
// Catches all undefined fields and must be empty after parsing.
@@ -193,3 +284,18 @@ func checkOverflow(m map[string]interface{}, ctx string) error {
}
return nil
}
// item is a single key/value entry of a string map.
type item struct {
	key, value string
}

// sortMap returns the entries of m ordered by key in ascending order.
// It returns nil for an empty or nil map.
func sortMap(m map[string]string) []item {
	if len(m) == 0 {
		return nil
	}
	// the final length is known, so allocate the backing array once
	kv := make([]item, 0, len(m))
	for k, v := range m {
		kv = append(kv, item{key: k, value: v})
	}
	// map keys are unique, so an unstable sort still gives a deterministic order
	sort.Slice(kv, func(i, j int) bool {
		return kv[i].key < kv[j].key
	})
	return kv
}

View File

@@ -7,6 +7,10 @@ import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
@@ -41,7 +45,7 @@ func TestParseBad(t *testing.T) {
},
{
[]string{"testdata/dir/rules2-bad.rules"},
"function \"value\" not defined",
"function \"unknown\" not defined",
},
{
[]string{"testdata/dir/rules3-bad.rules"},
@@ -52,8 +56,8 @@ func TestParseBad(t *testing.T) {
"either `record` or `alert` must be set",
},
{
[]string{"testdata/*.yaml"},
"no groups found",
[]string{"testdata/rules1-bad.rules"},
"bad graphite expr",
},
}
for _, tc := range testCases {
@@ -70,13 +74,13 @@ func TestParseBad(t *testing.T) {
func TestRule_Validate(t *testing.T) {
if err := (&Rule{}).Validate(); err == nil {
t.Errorf("exptected empty name error")
t.Errorf("expected empty name error")
}
if err := (&Rule{Alert: "alert"}).Validate(); err == nil {
t.Errorf("exptected empty expr error")
t.Errorf("expected empty expr error")
}
if err := (&Rule{Alert: "alert", Expr: "test>0"}).Validate(); err != nil {
t.Errorf("exptected valid rule; got %s", err)
t.Errorf("expected valid rule; got %s", err)
}
}
@@ -140,12 +144,14 @@ func TestGroup_Validate(t *testing.T) {
Alert: "alert",
Expr: "up == 1",
Labels: map[string]string{
"summary": "{{ value|query }}",
"summary": `
{{ with printf "node_memory_MemTotal{job='node',instance='%s'}" "localhost" | query }}
{{ . | first | value | humanize1024 }}B
{{ end }}`,
},
},
},
},
expErr: "error parsing annotation",
validateAnnotations: true,
},
{
@@ -215,6 +221,75 @@ func TestGroup_Validate(t *testing.T) {
},
expErr: "",
},
{
group: &Group{Name: "test thanos",
Type: datasource.NewRawType("thanos"),
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"description": "{{ value|query }}",
}},
},
},
validateExpressions: true,
expErr: "unknown datasource type",
},
{
group: &Group{Name: "test graphite",
Type: datasource.NewGraphiteType(),
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"description": "some-description",
}},
},
},
validateExpressions: true,
expErr: "",
},
{
group: &Group{Name: "test prometheus",
Type: datasource.NewPrometheusType(),
Rules: []Rule{
{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
"description": "{{ value|query }}",
}},
},
},
validateExpressions: true,
expErr: "",
},
{
group: &Group{
Name: "test graphite inherit",
Type: datasource.NewGraphiteType(),
Rules: []Rule{
{
Expr: "sumSeries(time('foo.bar',10))",
For: PromDuration{milliseconds: 10},
},
{
Expr: "sum(up == 0 ) by (host)",
Type: datasource.NewPrometheusType(),
},
},
},
},
{
group: &Group{
Name: "test graphite prometheus bad expr",
Type: datasource.NewGraphiteType(),
Rules: []Rule{
{
Expr: "sum(up == 0 ) by (host)",
For: PromDuration{milliseconds: 10},
},
{
Expr: "sumSeries(time('foo.bar',10))",
Type: datasource.NewPrometheusType(),
},
},
},
expErr: "invalid rule",
},
}
for _, tc := range testCases {
err := tc.group.Validate(tc.validateAnnotations, tc.validateExpressions)
@@ -273,7 +348,7 @@ func TestHashRule(t *testing.T) {
true,
},
{
Rule{Alert: "alert", Expr: "up == 1", For: time.Minute},
Rule{Alert: "alert", Expr: "up == 1", For: NewPromDuration(time.Minute)},
Rule{Alert: "alert", Expr: "up == 1"},
true,
},
@@ -324,3 +399,57 @@ func TestHashRule(t *testing.T) {
}
}
}
// TestGroupChecksum verifies that Group.Checksum, which is filled in while
// unmarshaling YAML, is non-empty and differs for group definitions that
// differ in rule order or in the `for` setting of a rule.
func TestGroupChecksum(t *testing.T) {
// f unmarshals data and newData into separate Groups and fails the test
// unless the first checksum is non-empty and the two checksums differ.
f := func(t *testing.T, data, newData string) {
t.Helper()
var g Group
if err := yaml.Unmarshal([]byte(data), &g); err != nil {
t.Fatalf("failed to unmarshal: %s", err)
}
if g.Checksum == "" {
t.Fatalf("expected to get non-empty checksum")
}
var ng Group
if err := yaml.Unmarshal([]byte(newData), &ng); err != nil {
t.Fatalf("failed to unmarshal: %s", err)
}
if g.Checksum == ng.Checksum {
t.Fatalf("expected to get different checksums")
}
}
// same rules listed in a different order
t.Run("Ok", func(t *testing.T) {
f(t, `
name: TestGroup
rules:
- alert: ExampleAlertAlwaysFiring
expr: sum by(job) (up == 1)
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
`, `
name: TestGroup
rules:
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
- alert: ExampleAlertAlwaysFiring
expr: sum by(job) (up == 1)
`)
})
// same rule with and without the `for` setting
t.Run("Ok, `for` must change cs", func(t *testing.T) {
f(t, `
name: TestGroup
rules:
- alert: ExampleAlertWithFor
expr: sum by(job) (up == 1)
for: 5m
`, `
name: TestGroup
rules:
- alert: ExampleAlertWithFor
expr: sum by(job) (up == 1)
`)
})
}

View File

@@ -0,0 +1,13 @@
groups:
- name: TestUpdateGroup
interval: 2s
concurrency: 2
type: prometheus
rules:
- alert: up
expr: up == 0
for: 30s
- alert: up graphite
expr: filterSeries(time('host.1',20),'>','0')
for: 30s
type: graphite

View File

@@ -0,0 +1,12 @@
groups:
- name: TestUpdateGroup
interval: 30s
type: graphite
rules:
- alert: up
expr: filterSeries(time('host.2',20),'>','0')
for: 30s
- alert: up graphite
expr: filterSeries(time('host.1',20),'>','0')
for: 30s
type: graphite

View File

@@ -6,6 +6,6 @@ groups:
expr: vm_rows > 0
labels:
label: bar
summary: "{{ value|query }}"
summary: "{{ unknown|query }}"
annotations:
description: "{{$labels}}"

View File

@@ -665,7 +665,7 @@
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
for: 1d
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
@@ -1724,4 +1724,4 @@
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning
severity: warning

View File

@@ -0,0 +1,12 @@
groups:
- name: TestGraphiteBadGroup
interval: 2s
concurrency: 2
type: graphite
rules:
- alert: Conns
expr: filterSeries(sumSeries(host.receiver.interface.cons),'last','>', 500) by instance
for: 3m
annotations:
summary: Too high connection number for {{$labels.instance}}
description: "It is {{ $value }} connections for {{$labels.instance}}"

View File

@@ -7,11 +7,22 @@ groups:
expr: sum(vm_tcplistener_conns) by(instance) > 1
for: 3m
annotations:
summary: "Too high connection number for {{$labels.instance}}"
summary: Too high connection number for {{$labels.instance}}
{{ with printf "sum(vm_tcplistener_conns{instance=%q})" .Labels.instance | query }}
{{ . | first | value }}
{{ end }}
description: "It is {{ $value }} connections for {{$labels.instance}}"
- alert: ExampleAlertAlwaysFiring
expr: sum by(job)
(up == 1)
labels:
job: '{{ $labels.job }}'
dynamic: '{{ $x := query "up" | first | value }}{{ if eq 1.0 $x }}one{{ else }}unknown{{ end }}'
annotations:
description: Job {{ $labels.job }} is up!
summary: All instances up {{ range query "up" }}
{{ . | label "instance" }}
{{ end }}
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
labels:

View File

@@ -0,0 +1,30 @@
groups:
- name: TestGroup
interval: 2s
concurrency: 2
type: graphite
rules:
- alert: Conns
expr: filterSeries(sumSeries(host.receiver.interface.cons),'last','>', 500)
for: 3m
annotations:
summary: Too high connection number for {{$labels.instance}}
description: "It is {{ $value }} connections for {{$labels.instance}}"
- name: TestGroupPromMixed
interval: 2s
concurrency: 2
type: prometheus
rules:
- alert: Conns
expr: sum(vm_tcplistener_conns) by (instance) > 1
for: 3m
annotations:
summary: Too high connection number for {{$labels.instance}}
description: "It is {{ $value }} connections for {{$labels.instance}}"
- alert: HostDown
type: graphite
expr: filterSeries(sumSeries(host.receiver.interface.up),'last','=', 0)
for: 3m
annotations:
summary: Too high connection number for {{$labels.instance}}
description: "It is {{ $value }} connections for {{$labels.instance}}"

View File

@@ -1,12 +1,14 @@
package datasource
import "context"
import (
"context"
)
// Querier interface wraps Query method which
// executes given query and returns list of Metrics
// as result
type Querier interface {
Query(ctx context.Context, query string) ([]Metric, error)
Query(ctx context.Context, query string, engine Type) ([]Metric, error)
}
// Metric is the basic entity which should be return by datasource
@@ -17,6 +19,34 @@ type Metric struct {
Value float64
}
// SetLabel sets the label named key to value.
// The first label with a matching name is updated in place;
// when no such label exists a new one is appended.
func (m *Metric) SetLabel(key, value string) {
	for i := range m.Labels {
		if m.Labels[i].Name != key {
			continue
		}
		m.Labels[i].Value = value
		return
	}
	m.AddLabel(key, value)
}
// AddLabel appends a label with the given key and value to the label set.
// It does not check whether a label with the same name already exists.
func (m *Metric) AddLabel(key, value string) {
	added := Label{Name: key, Value: value}
	m.Labels = append(m.Labels, added)
}
// Label returns the value of the first label named key.
// An empty string is returned when the metric has no such label.
func (m *Metric) Label(key string) string {
	for i := range m.Labels {
		if l := &m.Labels[i]; l.Name == key {
			return l.Value
		}
	}
	return ""
}
// Label represents metric's label
type Label struct {
Name string

View File

@@ -0,0 +1,45 @@
package datasource
import (
"flag"
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Command-line flags configuring the connection to the datasource.
// Their values are consumed by Init.
var (
	addr = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
		" E.g. http://127.0.0.1:8428")
	basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
	basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")

	tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
	tlsCertFile           = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
	tlsKeyFile            = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
	tlsCAFile             = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
		"By default system CA is used")
	tlsServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
		"By default the server name from -datasource.url is used")

	lookBack = flag.Duration("datasource.lookback", 0, "Lookback defines how far to look into past when evaluating queries. "+
		"For example, if datasource.lookback=5m then param \"time\" with value now()-5m will be added to every query.")
	queryStep = flag.Duration("datasource.queryStep", 0, "queryStep defines how far a value can fallback to when evaluating queries. "+
		"For example, if datasource.queryStep=15s then param \"step\" with value \"15s\" will be added to every query.")
	// the first part of the help text ends with a space so the two sentences do not run together
	maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, "Defines the number of idle (keep-alive connections) to configured datasource. "+
		"Consider to set this value equal to the value: groups_total * group.concurrency. Too low value may result into high number of sockets in TIME_WAIT state.")
)
// Init builds a Querier out of the datasource.* flag values.
//
// It returns an error when -datasource.url is not set or when the HTTP
// transport cannot be constructed from the TLS flags.
func Init() (Querier, error) {
	dsAddr := *addr
	if len(dsAddr) == 0 {
		return nil, fmt.Errorf("datasource.url is empty")
	}

	transport, err := utils.Transport(dsAddr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
	if err != nil {
		return nil, fmt.Errorf("failed to create transport: %w", err)
	}
	// Cap the number of idle keep-alive connections as requested via flag.
	transport.MaxIdleConns = *maxIdleConnections

	client := &http.Client{Transport: transport}
	storage := NewVMStorage(dsAddr, *basicAuthUsername, *basicAuthPassword, *lookBack, *queryStep, client)
	return storage, nil
}

View File

@@ -0,0 +1,89 @@
package datasource
import (
"fmt"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/graphiteql"
"github.com/VictoriaMetrics/metricsql"
)
// Names of the supported datasource types.
const (
	graphiteType   = "graphite"
	prometheusType = "prometheus"
)

// Type describes the kind of datasource a query is addressed to.
// The zero value carries an empty name, which String reports as prometheus.
type Type struct {
	name string
}

// NewPrometheusType builds a Type holding the prometheus name.
func NewPrometheusType() Type {
	var t Type
	t.name = prometheusType
	return t
}

// NewGraphiteType builds a Type holding the graphite name.
func NewGraphiteType() Type {
	var t Type
	t.name = graphiteType
	return t
}

// NewRawType wraps an arbitrary string into a Type.
// The string is stored as is; no validation is performed.
func NewRawType(d string) Type {
	var t Type
	t.name = d
	return t
}

// Get reports the stored type name exactly as it was set (possibly empty).
func (t *Type) Get() string {
	stored := t.name
	return stored
}

// Set replaces the stored type name with the one carried by d.
func (t *Type) Set(d Type) {
	src := d.name
	t.name = src
}

// String returns the type name, substituting prometheus
// when no name has been set.
func (t Type) String() string {
	if len(t.name) > 0 {
		return t.name
	}
	return prometheusType
}
// ValidateExpr checks that expr can be parsed by the query language
// matching the datasource type: graphiteql for graphite, metricsql for
// prometheus (an empty type name is treated as prometheus).
// An error is returned for parse failures and for unknown type names.
func (t *Type) ValidateExpr(expr string) error {
	name := t.name

	if name == graphiteType {
		_, err := graphiteql.Parse(expr)
		if err != nil {
			return fmt.Errorf("bad graphite expr: %q, err: %w", expr, err)
		}
		return nil
	}

	if name == "" || name == prometheusType {
		_, err := metricsql.Parse(expr)
		if err != nil {
			return fmt.Errorf("bad prometheus expr: %q, err: %w", expr, err)
		}
		return nil
	}

	return fmt.Errorf("unknown datasource type=%q", name)
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
// The value is decoded as a string; an empty string selects prometheus,
// and any value other than graphite or prometheus is rejected.
func (t *Type) UnmarshalYAML(unmarshal func(interface{}) error) error {
	var raw string
	err := unmarshal(&raw)
	if err != nil {
		return err
	}

	if raw == "" {
		// A missing type falls back to the prometheus datasource.
		t.name = prometheusType
		return nil
	}
	if raw != graphiteType && raw != prometheusType {
		return fmt.Errorf("unknown datasource type=%q, want %q or %q", raw, prometheusType, graphiteType)
	}
	t.name = raw
	return nil
}
// MarshalYAML implements the yaml.Marshaler interface.
// The type is serialized as its stored name; the error is always nil.
func (t Type) MarshalYAML() (interface{}, error) {
	name := t.name
	return name, nil
}

View File

@@ -6,9 +6,9 @@ import (
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strconv"
"strings"
"time"
)
type response struct {
@@ -32,11 +32,11 @@ func (r response) metrics() ([]Metric, error) {
for i, res := range r.Data.Result {
f, err = strconv.ParseFloat(res.TV[1].(string), 64)
if err != nil {
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %s", res, res.TV[1], err)
return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, res.TV[1], err)
}
m.Labels = nil
for k, v := range r.Data.Result[i].Labels {
m.Labels = append(m.Labels, Label{Name: k, Value: v})
m.AddLabel(k, v)
}
m.Timestamp = int64(res.TV[0].(float64))
m.Value = f
@@ -45,59 +45,151 @@ func (r response) metrics() ([]Metric, error) {
return ms, nil
}
const queryPath = "/api/v1/query?query="
// graphiteResponse is the JSON payload decoded by parseGraphiteResponse:
// a list of targets.
type graphiteResponse []graphiteResponseTarget

// graphiteResponseTarget is a single entry of graphiteResponse.
type graphiteResponseTarget struct {
	// Target is decoded from the "target" JSON field.
	Target string `json:"target"`
	// Tags are decoded from the "tags" JSON field; graphiteResponse.metrics
	// converts every tag into a metric label.
	Tags map[string]string `json:"tags"`
	// DataPoints are decoded from the "datapoints" JSON field. Each entry is
	// read by graphiteResponse.metrics as [value, timestamp].
	DataPoints [][2]float64 `json:"datapoints"`
}
// metrics converts the response into a list of Metric values.
// Targets without datapoints are dropped. For every other target only the
// last datapoint is used, and all target tags are attached as labels.
func (r graphiteResponse) metrics() []Metric {
	var result []Metric
	for i := range r {
		points := r[i].DataPoints
		if len(points) == 0 {
			continue
		}
		// Only the final [value, timestamp] pair of the series is reported.
		tail := points[len(points)-1]
		m := Metric{
			Value:     tail[0],
			Timestamp: int64(tail[1]),
		}
		for name, value := range r[i].Tags {
			m.AddLabel(name, value)
		}
		result = append(result, m)
	}
	return result
}
// VMStorage represents vmstorage entity with ability to read and write metrics
type VMStorage struct {
c *http.Client
queryURL string
basicAuthUser, basicAuthPass string
c *http.Client
datasourceURL string
basicAuthUser string
basicAuthPass string
lookBack time.Duration
queryStep time.Duration
}
const queryPath = "/api/v1/query"
const graphitePath = "/render"
// NewVMStorage is a constructor for VMStorage
func NewVMStorage(baseURL, basicAuthUser, basicAuthPass string, c *http.Client) *VMStorage {
func NewVMStorage(baseURL, basicAuthUser, basicAuthPass string, lookBack time.Duration, queryStep time.Duration, c *http.Client) *VMStorage {
return &VMStorage{
c: c,
basicAuthUser: basicAuthUser,
basicAuthPass: basicAuthPass,
queryURL: strings.TrimSuffix(baseURL, "/") + queryPath,
datasourceURL: strings.TrimSuffix(baseURL, "/"),
lookBack: lookBack,
queryStep: queryStep,
}
}
// Query reads metrics from datasource by given query
func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
const (
statusSuccess, statusError, rtVector = "success", "error", "vector"
)
req, err := http.NewRequest("POST", s.queryURL+url.QueryEscape(query), nil)
// Query executes query against the datasource using the request and
// response handling that match dataSourceType. An empty type name is
// handled as prometheus; an unrecognized type yields an error.
func (s *VMStorage) Query(ctx context.Context, query string, dataSourceType Type) ([]Metric, error) {
	name := dataSourceType.name
	if name == "" || name == prometheusType {
		return s.queryDataSource(ctx, query, s.setPrometheusReqParams, parsePrometheusResponse)
	}
	if name == graphiteType {
		return s.queryDataSource(ctx, query, s.setGraphiteReqParams, parseGraphiteResponse)
	}
	return nil, fmt.Errorf("engine not found: %q", dataSourceType)
}
func (s *VMStorage) queryDataSource(
ctx context.Context,
query string,
setReqParams func(r *http.Request, query string),
processResponse func(r *http.Request, resp *http.Response,
) ([]Metric, error)) ([]Metric, error) {
req, err := http.NewRequest("POST", s.datasourceURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Content-Type", "application/json; charset=utf-8")
if s.basicAuthPass != "" {
req.SetBasicAuth(s.basicAuthUser, s.basicAuthPass)
}
setReqParams(req, query)
resp, err := s.c.Do(req.WithContext(ctx))
if err != nil {
return nil, fmt.Errorf("error getting response from %s:%s", req.URL, err)
return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK {
body, _ := ioutil.ReadAll(resp.Body)
return nil, fmt.Errorf("datasource returns unxeprected response code %d for %s with err %s. Reponse body %s", resp.StatusCode, req.URL, err, body)
return nil, fmt.Errorf("datasource returns unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
}
return processResponse(req, resp)
}
// setPrometheusReqParams turns r into an instant query request: it appends
// queryPath to the URL path and sets the "query" arg. The "time" arg
// (now minus lookBack, unix seconds) and the "step" arg are added only when
// the corresponding VMStorage settings are positive.
func (s *VMStorage) setPrometheusReqParams(r *http.Request, query string) {
	r.URL.Path += queryPath

	params := r.URL.Query()
	params.Set("query", query)
	if s.lookBack > 0 {
		evalTime := time.Now().Add(-s.lookBack).Unix()
		params.Set("time", fmt.Sprintf("%d", evalTime))
	}
	if s.queryStep > 0 {
		step := s.queryStep.String()
		params.Set("step", step)
	}
	r.URL.RawQuery = params.Encode()
}
// setGraphiteReqParams turns r into a render request: it appends
// graphitePath to the URL path and sets the "format", "target", "from" and
// "until" args. "from" is "-5min" unless lookBack is positive, in which
// case it is now minus lookBack in unix seconds.
func (s *VMStorage) setGraphiteReqParams(r *http.Request, query string) {
	r.URL.Path += graphitePath

	params := r.URL.Query()
	params.Set("format", "json")
	params.Set("target", query)
	if s.lookBack > 0 {
		start := time.Now().Add(-s.lookBack).Unix()
		params.Set("from", strconv.FormatInt(start, 10))
	} else {
		params.Set("from", "-5min")
	}
	params.Set("until", "now")
	r.URL.RawQuery = params.Encode()
}
const (
statusSuccess, statusError, rtVector = "success", "error", "vector"
)
func parsePrometheusResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
r := &response{}
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
return nil, fmt.Errorf("error parsing metrics for %s:%s", req.URL, err)
return nil, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL, err)
}
if r.Status == statusError {
return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
}
if r.Status != statusSuccess {
return nil, fmt.Errorf("unkown status:%s, Expected success or error ", r.Status)
return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
}
if r.Data.ResultType != rtVector {
return nil, fmt.Errorf("unkown restul type:%s. Expected vector", r.Data.ResultType)
return nil, fmt.Errorf("unknown result type:%s. Expected vector", r.Data.ResultType)
}
return r.metrics()
}
// parseGraphiteResponse decodes the JSON body of resp as a graphiteResponse
// and converts it into metrics. req is used only to report the URL in the
// decoding error.
func parseGraphiteResponse(req *http.Request, resp *http.Response) ([]Metric, error) {
	var targets graphiteResponse
	decoder := json.NewDecoder(resp.Body)
	if err := decoder.Decode(&targets); err != nil {
		return nil, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL, err)
	}
	return targets.metrics(), nil
}

View File

@@ -4,7 +4,9 @@ import (
"context"
"net/http"
"net/http/httptest"
"strconv"
"testing"
"time"
)
var (
@@ -12,6 +14,7 @@ var (
basicAuthName = "foo"
basicAuthPass = "bar"
query = "vm_rows"
queryRender = "constantLine(10)"
)
func TestVMSelectQuery(t *testing.T) {
@@ -20,6 +23,13 @@ func TestVMSelectQuery(t *testing.T) {
t.Errorf("should not be called")
})
c := -1
mux.HandleFunc("/render", func(w http.ResponseWriter, request *http.Request) {
c++
switch c {
case 7:
w.Write([]byte(`[{"target":"constantLine(10)","tags":{"name":"constantLine(10)"},"datapoints":[[10,1611758343],[10,1611758373],[10,1611758403]]}]`))
}
})
mux.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
c++
if r.Method != http.MethodPost {
@@ -29,7 +39,14 @@ func TestVMSelectQuery(t *testing.T) {
t.Errorf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass)
}
if r.URL.Query().Get("query") != query {
t.Errorf("exptected %s in query param, got %s", query, r.URL.Query().Get("query"))
t.Errorf("expected %s in query param, got %s", query, r.URL.Query().Get("query"))
}
timeParam := r.URL.Query().Get("time")
if timeParam == "" {
t.Errorf("expected 'time' in query param, got nil instead")
}
if _, err := strconv.ParseInt(timeParam, 10, 64); err != nil {
t.Errorf("failed to parse 'time' query param: %s", err)
}
switch c {
case 0:
@@ -52,31 +69,31 @@ func TestVMSelectQuery(t *testing.T) {
srv := httptest.NewServer(mux)
defer srv.Close()
am := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, srv.Client())
if _, err := am.Query(ctx, query); err == nil {
am := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, time.Minute, 0, srv.Client())
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
t.Fatalf("expected connection error got nil")
}
if _, err := am.Query(ctx, query); err == nil {
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
t.Fatalf("expected invalid response status error got nil")
}
if _, err := am.Query(ctx, query); err == nil {
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
t.Fatalf("expected response body error got nil")
}
if _, err := am.Query(ctx, query); err == nil {
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
t.Fatalf("expected error status got nil")
}
if _, err := am.Query(ctx, query); err == nil {
t.Fatalf("expected unkown status got nil")
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
t.Fatalf("expected unknown status got nil")
}
if _, err := am.Query(ctx, query); err == nil {
if _, err := am.Query(ctx, query, NewPrometheusType()); err == nil {
t.Fatalf("expected non-vector resultType error got nil")
}
m, err := am.Query(ctx, query)
m, err := am.Query(ctx, query, NewPrometheusType())
if err != nil {
t.Fatalf("unexpected %s", err)
}
if len(m) != 1 {
t.Fatalf("exptected 1 metric got %d in %+v", len(m), m)
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
}
expected := Metric{
Labels: []Label{{Value: "vm_rows", Name: "__name__"}},
@@ -89,5 +106,22 @@ func TestVMSelectQuery(t *testing.T) {
m[0].Labels[0].Name != expected.Labels[0].Name {
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
}
m, err = am.Query(ctx, queryRender, NewGraphiteType())
if err != nil {
t.Fatalf("unexpected %s", err)
}
if len(m) != 1 {
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
}
expected = Metric{
Labels: []Label{{Value: "constantLine(10)", Name: "name"}},
Timestamp: 1611758403,
Value: 10,
}
if m[0].Timestamp != expected.Timestamp &&
m[0].Value != expected.Value &&
m[0].Labels[0].Value != expected.Labels[0].Value &&
m[0].Labels[0].Name != expected.Labels[0].Name {
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
}
}

View File

@@ -11,6 +11,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metrics"
)
@@ -21,26 +22,46 @@ type Group struct {
Name string
File string
Rules []Rule
Type datasource.Type
Interval time.Duration
Concurrency int
Checksum string
doneCh chan struct{}
finishedCh chan struct{}
// channel accepts new Group obj
// which supposed to update current group
updateCh chan *Group
metrics *groupMetrics
}
func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
// groupMetrics holds the iteration metrics of a single group.
type groupMetrics struct {
	iterationTotal    *counter
	iterationDuration *summary
}

// newGroupMetrics obtains the iteration counter and duration summary
// labeled with the given group name and file.
func newGroupMetrics(name, file string) *groupMetrics {
	labels := fmt.Sprintf(`group=%q, file=%q`, name, file)
	totalName := fmt.Sprintf(`vmalert_iteration_total{%s}`, labels)
	durationName := fmt.Sprintf(`vmalert_iteration_duration_seconds{%s}`, labels)
	return &groupMetrics{
		iterationTotal:    getOrCreateCounter(totalName),
		iterationDuration: getOrCreateSummary(durationName),
	}
}
func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string]string) *Group {
g := &Group{
Type: cfg.Type,
Name: cfg.Name,
File: cfg.File,
Interval: cfg.Interval,
Concurrency: cfg.Concurrency,
Checksum: cfg.Checksum,
doneCh: make(chan struct{}),
finishedCh: make(chan struct{}),
updateCh: make(chan *Group),
}
g.metrics = newGroupMetrics(g.Name, g.File)
if g.Interval == 0 {
g.Interval = defaultInterval
}
@@ -49,6 +70,17 @@ func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
}
rules := make([]Rule, len(cfg.Rules))
for i, r := range cfg.Rules {
// override rule labels with external labels
for k, v := range labels {
if prevV, ok := r.Labels[k]; ok {
logger.Infof("label %q=%q for rule %q.%q overwritten with external label %q=%q",
k, prevV, g.Name, r.Name(), k, v)
}
if r.Labels == nil {
r.Labels = map[string]string{}
}
r.Labels[k] = v
}
rules[i] = g.newRule(r)
}
g.Rules = rules
@@ -57,9 +89,9 @@ func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
func (g *Group) newRule(rule config.Rule) Rule {
if rule.Alert != "" {
return newAlertingRule(g.ID(), rule)
return newAlertingRule(g, rule)
}
return newRecordingRule(g.ID(), rule)
return newRecordingRule(g, rule)
}
// ID return unique group ID that consists of
@@ -69,11 +101,12 @@ func (g *Group) ID() uint64 {
hash.Write([]byte(g.File))
hash.Write([]byte("\xff"))
hash.Write([]byte(g.Name))
hash.Write([]byte(g.Type.Get()))
return hash.Sum64()
}
// Restore restores alerts state for group rules
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
for _, rule := range g.Rules {
rr, ok := rule.(*AlertingRule)
if !ok {
@@ -82,8 +115,8 @@ func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time
if rr.For < 1 {
continue
}
if err := rr.Restore(ctx, q, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %s", rule, err)
if err := rr.Restore(ctx, q, lookback, labels); err != nil {
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
}
}
return nil
@@ -105,6 +138,7 @@ func (g *Group) updateWith(newGroup *Group) error {
if !ok {
// old rule is not present in the new list
// so we mark it for removing
g.Rules[i].Close()
g.Rules[i] = nil
continue
}
@@ -126,25 +160,17 @@ func (g *Group) updateWith(newGroup *Group) error {
for _, nr := range rulesRegistry {
newRules = append(newRules, nr)
}
g.Type = newGroup.Type
g.Concurrency = newGroup.Concurrency
g.Checksum = newGroup.Checksum
g.Rules = newRules
return nil
}
var (
iterationTotal = metrics.NewCounter(`vmalert_iteration_total`)
iterationDuration = metrics.NewSummary(`vmalert_iteration_duration_seconds`)
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
remoteWriteSent = metrics.NewCounter(`vmalert_remotewrite_sent_total`)
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
)
func (g *Group) close() {
@@ -153,12 +179,41 @@ func (g *Group) close() {
}
close(g.doneCh)
<-g.finishedCh
metrics.UnregisterMetric(g.metrics.iterationDuration.name)
metrics.UnregisterMetric(g.metrics.iterationTotal.name)
for _, rule := range g.Rules {
rule.Close()
}
}
func (g *Group) start(ctx context.Context, querier datasource.Querier, nr notifier.Notifier, rw *remotewrite.Client) {
var skipRandSleepOnGroupStart bool
func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []notifier.Notifier, rw *remotewrite.Client) {
defer func() { close(g.finishedCh) }()
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
if !skipRandSleepOnGroupStart {
randSleep := uint64(float64(g.Interval) * (float64(uint32(g.ID())) / (1 << 32)))
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
if randSleep < sleepOffset {
randSleep += uint64(g.Interval)
}
randSleep -= sleepOffset
sleepTimer := time.NewTimer(time.Duration(randSleep))
select {
case <-ctx.Done():
sleepTimer.Stop()
return
case <-g.doneCh:
sleepTimer.Stop()
return
case <-sleepTimer.C:
}
}
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
e := &executor{querier, nr, rw}
e := &executor{querier, nts, rw}
t := time.NewTicker(g.Interval)
defer t.Stop()
for {
@@ -185,7 +240,7 @@ func (g *Group) start(ctx context.Context, querier datasource.Querier, nr notifi
g.mu.Unlock()
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
case <-t.C:
iterationTotal.Inc()
g.metrics.iterationTotal.Inc()
iterationStart := time.Now()
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval)
@@ -195,15 +250,15 @@ func (g *Group) start(ctx context.Context, querier datasource.Querier, nr notifi
}
}
iterationDuration.UpdateDuration(iterationStart)
g.metrics.iterationDuration.UpdateDuration(iterationStart)
}
}
}
type executor struct {
querier datasource.Querier
notifier notifier.Notifier
rw *remotewrite.Client
querier datasource.Querier
notifiers []notifier.Notifier
rw *remotewrite.Client
}
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
@@ -240,6 +295,14 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
return res
}
var (
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
)
func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, interval time.Duration) error {
execTotal.Inc()
execStart := time.Now()
@@ -250,15 +313,14 @@ func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, inter
tss, err := rule.Exec(ctx, e.querier, returnSeries)
if err != nil {
execErrors.Inc()
return fmt.Errorf("rule %q: failed to execute: %s", rule, err)
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
}
if len(tss) > 0 && e.rw != nil {
remoteWriteSent.Add(len(tss))
for _, ts := range tss {
if err := e.rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
return fmt.Errorf("rule %q: remote write failure: %s", rule, err)
return fmt.Errorf("rule %q: remote write failure: %w", rule, err)
}
}
}
@@ -286,10 +348,14 @@ func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, inter
if len(alerts) < 1 {
return nil
}
alertsSent.Add(len(alerts))
if err := e.notifier.Send(ctx, alerts); err != nil {
alertsSendErrors.Inc()
return fmt.Errorf("rule %q: failed to send alerts: %s", rule, err)
errGr := new(utils.ErrGroup)
for _, nt := range e.notifiers {
if err := nt.Send(ctx, alerts); err != nil {
alertsSendErrors.Inc()
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
}
}
return nil
return errGr.Err()
}

View File

@@ -10,6 +10,12 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
skipRandSleepOnGroupStart = true
}
func TestUpdateWith(t *testing.T) {
testCases := []struct {
name string
@@ -26,7 +32,7 @@ func TestUpdateWith(t *testing.T) {
[]config.Rule{{
Alert: "foo",
Expr: "up > 0",
For: time.Second,
For: config.NewPromDuration(time.Second),
Labels: map[string]string{
"bar": "baz",
},
@@ -38,7 +44,7 @@ func TestUpdateWith(t *testing.T) {
[]config.Rule{{
Alert: "foo",
Expr: "up > 10",
For: time.Second,
For: config.NewPromDuration(time.Second),
Labels: map[string]string{
"baz": "bar",
},
@@ -150,7 +156,7 @@ func TestGroupStart(t *testing.T) {
t.Fatalf("failed to parse rules: %s", err)
}
const evalInterval = time.Millisecond
g := newGroup(groups[0], evalInterval)
g := newGroup(groups[0], evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2
fn := &fakeNotifier{}
@@ -161,25 +167,35 @@ func TestGroupStart(t *testing.T) {
m2 := metricWithLabels(t, "instance", inst2, "job", job)
r := g.Rules[0].(*AlertingRule)
alert1, err := r.newAlert(m1, time.Now())
alert1, err := r.newAlert(m1, time.Now(), nil)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
alert1.State = notifier.StateFiring
// add external label
alert1.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
alert1.Labels["label"] = "bar"
alert1.Labels["host"] = inst1
alert1.ID = hash(m1)
alert2, err := r.newAlert(m2, time.Now())
alert2, err := r.newAlert(m2, time.Now(), nil)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
alert2.State = notifier.StateFiring
// add external label
alert2.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
alert2.Labels["label"] = "bar"
alert2.Labels["host"] = inst2
alert2.ID = hash(m2)
finished := make(chan struct{})
fs.add(m1)
fs.add(m2)
go func() {
g.start(context.Background(), fs, fn, nil)
g.start(context.Background(), fs, []notifier.Notifier{fn}, nil)
close(finished)
}()

View File

@@ -38,7 +38,7 @@ func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
fq.Unlock()
}
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ datasource.Type) ([]datasource.Metric, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {

View File

@@ -2,20 +2,18 @@ package main
import (
"context"
"crypto/tls"
"crypto/x509"
"flag"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"os"
"strconv"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
@@ -24,7 +22,6 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics"
)
@@ -32,69 +29,26 @@ var (
rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule /path/to/file. Path to a single file with alerting rules
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.`)
-rule="/path/to/file". Path to a single file with alerting rules
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.`)
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
datasourceURL = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
" E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
datasourceTLSInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
datasourceTLSCertFile = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
datasourceTLSKeyFile = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
datasourceTLSCAFile = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
"By default system CA is used")
datasourceTLSServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
"By default the server name from -datasource.url is used")
remoteWriteURL = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
remoteWriteUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
remoteWritePassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
remoteWriteMaxQueueSize = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
remoteWriteMaxBatchSize = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines defines max number of timeseries to be flushed at once")
remoteWriteConcurrency = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote storage")
remoteWriteTLSInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
remoteWriteTLSCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
remoteWriteTLSKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
remoteWriteTLSCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
"By default system CA is used")
remoteWriteTLSServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
"By default the server name from -remoteWrite.url is used")
remoteReadURL = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
" E.g. http://127.0.0.1:8428")
remoteReadUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
remoteReadPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
remoteReadTLSInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
remoteReadTLSCertFile = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
remoteReadTLSKeyFile = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
remoteReadTLSCAFile = flag.String("remoteRead.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. "+
"By default system CA is used")
remoteReadTLSServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
"By default the server name from -remoteRead.url is used")
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")
notifierURL = flag.String("notifier.url", "", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
notifierTLSInsecureSkipVerify = flag.Bool("notifier.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -notifier.url")
notifierTLSCertFile = flag.String("notifier.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
notifierTLSKeyFile = flag.String("notifier.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -notifier.url")
notifierTLSCAFile = flag.String("notifier.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -notifier.url. "+
"By default system CA is used")
notifierTLSServerName = flag.String("notifier.tlsServerName", "", "Optional TLS server name to use for connections to -notifier.url. "+
"By default the server name from -notifier.url is used")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'name=value' to add to all generated recording rules and alerts. "+
"Pass multiple -label flags in order to add multiple label sets.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The `-rule` flag must be specified.")
)
func main() {
@@ -104,64 +58,24 @@ func main() {
envflag.Parse()
buildinfo.Init()
logger.Init()
checkFlags()
if *dryRun {
u, _ := url.Parse("https://victoriametrics.com/")
notifier.InitTemplateFunc(u)
groups, err := config.Parse(*rulePath, true, true)
if err != nil {
logger.Fatalf(err.Error())
}
if len(groups) == 0 {
logger.Fatalf("No rules for validation. Please specify path to file(s) with alerting and/or recording rules using `-rule` flag")
}
return
}
ctx, cancel := context.WithCancel(context.Background())
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
manager, err := newManager(ctx)
if err != nil {
logger.Fatalf("can not get external url: %s ", err)
logger.Fatalf("failed to init: %s", err)
}
notifier.InitTemplateFunc(eu)
aug, err := getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates)
if err != nil {
logger.Fatalf("URL generator error: %s", err)
}
dst, err := getTransport(datasourceURL, datasourceTLSCertFile, datasourceTLSKeyFile, datasourceTLSCAFile, datasourceTLSServerName, datasourceTLSInsecureSkipVerify)
if err != nil {
logger.Fatalf("cannot create datasource transport: %s", err)
}
nt, err := getTransport(notifierURL, notifierTLSCertFile, notifierTLSKeyFile, notifierTLSCAFile, notifierTLSServerName, notifierTLSInsecureSkipVerify)
if err != nil {
logger.Fatalf("cannot create notifier transport: %s", err)
}
manager := &manager{
groups: make(map[uint64]*Group),
storage: datasource.NewVMStorage(*datasourceURL, *basicAuthUsername, *basicAuthPassword, &http.Client{Transport: dst}),
notifier: notifier.NewAlertManager(*notifierURL, aug, &http.Client{Transport: nt}),
}
if *remoteWriteURL != "" {
t, err := getTransport(remoteWriteURL, remoteWriteTLSCertFile, remoteWriteTLSKeyFile, remoteWriteTLSCAFile, remoteWriteTLSServerName, remoteWriteTLSInsecureSkipVerify)
if err != nil {
logger.Fatalf("cannot create remoteWrite transport: %s", err)
}
c, err := remotewrite.NewClient(ctx, remotewrite.Config{
Addr: *remoteWriteURL,
Concurrency: *remoteWriteConcurrency,
MaxQueueSize: *remoteWriteMaxQueueSize,
MaxBatchSize: *remoteWriteMaxBatchSize,
FlushInterval: *evaluationInterval,
BasicAuthUser: *remoteWriteUsername,
BasicAuthPass: *remoteWritePassword,
Transport: t,
})
if err != nil {
logger.Fatalf("failed to init remotewrite client: %s", err)
}
manager.rw = c
}
if *remoteReadURL != "" {
t, err := getTransport(remoteReadURL, remoteReadTLSCertFile, remoteReadTLSKeyFile, remoteReadTLSCAFile, remoteReadTLSServerName, remoteReadTLSInsecureSkipVerify)
if err != nil {
logger.Fatalf("cannot create remoteRead transport: %s", err)
}
manager.rr = datasource.NewVMStorage(*remoteReadURL, *remoteReadUsername, *remoteReadPassword, &http.Client{Transport: t})
}
if err := manager.start(ctx, *rulePath, *validateTemplates, *validateExpressions); err != nil {
logger.Fatalf("failed to start: %s", err)
}
@@ -206,6 +120,56 @@ var (
configTimestamp = metrics.NewCounter(`vmalert_config_last_reload_success_timestamp_seconds`)
)
// newManager builds a manager from the values of command-line flags.
//
// It initializes, in this order: the datasource querier, the external URL
// (which is also passed to notifier.InitTemplateFunc), the alert source URL
// generator, the notifiers, the remote-write client and the remote-read
// querier. Afterwards every non-empty `-external.label` value is split at its
// first '=' into a name/value pair and stored in manager.labels.
//
// The first failing step aborts construction; its error is returned wrapped
// with a message naming the step.
func newManager(ctx context.Context) (*manager, error) {
	q, err := datasource.Init()
	if err != nil {
		return nil, fmt.Errorf("failed to init datasource: %w", err)
	}
	eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
	if err != nil {
		return nil, fmt.Errorf("failed to init `external.url`: %w", err)
	}
	// Template functions must be initialized before the alert URL generator
	// is created, since the generator may validate a template.
	notifier.InitTemplateFunc(eu)
	aug, err := getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates)
	if err != nil {
		return nil, fmt.Errorf("failed to init `external.alert.source`: %w", err)
	}
	nts, err := notifier.Init(aug)
	if err != nil {
		return nil, fmt.Errorf("failed to init notifier: %w", err)
	}

	manager := &manager{
		groups:    make(map[uint64]*Group),
		querier:   q,
		notifiers: nts,
		labels:    map[string]string{},
	}
	rw, err := remotewrite.Init(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
	}
	manager.rw = rw

	rr, err := remoteread.Init()
	if err != nil {
		return nil, fmt.Errorf("failed to init remoteRead: %w", err)
	}
	manager.rr = rr

	for _, s := range *externalLabels {
		if len(s) == 0 {
			continue
		}
		// Split at the first '=' only, so the value itself may contain '='.
		n := strings.IndexByte(s, '=')
		if n < 0 {
			// The flag is registered as `external.label`; name it correctly
			// so the user can find the offending argument.
			return nil, fmt.Errorf("missing '=' in `-external.label`. It must contain label in the form `name=value`; got %q", s)
		}
		manager.labels[s[:n]] = s[n+1:]
	}
	return manager, nil
}
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
if externalURL != "" {
return url.Parse(externalURL)
@@ -235,14 +199,14 @@ func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, vali
if err := notifier.ValidateTemplates(map[string]string{
"tpl": externalAlertSource,
}); err != nil {
return nil, fmt.Errorf("error validating source template %s:%w", externalAlertSource, err)
return nil, fmt.Errorf("error validating source template %s: %w", externalAlertSource, err)
}
}
m := map[string]string{
"tpl": externalAlertSource,
}
return func(alert notifier.Alert) string {
templated, err := alert.ExecTemplate(m)
templated, err := alert.ExecTemplate(nil, m)
if err != nil {
logger.Errorf("can not exec source template %s", err)
}
@@ -250,76 +214,11 @@ func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, vali
}, nil
}
// getTLSConfig builds a *tls.Config from the given flag values.
//
// If *certFile is non-empty, the certificate/key pair from *certFile and
// *keyFile is loaded and used as the client certificate; when *certFile is
// empty, *keyFile is not read at all. If *CAFile is non-empty, its PEM
// contents become the root CA pool; otherwise RootCAs is left nil.
// *serverName and *insecureSkipVerify are copied into the config as-is.
//
// Errors from loading the key pair or reading the CA file are wrapped with
// %w, so callers can inspect the underlying cause via errors.Is / errors.As.
func getTLSConfig(certFile, keyFile, CAFile, serverName *string, insecureSkipVerify *bool) (*tls.Config, error) {
	var certs []tls.Certificate
	if *certFile != "" {
		cert, err := tls.LoadX509KeyPair(*certFile, *keyFile)
		if err != nil {
			return nil, fmt.Errorf("cannot load TLS certificate from `cert_file`=%q, `key_file`=%q: %w", *certFile, *keyFile, err)
		}
		certs = []tls.Certificate{cert}
	}

	var rootCAs *x509.CertPool
	if *CAFile != "" {
		pem, err := ioutil.ReadFile(*CAFile)
		if err != nil {
			return nil, fmt.Errorf("cannot read `ca_file` %q: %w", *CAFile, err)
		}
		rootCAs = x509.NewCertPool()
		// AppendCertsFromPEM reports false when no certificate could be parsed.
		if !rootCAs.AppendCertsFromPEM(pem) {
			return nil, fmt.Errorf("cannot parse data from `ca_file` %q", *CAFile)
		}
	}

	return &tls.Config{
		Certificates:       certs,
		InsecureSkipVerify: *insecureSkipVerify,
		RootCAs:            rootCAs,
		ServerName:         *serverName,
	}, nil
}
// getTransport returns an HTTP transport suited for the scheme of *URL.
//
// For any scheme other than "https" it returns a nil transport and a nil
// error. For "https" it clones http.DefaultTransport and installs the TLS
// configuration built by getTLSConfig from the remaining arguments; an error
// from getTLSConfig is returned unchanged.
func getTransport(URL, certFile, keyFile, CAFile, serverName *string, insecureSkipVerify *bool) (*http.Transport, error) {
	var parsed fasthttp.URI
	parsed.Update(*URL)
	if scheme := string(parsed.Scheme()); scheme != "https" {
		// No custom transport for non-TLS endpoints.
		return nil, nil
	}

	tr := http.DefaultTransport.(*http.Transport).Clone()
	cfg, err := getTLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err != nil {
		return nil, err
	}
	tr.TLSClientConfig = cfg
	return tr, nil
}
// checkFlags verifies that the -notifier.url and -datasource.url flags are
// non-empty. For each empty flag it prints the flag defaults and reports the
// problem via logger.Fatalf.
func checkFlags() {
	if len(*notifierURL) == 0 {
		flag.PrintDefaults()
		logger.Fatalf("notifier.url is empty")
	}
	if len(*datasourceURL) == 0 {
		flag.PrintDefaults()
		logger.Fatalf("datasource.url is empty")
	}
}
func usage() {
const s = `
vmalert processes alerts and recording rules.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md .
See the docs at https://victoriametrics.github.io/vmalert.html .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
flagutil.Usage(s)
}

View File

@@ -41,7 +41,7 @@ func TestGetAlertURLGenerator(t *testing.T) {
}
_, err = getAlertURLGenerator(nil, "foo?{{invalid}}", true)
if err == nil {
t.Errorf("exptected tempalte validation error got nil")
t.Errorf("expected tempalte validation error got nil")
}
fn, err = getAlertURLGenerator(u, "foo?query={{$value}}", true)
if err != nil {
@@ -51,55 +51,3 @@ func TestGetAlertURLGenerator(t *testing.T) {
t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
}
}
// TestGetTLSConfig checks that getTLSConfig copies ServerName and
// InsecureSkipVerify into the resulting config, and that it fails when the
// certificate file or the CA file does not exist.
func TestGetTLSConfig(t *testing.T) {
	var certFile, keyFile, CAFile, serverName string
	var insecureSkipVerify bool
	serverName = "test"
	insecureSkipVerify = true
	tlsCfg, err := getTLSConfig(&certFile, &keyFile, &CAFile, &serverName, &insecureSkipVerify)
	// Use Fatalf for the preconditions: the field checks below dereference
	// tlsCfg and would panic on nil instead of reporting a test failure.
	if err != nil {
		t.Fatalf("unexpected error %s", err)
	}
	if tlsCfg == nil {
		t.Fatalf("expected tlsConfig to be set, got nil")
	}
	if tlsCfg.ServerName != serverName {
		t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)
	}
	if tlsCfg.InsecureSkipVerify != insecureSkipVerify {
		t.Errorf("unexpected InsecureSkipVerify, want %v, got %v", insecureSkipVerify, tlsCfg.InsecureSkipVerify)
	}

	// A non-existing certificate file must produce a key pair loading error.
	certFile = "/path/to/nonexisting/cert/file"
	_, err = getTLSConfig(&certFile, &keyFile, &CAFile, &serverName, &insecureSkipVerify)
	if err == nil {
		t.Errorf("expected keypair error, got nil")
	}

	// A non-existing CA file must produce a read error.
	certFile = ""
	CAFile = "/path/to/nonexisting/cert/file"
	_, err = getTLSConfig(&certFile, &keyFile, &CAFile, &serverName, &insecureSkipVerify)
	if err == nil {
		t.Errorf("expected read error, got nil")
	}
}
// TestGetTransport checks that getTransport returns a nil transport for an
// "http" URL and a transport with TLSClientConfig set for an "https" URL.
func TestGetTransport(t *testing.T) {
	var certFile, keyFile, CAFile, serverName string
	var insecureSkipVerify bool

	URL := "http://victoriametrics.com"
	tr, err := getTransport(&URL, &certFile, &keyFile, &CAFile, &serverName, &insecureSkipVerify)
	if err != nil {
		t.Errorf("unexpected error %s", err)
	}
	if tr != nil {
		t.Errorf("expected Transport to be nil, got %v", tr)
	}

	URL = "https://victoriametrics.com"
	tr, err = getTransport(&URL, &certFile, &keyFile, &CAFile, &serverName, &insecureSkipVerify)
	// Stop here on failure: tr is dereferenced below and would panic on nil
	// instead of reporting a test failure.
	if err != nil {
		t.Fatalf("unexpected error %s", err)
	}
	if tr == nil {
		t.Fatalf("expected Transport to be set, got nil")
	}
	if tr.TLSClientConfig == nil {
		t.Errorf("expected TLSClientConfig to be set, got nil")
	}
}

View File

@@ -15,13 +15,14 @@ import (
// manager controls group states
type manager struct {
storage datasource.Querier
notifier notifier.Notifier
querier datasource.Querier
notifiers []notifier.Notifier
rw *remotewrite.Client
rr datasource.Querier
wg sync.WaitGroup
wg sync.WaitGroup
labels map[string]string
groupsMu sync.RWMutex
groups map[uint64]*Group
@@ -64,7 +65,7 @@ func (m *manager) close() {
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
if restore && m.rr != nil {
err := group.Restore(ctx, m.rr, *remoteReadLookBack)
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
if err != nil {
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
}
@@ -73,7 +74,7 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
m.wg.Add(1)
id := group.ID()
go func() {
group.start(ctx, m.storage, m.notifier, m.rw)
group.start(ctx, m.querier, m.notifiers, m.rw)
m.wg.Done()
}()
m.groups[id] = group
@@ -83,42 +84,65 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
groupsCfg, err := config.Parse(path, validateTpl, validateExpr)
if err != nil {
return fmt.Errorf("cannot parse configuration file: %s", err)
return fmt.Errorf("cannot parse configuration file: %w", err)
}
groupsRegistry := make(map[uint64]*Group)
for _, cfg := range groupsCfg {
ng := newGroup(cfg, *evaluationInterval)
ng := newGroup(cfg, *evaluationInterval, m.labels)
groupsRegistry[ng.ID()] = ng
}
type updateItem struct {
old *Group
new *Group
}
var toUpdate []updateItem
m.groupsMu.Lock()
for _, og := range m.groups {
ng, ok := groupsRegistry[og.ID()]
if !ok {
// old group is not present in new list
// and must be stopped and deleted
// old group is not present in new list,
// so must be stopped and deleted
og.close()
delete(m.groups, og.ID())
og = nil
continue
}
og.updateCh <- ng
delete(groupsRegistry, ng.ID())
if og.Checksum != ng.Checksum {
toUpdate = append(toUpdate, updateItem{old: og, new: ng})
}
}
for _, ng := range groupsRegistry {
m.startGroup(ctx, ng, restore)
}
m.groupsMu.Unlock()
if len(toUpdate) > 0 {
var wg sync.WaitGroup
for _, item := range toUpdate {
wg.Add(1)
go func(old *Group, new *Group) {
old.updateCh <- new
wg.Done()
}(item.old, item.new)
}
wg.Wait()
}
return nil
}
func (g *Group) toAPI() APIGroup {
g.mu.RLock()
defer g.mu.RUnlock()
ag := APIGroup{
// encode as strings to avoid rounding
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.String(),
Concurrency: g.Concurrency,

View File

@@ -5,11 +5,12 @@ import (
"math/rand"
"net/url"
"os"
"strings"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
@@ -19,16 +20,15 @@ func TestMain(m *testing.M) {
os.Exit(m.Run())
}
func TestManagerUpdateError(t *testing.T) {
// TestManagerEmptyRulesDir tests
// successful cases of
// starting with empty rules folder
func TestManagerEmptyRulesDir(t *testing.T) {
m := &manager{groups: make(map[uint64]*Group)}
path := []string{"foo/bar"}
err := m.update(context.Background(), path, true, true, false)
if err == nil {
t.Fatalf("expected to have err; got nil instead")
}
expErr := "no groups found"
if !strings.Contains(err.Error(), expErr) {
t.Fatalf("expected to got err %s; got %s", expErr, err)
if err != nil {
t.Fatalf("expected to load succesfully with empty rules dir; got err instead: %v", err)
}
}
@@ -37,9 +37,9 @@ func TestManagerUpdateError(t *testing.T) {
// Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
storage: &fakeQuerier{},
notifier: &fakeNotifier{},
groups: make(map[uint64]*Group),
querier: &fakeQuerier{},
notifiers: []notifier.Notifier{&fakeNotifier{}},
}
paths := []string{
"config/testdata/dir/rules0-good.rules",
@@ -108,6 +108,18 @@ func TestManagerUpdate(t *testing.T) {
Name: "ExampleAlertAlwaysFiring",
Expr: "sum by(job) (up == 1)",
}
ExampleAlertGraphite = &AlertingRule{
Name: "up graphite",
Expr: "filterSeries(time('host.1',20),'>','0')",
Type: datasource.NewGraphiteType(),
For: defaultEvalInterval,
}
ExampleAlertGraphite2 = &AlertingRule{
Name: "up",
Expr: "filterSeries(time('host.2',20),'>','0')",
Type: datasource.NewGraphiteType(),
For: defaultEvalInterval,
}
)
testCases := []struct {
@@ -124,6 +136,7 @@ func TestManagerUpdate(t *testing.T) {
{
File: "config/testdata/dir/rules1-good.rules",
Name: "duplicatedGroupDiffFiles",
Type: datasource.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{
&AlertingRule{
@@ -148,12 +161,14 @@ func TestManagerUpdate(t *testing.T) {
{
File: "config/testdata/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: datasource.NewPrometheusType(),
Rules: []Rule{VMRows},
Interval: defaultEvalInterval,
},
{
File: "config/testdata/rules0-good.rules",
Interval: defaultEvalInterval,
Type: datasource.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{
Conns,
ExampleAlertAlwaysFiring,
@@ -168,23 +183,66 @@ func TestManagerUpdate(t *testing.T) {
{
File: "config/testdata/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: datasource.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
},
{
File: "config/testdata/rules0-good.rules",
Interval: defaultEvalInterval,
Name: "TestGroup", Rules: []Rule{
Name: "TestGroup",
Type: datasource.NewPrometheusType(),
Rules: []Rule{
Conns,
ExampleAlertAlwaysFiring,
}},
},
},
{
name: "update empty dir rules from 0 to 2 groups",
initPath: "config/testdata/empty/*",
updatePath: "config/testdata/rules0-good.rules",
want: []*Group{
{
File: "config/testdata/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: datasource.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
},
{
File: "config/testdata/rules0-good.rules",
Interval: defaultEvalInterval,
Type: datasource.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{
Conns,
ExampleAlertAlwaysFiring,
},
},
},
},
{
name: "update prometheus to graphite type",
initPath: "config/testdata/dir/rules-update0-good.rules",
updatePath: "config/testdata/dir/rules-update1-good.rules",
want: []*Group{
{
File: "config/testdata/dir/rules-update1-good.rules",
Interval: defaultEvalInterval,
Type: datasource.NewGraphiteType(),
Name: "TestUpdateGroup",
Rules: []Rule{
ExampleAlertGraphite2,
ExampleAlertGraphite,
},
},
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
m := &manager{groups: make(map[uint64]*Group), storage: &fakeQuerier{}}
m := &manager{groups: make(map[uint64]*Group), querier: &fakeQuerier{}}
path := []string{tc.initPath}
if err := m.update(ctx, path, true, true, false); err != nil {
t.Fatalf("failed to complete initial rules update: %s", err)

39
app/vmalert/metrics.go Normal file
View File

@@ -0,0 +1,39 @@
package main
import "github.com/VictoriaMetrics/metrics"
// gauge couples a metrics.Gauge with the name it was registered under.
type gauge struct {
	// name is the metric name passed to metrics.GetOrCreateGauge by
	// getOrCreateGauge.
	name string
	*metrics.Gauge
}
// getOrCreateGauge obtains the gauge called name via metrics.GetOrCreateGauge,
// passing f as its callback, and returns it together with that name.
func getOrCreateGauge(name string, f func() float64) *gauge {
	g := new(gauge)
	g.name = name
	g.Gauge = metrics.GetOrCreateGauge(name, f)
	return g
}
// counter couples a metrics.Counter with the name it was registered under.
type counter struct {
	// name is the metric name passed to metrics.GetOrCreateCounter by
	// getOrCreateCounter.
	name string
	*metrics.Counter
}
// getOrCreateCounter obtains the counter called name via
// metrics.GetOrCreateCounter and returns it together with that name.
func getOrCreateCounter(name string) *counter {
	c := new(counter)
	c.name = name
	c.Counter = metrics.GetOrCreateCounter(name)
	return c
}
// summary couples a metrics.Summary with the name it was registered under.
type summary struct {
	// name is the metric name passed to metrics.GetOrCreateSummary by
	// getOrCreateSummary.
	name string
	*metrics.Summary
}
// getOrCreateSummary obtains the summary called name via
// metrics.GetOrCreateSummary and returns it together with that name.
func getOrCreateSummary(name string) *summary {
	s := new(summary)
	s.name = name
	s.Summary = metrics.GetOrCreateSummary(name)
	return s
}

View File

@@ -7,6 +7,8 @@ import (
"strings"
"text/template"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Alert the triggered alert
@@ -50,7 +52,8 @@ func (as AlertState) String() string {
return "inactive"
}
type alertTplData struct {
// AlertTplData is used to execute templating
type AlertTplData struct {
Labels map[string]string
Value float64
Expr string
@@ -58,45 +61,53 @@ type alertTplData struct {
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}{{ $expr := .Expr }}`
// ExecTemplate executes the Alert template for give
// ExecTemplate executes the Alert template for given
// map of annotations.
func (a *Alert) ExecTemplate(annotations map[string]string) (map[string]string, error) {
tplData := alertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
return templateAnnotations(annotations, tplHeader, tplData)
// Every alert could have a different datasource, so function
// requires a queryFunction as an argument.
func (a *Alert) ExecTemplate(q QueryFn, annotations map[string]string) (map[string]string, error) {
tplData := AlertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
return templateAnnotations(annotations, tplData, funcsWithQuery(q))
}
// ExecTemplate renders every annotation in the given map using tpl as the
// template data. The "query" template function is bound to q.
func ExecTemplate(q QueryFn, annotations map[string]string, tpl AlertTplData) (map[string]string, error) {
	funcs := funcsWithQuery(q)
	return templateAnnotations(annotations, tpl, funcs)
}
// ValidateTemplates validate annotations for possible template error, uses empty data for template population
func ValidateTemplates(annotations map[string]string) error {
_, err := templateAnnotations(annotations, tplHeader, alertTplData{
_, err := templateAnnotations(annotations, AlertTplData{
Labels: map[string]string{},
Value: 0,
})
}, tmplFunc)
return err
}
func templateAnnotations(annotations map[string]string, header string, data alertTplData) (map[string]string, error) {
func templateAnnotations(annotations map[string]string, data AlertTplData, funcs template.FuncMap) (map[string]string, error) {
var builder strings.Builder
var buf bytes.Buffer
eg := errGroup{}
eg := new(utils.ErrGroup)
r := make(map[string]string, len(annotations))
for key, text := range annotations {
r[key] = text
buf.Reset()
builder.Reset()
builder.Grow(len(header) + len(text))
builder.WriteString(header)
builder.Grow(len(tplHeader) + len(text))
builder.WriteString(tplHeader)
builder.WriteString(text)
if err := templateAnnotation(&buf, builder.String(), data); err != nil {
eg.errs = append(eg.errs, fmt.Sprintf("key %q, template %q: %s", key, text, err))
if err := templateAnnotation(&buf, builder.String(), data, funcs); err != nil {
eg.Add(fmt.Errorf("key %q, template %q: %w", key, text, err))
continue
}
r[key] = buf.String()
}
return r, eg.err()
return r, eg.Err()
}
func templateAnnotation(dst io.Writer, text string, data alertTplData) error {
tpl, err := template.New("").Funcs(tmplFunc).Option("missingkey=zero").Parse(text)
func templateAnnotation(dst io.Writer, text string, data AlertTplData, funcs template.FuncMap) error {
t := template.New("").Funcs(funcs).Option("missingkey=zero")
tpl, err := t.Parse(text)
if err != nil {
return fmt.Errorf("error parsing annotation: %w", err)
}

View File

@@ -2,6 +2,8 @@ package notifier
import (
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
func TestAlert_ExecTemplate(t *testing.T) {
@@ -60,11 +62,41 @@ func TestAlert_ExecTemplate(t *testing.T) {
"exprEscapedPath": "vm_rows%7B%5C%22label%5C%22=%5C%22bar%5C%22%7D%3E0",
},
},
{
name: "query",
alert: &Alert{Expr: `vm_rows{"label"="bar"}>0`},
annotations: map[string]string{
"summary": `{{ query "foo" | first | value }}`,
"desc": `{{ range query "bar" }}{{ . | label "foo" }} {{ . | value }};{{ end }}`,
},
expTpl: map[string]string{
"summary": "1",
"desc": "bar 1;garply 2;",
},
},
}
qFn := func(q string) ([]datasource.Metric, error) {
return []datasource.Metric{
{
Labels: []datasource.Label{
{Name: "foo", Value: "bar"},
{Name: "baz", Value: "qux"},
},
Value: 1,
},
{
Labels: []datasource.Label{
{Name: "foo", Value: "garply"},
{Name: "baz", Value: "fred"},
},
Value: 2,
},
}, nil
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
tpl, err := tc.alert.ExecTemplate(tc.annotations)
tpl, err := tc.alert.ExecTemplate(qFn, tc.annotations)
if err != nil {
t.Fatal(err)
}

View File

@@ -12,9 +12,11 @@ import (
// AlertManager represents integration provider with Prometheus alert manager
// https://github.com/prometheus/alertmanager
type AlertManager struct {
alertURL string
argFunc AlertURLGenerator
client *http.Client
alertURL string
basicAuthUser string
basicAuthPass string
argFunc AlertURLGenerator
client *http.Client
}
// Send an alert or resolve message
@@ -26,8 +28,11 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Content-Type", "application/json; charset=utf-8")
req = req.WithContext(ctx)
if am.basicAuthPass != "" {
req.SetBasicAuth(am.basicAuthUser, am.basicAuthPass)
}
resp, err := am.client.Do(req)
if err != nil {
return err
@@ -38,7 +43,7 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
if resp.StatusCode != http.StatusOK {
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read response from %q: %s", am.alertURL, err)
return fmt.Errorf("failed to read response from %q: %w", am.alertURL, err)
}
return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.alertURL, string(body))
}
@@ -51,10 +56,13 @@ type AlertURLGenerator func(Alert) string
const alertManagerPath = "/api/v2/alerts"
// NewAlertManager is a constructor for AlertManager
func NewAlertManager(alertManagerURL string, fn AlertURLGenerator, c *http.Client) *AlertManager {
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
return &AlertManager{
alertURL: strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath,
argFunc: fn,
client: c,
alertURL: addr,
argFunc: fn,
client: c,
basicAuthUser: user,
basicAuthPass: pass,
}
}

View File

@@ -11,12 +11,21 @@ import (
)
func TestAlertManager_Send(t *testing.T) {
const baUser, baPass = "foo", "bar"
mux := http.NewServeMux()
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
t.Errorf("should not be called")
})
c := -1
mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) {
user, pass, ok := r.BasicAuth()
if !ok {
t.Errorf("unauthorized request")
}
if user != baUser || pass != baPass {
t.Errorf("wrong creds %q:%q; expected %q:%q",
user, pass, baUser, baPass)
}
c++
if r.Method != http.MethodPost {
t.Errorf("expected POST method got %s", r.Method)
@@ -43,22 +52,22 @@ func TestAlertManager_Send(t *testing.T) {
t.Errorf("expected 1 alert in array got %d", len(a))
}
if a[0].GeneratorURL != "0/0" {
t.Errorf("exptected 0/0 as generatorURL got %s", a[0].GeneratorURL)
t.Errorf("expected 0/0 as generatorURL got %s", a[0].GeneratorURL)
}
if a[0].Labels["alertname"] != "alert0" {
t.Errorf("exptected alert0 as alert name got %s", a[0].Labels["alertname"])
t.Errorf("expected alert0 as alert name got %s", a[0].Labels["alertname"])
}
if a[0].StartsAt.IsZero() {
t.Errorf("exptected non-zero start time")
t.Errorf("expected non-zero start time")
}
if a[0].EndAt.IsZero() {
t.Errorf("exptected non-zero end time")
t.Errorf("expected non-zero end time")
}
}
})
srv := httptest.NewServer(mux)
defer srv.Close()
am := NewAlertManager(srv.URL, func(alert Alert) string {
am := NewAlertManager(srv.URL, baUser, baPass, func(alert Alert) string {
return strconv.FormatUint(alert.GroupID, 10) + "/" + strconv.FormatUint(alert.ID, 10)
}, srv.Client())
if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {

View File

@@ -0,0 +1,45 @@
package notifier
import (
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
)
// Command-line flags configuring notifiers.
//
// All of them are array flags. Init creates one notifier per -notifier.url
// value and looks up the remaining settings for it by the same index via
// GetOptionalArg.
var (
	addrs             = flagutil.NewArray("notifier.url", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
	basicAuthUsername = flagutil.NewArray("notifier.basicAuth.username", "Optional basic auth username for -notifier.url")
	basicAuthPassword = flagutil.NewArray("notifier.basicAuth.password", "Optional basic auth password for -notifier.url")

	tlsInsecureSkipVerify = flagutil.NewArrayBool("notifier.tlsInsecureSkipVerify", "Whether to skip tls verification when connecting to -notifier.url")
	tlsCertFile           = flagutil.NewArray("notifier.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
	tlsKeyFile            = flagutil.NewArray("notifier.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -notifier.url")
	tlsCAFile             = flagutil.NewArray("notifier.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -notifier.url. "+
		"By default system CA is used")
	tlsServerName = flagutil.NewArray("notifier.tlsServerName", "Optional TLS server name to use for connections to -notifier.url. "+
		"By default the server name from -notifier.url is used")
)
// Init builds one AlertManager notifier per `-notifier.url` value.
//
// The TLS and basic auth settings for the i-th URL are taken from the i-th
// value of the corresponding array flags via GetOptionalArg. gen is handed to
// every created notifier. An error is returned when no URL is configured or
// when a transport cannot be created.
func Init(gen AlertURLGenerator) ([]Notifier, error) {
	urls := *addrs
	if len(urls) == 0 {
		return nil, fmt.Errorf("at least one `-notifier.url` must be set")
	}

	notifiers := make([]Notifier, 0, len(urls))
	for idx, u := range urls {
		tr, err := utils.Transport(
			u,
			tlsCertFile.GetOptionalArg(idx),
			tlsKeyFile.GetOptionalArg(idx),
			tlsCAFile.GetOptionalArg(idx),
			tlsServerName.GetOptionalArg(idx),
			tlsInsecureSkipVerify.GetOptionalArg(idx),
		)
		if err != nil {
			return nil, fmt.Errorf("failed to create transport: %w", err)
		}
		client := &http.Client{Transport: tr}
		am := NewAlertManager(
			u,
			basicAuthUsername.GetOptionalArg(idx),
			basicAuthPassword.GetOptionalArg(idx),
			gen,
			client,
		)
		notifiers = append(notifiers, am)
	}
	return notifiers, nil
}

View File

@@ -14,21 +14,40 @@
package notifier
import (
"errors"
"fmt"
html_template "html/template"
"math"
"net/url"
"regexp"
"strings"
text_template "text/template"
"time"
htmlTpl "html/template"
textTpl "text/template"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
var tmplFunc text_template.FuncMap
// QueryFn is used to wrap a call to datasource into simple-to-use function
// for templating functions.
type QueryFn func(query string) ([]datasource.Metric, error)
// InitTemplateFunc returns template helper functions
// funcsWithQuery returns a copy of the package-level tmplFunc map in which the
// "query" entry calls the given query function. tmplFunc itself is not
// modified.
func funcsWithQuery(query QueryFn) textTpl.FuncMap {
	// The wrapper invokes query only when a template actually calls "query".
	runQuery := func(q string) ([]datasource.Metric, error) {
		return query(q)
	}

	funcs := make(textTpl.FuncMap)
	for name, fn := range tmplFunc {
		funcs[name] = fn
	}
	// Assigned after the copy, so it replaces any "query" entry from tmplFunc.
	funcs["query"] = runQuery
	return funcs
}
var tmplFunc textTpl.FuncMap
// InitTemplateFunc initiates template helper functions
func InitTemplateFunc(externalURL *url.URL) {
tmplFunc = text_template.FuncMap{
tmplFunc = textTpl.FuncMap{
"args": func(args ...interface{}) map[string]interface{} {
result := make(map[string]interface{})
for i, a := range args {
@@ -40,8 +59,8 @@ func InitTemplateFunc(externalURL *url.URL) {
re := regexp.MustCompile(pattern)
return re.ReplaceAllString(text, repl)
},
"safeHtml": func(text string) html_template.HTML {
return html_template.HTML(text)
"safeHtml": func(text string) htmlTpl.HTML {
return htmlTpl.HTML(text)
},
"match": regexp.MatchString,
"title": strings.Title,
@@ -148,9 +167,33 @@ func InitTemplateFunc(externalURL *url.URL) {
"queryEscape": func(q string) string {
return url.QueryEscape(q)
},
"crlfEscape": func(q string) string {
q = strings.Replace(q, "\n", `\n`, -1)
return strings.Replace(q, "\r", `\r`, -1)
},
"quotesEscape": func(q string) string {
return strings.Replace(q, `"`, `\"`, -1)
},
// The query function is supposed to be substituted by funcsWithQuery().
// It is present here only for validation purposes, when no datasource
// is provided.
"query": func(q string) ([]datasource.Metric, error) {
// return non-empty slice to pass validation with chained functions in template
// see issue #989 for details
return []datasource.Metric{{}}, nil
},
"first": func(metrics []datasource.Metric) (datasource.Metric, error) {
if len(metrics) > 0 {
return metrics[0], nil
}
return datasource.Metric{}, errors.New("first() called on vector with no elements")
},
"label": func(label string, m datasource.Metric) string {
return m.Label(label)
},
"value": func(m datasource.Metric) float64 {
return m.Value
},
}
}

View File

@@ -1,21 +0,0 @@
package notifier
import (
"fmt"
"strings"
)
// errGroup accumulates error messages and exposes them as a single error.
type errGroup struct {
	// errs holds the collected messages in insertion order.
	errs []string
}

// err returns the group itself as an error when it holds at least one message.
// It returns nil for a nil receiver or an empty group.
func (eg *errGroup) err() error {
	if eg != nil && len(eg.errs) > 0 {
		return eg
	}
	return nil
}

// Error implements the error interface. The collected messages are joined
// with newlines and prefixed with "errors: ".
func (eg *errGroup) Error() string {
	joined := strings.Join(eg.errs, "\n")
	return fmt.Sprintf("errors: %s", joined)
}

View File

@@ -2,7 +2,6 @@ package main
import (
"context"
"errors"
"fmt"
"hash/fnv"
"sort"
@@ -12,12 +11,14 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
// RecordingRule is a Rule that is supposed
// to evaluate the configured Expression and
// return TimeSeries as the result.
type RecordingRule struct {
Type datasource.Type
RuleID uint64
Name string
Expr string
@@ -32,6 +33,12 @@ type RecordingRule struct {
// resets on every successful Exec
// may be used as Health state
lastExecError error
metrics *recordingRuleMetrics
}
// recordingRuleMetrics groups the metrics registered for a single
// RecordingRule.
type recordingRuleMetrics struct {
	// errors is the vmalert_recording_rules_error gauge created in
	// newRecordingRule; its callback reports 1 while the rule's
	// lastExecError is non-nil and 0 otherwise.
	errors *gauge
}
// String implements Stringer interface
@@ -45,17 +52,34 @@ func (rr *RecordingRule) ID() uint64 {
return rr.RuleID
}
func newRecordingRule(gID uint64, cfg config.Rule) *RecordingRule {
return &RecordingRule{
func newRecordingRule(group *Group, cfg config.Rule) *RecordingRule {
rr := &RecordingRule{
Type: cfg.Type,
RuleID: cfg.ID,
Name: cfg.Record,
Expr: cfg.Expr,
Labels: cfg.Labels,
GroupID: gID,
GroupID: group.ID(),
metrics: &recordingRuleMetrics{},
}
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
rr.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
func() float64 {
rr.mu.Lock()
defer rr.mu.Unlock()
if rr.lastExecError == nil {
return 0
}
return 1
})
return rr
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")
// Close unregisters the rule's error gauge by the name it was created under.
func (rr *RecordingRule) Close() {
	errorsGaugeName := rr.metrics.errors.name
	metrics.UnregisterMetric(errorsGaugeName)
}
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
@@ -63,15 +87,14 @@ func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series
return nil, nil
}
qMetrics, err := q.Query(ctx, rr.Expr)
qMetrics, err := q.Query(ctx, rr.Expr, rr.Type)
rr.mu.Lock()
defer rr.mu.Unlock()
rr.lastExecTime = time.Now()
rr.lastExecError = err
if err != nil {
return nil, fmt.Errorf("failed to execute query %q: %s", rr.Expr, err)
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
}
duplicates := make(map[uint64]prompbmarshal.TimeSeries, len(qMetrics))
@@ -141,6 +164,7 @@ func (rr *RecordingRule) RuleAPI() APIRecordingRule {
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
Name: rr.Name,
Type: rr.Type.String(),
Expression: rr.Expr,
LastError: lastErr,
LastExec: rr.lastExecTime,

View File

@@ -0,0 +1,39 @@
package remoteread
import (
"flag"
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Command-line flags configuring the optional remote read datasource,
// which Init turns into a datasource.Querier.
var (
	// addr enables remote read when it is non-empty.
	addr = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
		" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has successfully persisted its state."+
		" E.g. http://127.0.0.1:8428")

	// Basic auth credentials sent to -remoteRead.url.
	basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
	basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")

	// TLS settings used when connecting to -remoteRead.url.
	tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
	tlsCertFile           = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
	tlsKeyFile            = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
	tlsCAFile             = flag.String("remoteRead.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. "+
		"By default system CA is used")
	tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
		"By default the server name from -remoteRead.url is used")
)
// Init builds a datasource.Querier for the address passed via -remoteRead.url,
// applying the -remoteRead.* basic auth and TLS flags.
// A nil Querier and a nil error are returned when -remoteRead.url is empty.
func Init() (datasource.Querier, error) {
	if len(*addr) == 0 {
		return nil, nil
	}

	transport, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
	if err != nil {
		return nil, fmt.Errorf("failed to create transport: %w", err)
	}

	client := &http.Client{Transport: transport}
	return datasource.NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, 0, 0, client), nil
}

View File

@@ -0,0 +1,54 @@
package remotewrite
import (
"context"
"flag"
"fmt"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Command-line flags configuring the optional remote write client,
// which Init turns into a Client.
var (
	// addr enables remote write when it is non-empty.
	addr = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
		" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")

	// Basic auth credentials sent to -remoteWrite.url.
	basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
	basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")

	// Queueing and flushing parameters passed to the Client config.
	maxQueueSize  = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
	maxBatchSize  = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines max number of timeseries to be flushed at once")
	concurrency   = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
	flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")

	// TLS settings used when connecting to -remoteWrite.url.
	tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
	tlsCertFile           = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
	tlsKeyFile            = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
	tlsCAFile             = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
		"By default system CA is used")
	tlsServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
		"By default the server name from -remoteWrite.url is used")
)
// Init builds a remote write Client from the -remoteWrite.* flag values.
// A nil Client and a nil error are returned when -remoteWrite.url is empty.
func Init(ctx context.Context) (*Client, error) {
	if len(*addr) == 0 {
		return nil, nil
	}

	transport, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
	if err != nil {
		return nil, fmt.Errorf("failed to create transport: %w", err)
	}

	cfg := Config{
		Addr:          *addr,
		BasicAuthUser: *basicAuthUsername,
		BasicAuthPass: *basicAuthPassword,
		Concurrency:   *concurrency,
		MaxQueueSize:  *maxQueueSize,
		MaxBatchSize:  *maxBatchSize,
		FlushInterval: *flushInterval,
		Transport:     transport,
	}
	return NewClient(ctx, cfg)
}

View File

@@ -12,6 +12,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
"github.com/golang/snappy"
)
@@ -61,7 +62,7 @@ const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 1e5
defaultFlushInterval = time.Second
defaultFlushInterval = 5 * time.Second
defaultWriteTimeout = 30 * time.Second
)
@@ -85,6 +86,9 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
if cfg.WriteTimeout == 0 {
cfg.WriteTimeout = defaultWriteTimeout
}
if cfg.Transport == nil {
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
}
c := &Client{
c: &http.Client{
Timeout: cfg.WriteTimeout,
@@ -95,6 +99,7 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
baPass: cfg.BasicAuthPass,
flushInterval: cfg.FlushInterval,
maxBatchSize: cfg.MaxBatchSize,
maxQueueSize: cfg.MaxQueueSize,
doneCh: make(chan struct{}),
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
}
@@ -137,14 +142,11 @@ func (c *Client) Close() error {
func (c *Client) run(ctx context.Context) {
ticker := time.NewTicker(c.flushInterval)
wr := prompbmarshal.WriteRequest{}
wr := &prompbmarshal.WriteRequest{}
shutdown := func() {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
if len(wr.Timeseries) < 1 {
return
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
c.flush(lastCtx, wr)
cancel()
@@ -163,44 +165,82 @@ func (c *Client) run(ctx context.Context) {
return
case <-ticker.C:
c.flush(ctx, wr)
wr = prompbmarshal.WriteRequest{}
case ts := <-c.input:
case ts, ok := <-c.input:
if !ok {
continue
}
wr.Timeseries = append(wr.Timeseries, ts)
if len(wr.Timeseries) >= c.maxBatchSize {
c.flush(ctx, wr)
wr = prompbmarshal.WriteRequest{}
}
}
}
}()
}
func (c *Client) flush(ctx context.Context, wr prompbmarshal.WriteRequest) {
var (
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote write endpoint. Flush performs limited amount of retries
// if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
return
}
defer prompbmarshal.ResetWriteRequest(wr)
data, err := wr.Marshal()
if err != nil {
logger.Errorf("failed to marshal WriteRequest: %s", err)
return
}
req, err := http.NewRequest("POST", c.addr, bytes.NewReader(snappy.Encode(nil, data)))
const attempts = 5
b := snappy.Encode(nil, data)
for i := 0; i < attempts; i++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b))
return
}
logger.Errorf("attempt %d to send request failed: %s", i+1, err)
// sleeping to avoid remote db hammering
time.Sleep(time.Second)
continue
}
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("all %d attempts to send request failed - dropping %d timeseries",
attempts, len(wr.Timeseries))
}
func (c *Client) send(ctx context.Context, data []byte) error {
r := bytes.NewReader(data)
req, err := http.NewRequest("POST", c.addr, r)
if err != nil {
logger.Errorf("failed to create new HTTP request: %s", err)
return
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
if c.baPass != "" {
req.SetBasicAuth(c.baUser, c.baPass)
}
resp, err := c.c.Do(req.WithContext(ctx))
if err != nil {
logger.Errorf("error getting response from %s:%s", req.URL, err)
return
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL, err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusNoContent {
body, _ := ioutil.ReadAll(resp.Body)
logger.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
return
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL, body)
}
return nil
}

View File

@@ -0,0 +1,102 @@
package remotewrite
import (
"context"
"fmt"
"io/ioutil"
"math/rand"
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// TestClient_Push pushes a batch of series through the Client and checks
// that, once the Client is closed, the test remote write server has
// accepted exactly as many series as were successfully pushed.
func TestClient_Push(t *testing.T) {
	testSrv := newRWServer()
	// Shut the test server down when the test ends, so its listener
	// and serving goroutines do not outlive the test.
	defer testSrv.Close()

	cfg := Config{
		Addr:         testSrv.URL,
		MaxBatchSize: 100,
	}
	client, err := NewClient(context.Background(), cfg)
	if err != nil {
		t.Fatalf("failed to create client: %s", err)
	}

	const rowsN = 1e4
	var sent int
	for i := 0; i < rowsN; i++ {
		s := prompbmarshal.TimeSeries{
			Samples: []prompbmarshal.Sample{{
				Value:     rand.Float64(),
				Timestamp: time.Now().Unix(),
			}},
		}
		// Push may reject a series; only the accepted ones are expected
		// to reach the server.
		err := client.Push(s)
		if err == nil {
			sent++
		}
	}
	if sent == 0 {
		t.Fatalf("0 series sent")
	}
	if err := client.Close(); err != nil {
		t.Fatalf("failed to close client: %s", err)
	}
	got := testSrv.accepted()
	if got != sent {
		t.Fatalf("expected to have %d series; got %d", sent, got)
	}
}
// newRWServer starts an httptest server that serves rwServer.handler
// and returns it. The caller may stop it via the embedded Server's Close.
func newRWServer() *rwServer {
	rw := &rwServer{}
	rw.Server = httptest.NewServer(http.HandlerFunc(rw.handler))
	return rw
}

// rwServer is a test remote write endpoint that counts the series
// contained in the write requests it accepts.
type rwServer struct {
	// WARN: ordering of fields is important for alignment!
	// see https://golang.org/pkg/sync/atomic/#pkg-note-BUG
	acceptedRows uint64
	*httptest.Server
}

// accepted returns the number of series accepted so far.
func (rw *rwServer) accepted() int {
	return int(atomic.LoadUint64(&rw.acceptedRows))
}

// err replies with 400 Bad Request and the text of the given error.
func (rw *rwServer) err(w http.ResponseWriter, err error) {
	w.WriteHeader(http.StatusBadRequest)
	// A failed write means the client is gone; nothing useful can be done here.
	_, _ = w.Write([]byte(err.Error()))
}

// handler accepts POST requests with a snappy-encoded WriteRequest body,
// adds the number of contained series to the accepted counter and replies
// with 204 No Content. Any other request is answered via rw.err.
func (rw *rwServer) handler(w http.ResponseWriter, r *http.Request) {
	// Register the close up front so it also runs on the early-return paths.
	defer func() { _ = r.Body.Close() }()

	if r.Method != http.MethodPost {
		rw.err(w, fmt.Errorf("bad method %q", r.Method))
		return
	}

	data, err := ioutil.ReadAll(r.Body)
	if err != nil {
		rw.err(w, fmt.Errorf("body read err: %w", err))
		return
	}

	b, err := snappy.Decode(nil, data)
	if err != nil {
		rw.err(w, fmt.Errorf("decode err: %w", err))
		return
	}

	wr := &prompb.WriteRequest{}
	if err := wr.Unmarshal(b); err != nil {
		rw.err(w, fmt.Errorf("unmarshal err: %w", err))
		return
	}
	atomic.AddUint64(&rw.acceptedRows, uint64(len(wr.Timeseries)))
	w.WriteHeader(http.StatusNoContent)
}

View File

@@ -2,6 +2,7 @@ package main
import (
"context"
"errors"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
@@ -21,4 +22,9 @@ type Rule interface {
// UpdateWith performs modification of current Rule
// with fields of the given Rule.
UpdateWith(Rule) error
// Close performs the shutdown procedures for rule
// such as metrics unregister
Close()
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")

View File

@@ -1,9 +1,10 @@
package main
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"sort"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {

View File

@@ -0,0 +1,43 @@
package utils
import (
"fmt"
"strings"
)
// ErrGroup accumulates multiple errors
// and produces single error message.
// The zero value is an empty group ready for use.
type ErrGroup struct {
	errs []error
}

// Add adds a new error to group.
// Nil errors are ignored, so a group that only received nil errors
// still reports no error via Err.
// Isn't thread-safe.
func (eg *ErrGroup) Add(err error) {
	if err == nil {
		return
	}
	eg.errs = append(eg.errs, err)
}

// Err returns the group itself if it contains at least
// one error, and nil otherwise. It is safe to call on a nil group.
func (eg *ErrGroup) Err() error {
	if eg == nil || len(eg.errs) == 0 {
		return nil
	}
	return eg
}

// Error satisfies the error interface.
// It returns an empty string for an empty or nil group; otherwise the
// message has the form "errors(N): " followed by the accumulated error
// messages separated by newlines.
func (eg *ErrGroup) Error() string {
	if eg == nil || len(eg.errs) == 0 {
		return ""
	}
	var b strings.Builder
	fmt.Fprintf(&b, "errors(%d): ", len(eg.errs))
	for i, err := range eg.errs {
		if i > 0 {
			b.WriteString("\n")
		}
		b.WriteString(err.Error())
	}
	return b.String()
}

View File

@@ -0,0 +1,38 @@
package utils
import (
"errors"
"testing"
)
// TestErrGroup checks that ErrGroup reports no error while empty and
// renders the expected message once errors were added to it.
func TestErrGroup(t *testing.T) {
	type testCase struct {
		errs []error
		exp  string
	}
	cases := []testCase{
		{errs: nil, exp: ""},
		{errs: []error{errors.New("timeout")}, exp: "errors(1): timeout"},
		{
			errs: []error{errors.New("timeout"), errors.New("deadline")},
			exp:  "errors(2): timeout\ndeadline",
		},
	}

	for _, c := range cases {
		group := &ErrGroup{}
		for i := range c.errs {
			group.Add(c.errs[i])
		}

		result := group.Err()
		// An empty group must not report an error.
		if len(c.errs) == 0 {
			if result != nil {
				t.Fatalf("expected to get nil error")
			}
			continue
		}

		if result == nil {
			t.Fatalf("expected to get non-nil error")
		}
		if msg := group.Error(); msg != c.exp {
			t.Fatalf("expected to have: \n%q\ngot:\n%q", c.exp, msg)
		}
	}
}

58
app/vmalert/utils/tls.go Normal file
View File

@@ -0,0 +1,58 @@
package utils
import (
"crypto/tls"
"crypto/x509"
"fmt"
"io/ioutil"
"net/http"
"strings"
)
// Transport creates http.Transport object based on provided URL.
//
// The returned transport is a clone of http.DefaultTransport. If URL starts
// with the `https` scheme, its TLSClientConfig is replaced with the config
// built by TLSConfig from the remaining arguments; otherwise those arguments
// are ignored. The scheme is matched case-insensitively, since URI schemes
// are case-insensitive, so `HTTPS://host` gets the TLS settings as well.
//
// An error is returned only if the TLS config cannot be built.
func Transport(URL, certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*http.Transport, error) {
	t := http.DefaultTransport.(*http.Transport).Clone()
	if !hasHTTPSPrefix(URL) {
		return t, nil
	}
	tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err != nil {
		return nil, err
	}
	t.TLSClientConfig = tlsCfg
	return t, nil
}

// hasHTTPSPrefix reports whether rawURL starts with `https` in any letter case.
func hasHTTPSPrefix(rawURL string) bool {
	const scheme = "https"
	return len(rawURL) >= len(scheme) && strings.EqualFold(rawURL[:len(scheme)], scheme)
}

// TLSConfig creates tls.Config object from provided arguments.
//
// A client certificate is loaded from certFile and keyFile only when certFile
// is non-empty. A custom root CA pool is read from CAFile only when CAFile is
// non-empty; otherwise RootCAs is left nil. serverName and insecureSkipVerify
// are copied into the config as is.
//
// An error is returned if the key pair cannot be loaded, or if CAFile cannot
// be read or contains no parsable PEM certificates.
func TLSConfig(certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*tls.Config, error) {
	var certs []tls.Certificate
	if certFile != "" {
		cert, err := tls.LoadX509KeyPair(certFile, keyFile)
		if err != nil {
			return nil, fmt.Errorf("cannot load TLS certificate from `cert_file`=%q, `key_file`=%q: %w", certFile, keyFile, err)
		}
		certs = []tls.Certificate{cert}
	}

	var rootCAs *x509.CertPool
	if CAFile != "" {
		pem, err := ioutil.ReadFile(CAFile)
		if err != nil {
			return nil, fmt.Errorf("cannot read `ca_file` %q: %w", CAFile, err)
		}
		rootCAs = x509.NewCertPool()
		if !rootCAs.AppendCertsFromPEM(pem) {
			return nil, fmt.Errorf("cannot parse data from `ca_file` %q", CAFile)
		}
	}

	return &tls.Config{
		Certificates:       certs,
		InsecureSkipVerify: insecureSkipVerify,
		RootCAs:            rootCAs,
		ServerName:         serverName,
	}, nil
}

View File

@@ -0,0 +1,52 @@
package utils
import "testing"
// TestTLSConfig checks that TLSConfig copies serverName and
// insecureSkipVerify into the resulting config and that it fails
// when the certificate or CA files cannot be read.
func TestTLSConfig(t *testing.T) {
	var certFile, keyFile, CAFile, serverName string
	var insecureSkipVerify bool
	serverName = "test"
	insecureSkipVerify = true
	tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	// The checks below dereference tlsCfg, so a failure here must stop the
	// test instead of leading to a nil pointer panic.
	if err != nil {
		t.Fatalf("unexpected error %s", err)
	}
	if tlsCfg == nil {
		t.Fatalf("expected tlsConfig to be set, got nil")
	}
	if tlsCfg.ServerName != serverName {
		t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)
	}
	if tlsCfg.InsecureSkipVerify != insecureSkipVerify {
		t.Errorf("unexpected InsecureSkipVerify, want %v, got %v", insecureSkipVerify, tlsCfg.InsecureSkipVerify)
	}

	// A missing certificate file must be reported as an error.
	certFile = "/path/to/nonexisting/cert/file"
	_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err == nil {
		t.Errorf("expected keypair error, got nil")
	}

	// A missing CA file must be reported as an error.
	certFile = ""
	CAFile = "/path/to/nonexisting/cert/file"
	_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err == nil {
		t.Errorf("expected read error, got nil")
	}
}
// TestTransport checks that Transport succeeds for both http and https URLs
// and that the https transport carries a TLS client config.
func TestTransport(t *testing.T) {
	var certFile, keyFile, CAFile, serverName string
	var insecureSkipVerify bool

	URL := "http://victoriametrics.com"
	_, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err != nil {
		t.Errorf("unexpected error %s", err)
	}

	URL = "https://victoriametrics.com"
	tr, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	// tr is dereferenced below, so stop here on failure instead of
	// panicking on a nil transport.
	if err != nil {
		t.Fatalf("unexpected error %s", err)
	}
	if tr == nil {
		t.Fatalf("expected transport to be set, got nil")
	}
	if tr.TLSClientConfig == nil {
		t.Errorf("expected TLSClientConfig to be set, got nil")
	}
}

View File

@@ -27,7 +27,6 @@ var pathList = [][]string{
}
func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
resph := responseHandler{w}
switch r.URL.Path {
case "/":
for _, path := range pathList {
@@ -36,10 +35,22 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
}
return true
case "/api/v1/groups":
resph.handle(rh.listGroups())
data, err := rh.listGroups()
if err != nil {
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.Header().Set("Content-Type", "application/json; charset=utf-8")
w.Write(data)
return true
case "/api/v1/alerts":
resph.handle(rh.listAlerts())
data, err := rh.listAlerts()
if err != nil {
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.Header().Set("Content-Type", "application/json; charset=utf-8")
w.Write(data)
return true
case "/-/reload":
logger.Infof("api config reload was called, sending sighup")
@@ -47,12 +58,18 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
w.WriteHeader(http.StatusOK)
return true
default:
if !strings.HasSuffix(r.URL.Path, "/status") {
return false
}
// /api/v1/<groupName>/<alertID>/status
if strings.HasSuffix(r.URL.Path, "/status") {
resph.handle(rh.alert(r.URL.Path))
data, err := rh.alert(r.URL.Path)
if err != nil {
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
return false
w.Header().Set("Content-Type", "application/json; charset=utf-8")
w.Write(data)
return true
}
}
@@ -80,7 +97,7 @@ func (rh *requestHandler) listGroups() ([]byte, error) {
b, err := json.Marshal(lr)
if err != nil {
return nil, &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf(`error encoding list of active alerts: %s`, err),
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
StatusCode: http.StatusInternalServerError,
}
}
@@ -117,7 +134,7 @@ func (rh *requestHandler) listAlerts() ([]byte, error) {
b, err := json.Marshal(lr)
if err != nil {
return nil, &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf(`error encoding list of active alerts: %s`, err),
Err: fmt.Errorf(`error encoding list of active alerts: %w`, err),
StatusCode: http.StatusInternalServerError,
}
}
@@ -138,11 +155,11 @@ func (rh *requestHandler) alert(path string) ([]byte, error) {
groupID, err := uint64FromPath(parts[0])
if err != nil {
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %s`, err))
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %w`, err))
}
alertID, err := uint64FromPath(parts[1])
if err != nil {
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %s`, err))
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %w`, err))
}
resp, err := rh.m.AlertAPI(groupID, alertID)
if err != nil {
@@ -151,18 +168,6 @@ func (rh *requestHandler) alert(path string) ([]byte, error) {
return json.Marshal(resp)
}
// responseHandler wrapper on http.ResponseWriter with sugar
type responseHandler struct{ http.ResponseWriter }
func (w responseHandler) handle(b []byte, err error) {
if err != nil {
httpserver.Errorf(w, "%s", err)
return
}
w.Header().Set("Content-Type", "application/json")
w.Write(b)
}
func uint64FromPath(path string) (uint64, error) {
s := strings.TrimRight(path, "/")
return strconv.ParseUint(s, 10, 0)

View File

@@ -21,6 +21,7 @@ type APIAlert struct {
// APIGroup represents Group for WEB view
type APIGroup struct {
Name string `json:"name"`
Type string `json:"type"`
ID string `json:"id"`
File string `json:"file"`
Interval string `json:"interval"`
@@ -33,6 +34,7 @@ type APIGroup struct {
type APIAlertingRule struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
For string `json:"for"`
@@ -46,6 +48,7 @@ type APIAlertingRule struct {
type APIRecordingRule struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
LastError string `json:"last_error"`

View File

@@ -58,19 +58,22 @@ run-vmauth:
$(MAKE) run-via-docker
vmauth-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-amd64 ./app/vmauth
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmauth-local-with-goarch
vmauth-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-arm ./app/vmauth
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmauth-local-with-goarch
vmauth-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-arm64 ./app/vmauth
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmauth-local-with-goarch
vmauth-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-ppc64le ./app/vmauth
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmauth-local-with-goarch
vmauth-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmauth-386 ./app/vmauth
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmauth-local-with-goarch
vmauth-local-with-goarch:
APP_NAME=vmauth $(MAKE) app-local-with-goarch
vmauth-pure:
APP_NAME=vmauth $(MAKE) app-local-pure

View File

@@ -5,7 +5,7 @@ It reads username and password from [Basic Auth headers](https://en.wikipedia.or
matches them against configs pointed by `-auth.config` command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
### Quick start
## Quick start
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
and pass the following flag to `vmauth` binary in order to start authorizing and routing requests:
@@ -26,7 +26,7 @@ Pass `-help` to `vmauth` in order to see all the supported command-line flags wi
Feel free to [contact us](mailto:info@victoriametrics.com) if you need a customized auth proxy for VictoriaMetrics with support for LDAP, SSO, RBAC, SAML, accounting, limits, etc.
### Auth config
## Auth config
Auth config is represented in the following simple `yml` format:
@@ -46,7 +46,7 @@ users:
url_prefix: "http://localhost:8428"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vmselect:8481/select/123/prometheus .
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/query
@@ -55,7 +55,7 @@ users:
url_prefix: "http://vmselect:8481/select/123/prometheus"
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vminsert:8480/insert/42/prometheus .
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
@@ -64,8 +64,11 @@ users:
url_prefix: "http://vminsert:8480/insert/42/prometheus"
```
The config may contain `%{ENV_VAR}` placeholders, which are substituted by the corresponding `ENV_VAR` environment variable values.
This may be useful for passing secrets to the config.
### Security
## Security
Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable https. This can be done by passing the following `-tls*` command-line flags to `vmauth`:
@@ -81,44 +84,44 @@ Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable
Alternatively, [https termination proxy](https://en.wikipedia.org/wiki/TLS_termination_proxy) may be put in front of `vmauth`.
### Monitoring
## Monitoring
`vmauth` exports various metrics in Prometheus exposition format at `http://vmauth-host:8427/metrics` page. It is recommended to set up regular scraping of this page
either via [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) or via Prometheus, so the exported metrics could be analyzed later.
either via [vmagent](https://victoriametrics.github.io/vmagent.html) or via Prometheus, so the exported metrics could be analyzed later.
### How to build from sources
## How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmauth` is located in `vmutils-*` archives there.
#### Development build
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmauth` from the root folder of the repository.
It builds `vmauth` binary and puts it into the `bin` folder.
#### Production build
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmauth-prod` from the root folder of the repository.
It builds `vmauth-prod` binary and puts it into the `bin` folder.
#### Building docker images
### Building docker images
Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmauth
ROOT_IMAGE=scratch make package-vmauth
```
### Profiling
## Profiling
`vmauth` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
@@ -137,3 +140,68 @@ curl -s http://<vmauth-host>:8427/debug/pprof/profile > cpu.pprof
The command for collecting CPU profile waits for 30 seconds before returning.
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).
## Advanced usage
Pass `-help` command-line arg to `vmauth` in order to see all the configuration options:
```
./vmauth -help
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
See the docs at https://victoriametrics.github.io/vmauth.html .
-auth.config string
Path to auth config. See https://victoriametrics.github.io/vmauth.html for details on the format of this auth config
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this delay the server returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
TCP address to listen for http connections (default ":8427")
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit (default 10)
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-pprofAuthKey string
Auth key for /debug/pprof. It overrides httpAuth settings
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
-version
Show VictoriaMetrics version
```

View File

@@ -9,6 +9,7 @@ import (
"sync"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/metrics"
@@ -16,7 +17,7 @@ import (
)
var (
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md "+
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://victoriametrics.github.io/vmauth.html "+
"for details on the format of this auth config")
)
@@ -82,20 +83,21 @@ var stopCh chan struct{}
func readAuthConfig(path string) (map[string]*UserInfo, error) {
data, err := ioutil.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("cannot read %q: %s", path, err)
return nil, fmt.Errorf("cannot read %q: %w", path, err)
}
m, err := parseAuthConfig(data)
if err != nil {
return nil, fmt.Errorf("cannot parse %q: %s", path, err)
return nil, fmt.Errorf("cannot parse %q: %w", path, err)
}
logger.Infof("Loaded information about %d users from %q", len(m), path)
return m, nil
}
func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
data = envtemplate.Replace(data)
var ac AuthConfig
if err := yaml.UnmarshalStrict(data, &ac); err != nil {
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %s", err)
return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %w", err)
}
uis := ac.Users
if len(uis) == 0 {
@@ -115,7 +117,7 @@ func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
// Validate urlPrefix
target, err := url.Parse(urlPrefix)
if err != nil {
return nil, fmt.Errorf("invalid `url_prefix: %q`: %s", urlPrefix, err)
return nil, fmt.Errorf("invalid `url_prefix: %q`: %w", urlPrefix, err)
}
if target.Scheme != "http" && target.Scheme != "https" {
return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", urlPrefix, target.Scheme)

View File

@@ -12,7 +12,7 @@ users:
url_prefix: "http://localhost:8428"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vmselect:8481/select/123/prometheus .
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/query
@@ -21,7 +21,7 @@ users:
url_prefix: "http://vmselect:8481/select/123/prometheus"
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
# See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md#url-format
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vminsert:8480/insert/42/prometheus .
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write

View File

@@ -2,7 +2,6 @@ package main
import (
"flag"
"fmt"
"net/http"
"net/http/httputil"
"net/url"
@@ -11,6 +10,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
@@ -49,20 +49,21 @@ func main() {
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
username, password, ok := r.BasicAuth()
if !ok {
httpserver.Errorf(w, "Missing `Authorization: Basic *` header")
w.Header().Set("WWW-Authenticate", `Basic realm="Restricted"`)
http.Error(w, "missing `Authorization: Basic *` header", http.StatusUnauthorized)
return true
}
ac := authConfig.Load().(map[string]*UserInfo)
info := ac[username]
if info == nil || info.Password != password {
httpserver.Errorf(w, "Cannot find the provided username %q or password in config", username)
httpserver.Errorf(w, r, "cannot find the provided username %q or password in config", username)
return true
}
info.requests.Inc()
targetURL := createTargetURL(info.URLPrefix, r.URL)
if _, err := url.Parse(targetURL); err != nil {
httpserver.Errorf(w, "Invalid targetURL=%q: %s", targetURL, err)
httpserver.Errorf(w, r, "invalid targetURL=%q: %s", targetURL, err)
return true
}
r.Header.Set("vm-target-url", targetURL)
@@ -95,10 +96,7 @@ func usage() {
const s = `
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md .
See the docs at https://victoriametrics.github.io/vmauth.html .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
flagutil.Usage(s)
}

View File

@@ -51,20 +51,23 @@ package-vmbackup-386:
publish-vmbackup:
APP_NAME=vmbackup $(MAKE) publish-via-docker
vmbackup-pure:
APP_NAME=vmbackup $(MAKE) app-local-pure
vmbackup-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-amd64 ./app/vmbackup
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmbackup-local-with-goarch
vmbackup-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-arm ./app/vmbackup
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmbackup-local-with-goarch
vmbackup-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-arm64 ./app/vmbackup
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmbackup-local-with-goarch
vmbackup-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-ppc64le ./app/vmbackup
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmbackup-local-with-goarch
vmbackup-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-386 ./app/vmbackup
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmbackup-local-with-goarch
vmbackup-local-with-goarch:
APP_NAME=vmbackup $(MAKE) app-local-with-goarch
vmbackup-pure:
APP_NAME=vmbackup $(MAKE) app-local-pure

View File

@@ -1,21 +1,21 @@
## vmbackup
`vmbackup` creates VictoriaMetrics data backups from [instant snapshots](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots).
`vmbackup` creates VictoriaMetrics data backups from [instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
Supported storage systems for backups:
* [GCS](https://cloud.google.com/storage/). Example: `gcs://<bucket>/<path/to/backup>`
* [S3](https://aws.amazon.com/s3/). Example: `s3://<bucket>/<path/to/backup>`
* Any S3-compatible storage such as [MinIO](https://github.com/minio/minio), [Ceph](https://docs.ceph.com/docs/mimic/radosgw/s3/) or [Swift](https://www.swiftstack.com/docs/admin/middleware/s3_middleware.html). See `-customS3Endpoint` command-line flag.
* Any S3-compatible storage such as [MinIO](https://github.com/minio/minio), [Ceph](https://docs.ceph.com/docs/mimic/radosgw/s3/) or [Swift](https://www.swiftstack.com/docs/admin/middleware/s3_middleware.html). See [these docs](#advanced-usage) for details.
* Local filesystem. Example: `fs://</absolute/path/to/backup>`
Incremental backups and full backups are supported. Incremental backups are created automatically if the destination path already contains data from the previous backup.
`vmbackup` supports incremental and full backups. Incremental backups are created automatically if the destination path already contains data from the previous backup.
Full backups can be sped up with `-origin` pointing to already existing backup on the same remote storage. In this case `vmbackup` makes server-side copy for the shared
data between the existing backup and new backup. This saves time and costs on data transfer.
data between the existing backup and new backup. It saves time and costs on data transfer.
Backup process can be interrupted at any time. It is automatically resumed from the interruption point when restarting `vmbackup` with the same args.
Backed up data can be restored with [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md).
Backed up data can be restored with [vmrestore](https://victoriametrics.github.io/vmrestore.html).
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
@@ -23,9 +23,9 @@ See also [vmbackuper](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/
creation of hourly, daily, weekly and monthly backups.
### Use cases
## Use cases
#### Regular backups
### Regular backups
Regular backup can be performed with the following command:
@@ -34,13 +34,13 @@ vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-
```
* `</path/to/victoria-metrics-data>` - path to VictoriaMetrics data pointed by `-storageDataPath` command-line flag in single-node VictoriaMetrics or in cluster `vmstorage`.
There is no need to stop VictoriaMetrics for creating backups, since they are performed from immutable [instant snapshots](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots).
* `<local-snapshot>` is the snapshot to backup. See [how to create instant snapshots](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots).
* `<bucket>` is already existing name for [GCS bucket](https://cloud.google.com/storage/docs/creating-buckets).
There is no need to stop VictoriaMetrics for creating backups, since they are performed from immutable [instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
* `<local-snapshot>` is the snapshot to back up. See [how to create instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
* `<bucket>` is an already existing name for [GCS bucket](https://cloud.google.com/storage/docs/creating-buckets).
* `<path/to/new/backup>` is the destination path where new backup will be placed.
#### Regular backups with server-side copy from existing backup
### Regular backups with server-side copy from existing backup
If the destination GCS bucket already contains the previous backup at `-origin` path, then new backup can be sped up
with the following command:
@@ -49,20 +49,20 @@ with the following command:
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gcs://<bucket>/<path/to/new/backup> -origin=gcs://<bucket>/<path/to/existing/backup>
```
This saves time and network bandwidth costs by performing server-side copy for the shared data from the `-origin` to `-dst`.
It saves time and network bandwidth costs by performing server-side copy for the shared data from the `-origin` to `-dst`.
#### Incremental backups
### Incremental backups
Incremental backups are performed if `-dst` points to already existing backup. In this case only new data is uploaded to remote storage.
This saves time and network bandwidth costs when working with big backups:
Incremental backups are performed if `-dst` points to an already existing backup. In this case only new data is uploaded to remote storage.
It saves time and network bandwidth costs when working with big backups:
```
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gcs://<bucket>/<path/to/existing/backup>
```
#### Smart backups
### Smart backups
Smart backups mean storing full daily backups into `YYYYMMDD` folders and creating incremental hourly backup into `latest` folder:
@@ -72,7 +72,7 @@ Smart backups mean storing full daily backups into `YYYYMMDD` folders and creati
vmbackup -snapshotName=<latest-snapshot> -dst=gcs://<bucket>/latest
```
Where `<latest-snapshot>` is the latest [snapshot](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots).
Where `<latest-snapshot>` is the latest [snapshot](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
The command will upload only changed data to `gcs://<bucket>/latest`.
* Run the following command once a day:
@@ -92,7 +92,7 @@ Do not forget removing old snapshots and backups when they are no longer needed
See also [vmbackuper tool](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for automating smart backups.
### How does it work?
## How does it work?
The backup algorithm is the following:
@@ -100,16 +100,16 @@ The backup algorithm is the following:
2. Determine files in `-dst`, which are missing in `-snapshotName`, and delete them. These are usually small files, which are already merged into bigger files in the snapshot.
3. Determine files from `-snapshotName`, which are missing in `-dst`. These are usually small new files and bigger merged files.
4. Determine files from step 3, which exist in the `-origin`, and perform server-side copy of these files from `-origin` to `-dst`.
This are usually the biggest and the oldest files, which are shared between backups.
5. Upload the remaining files from setp 3 from `-snapshotName` to `-dst`.
These are usually the biggest and the oldest files, which are shared between backups.
5. Upload the remaining files from step 3 from `-snapshotName` to `-dst`.
The algorithm splits source files into 100MB chunks in the backup. Each chunk is stored as a separate file in the backup.
The algorithm splits source files into 100 MB chunks in the backup. Each chunk is stored as a separate file in the backup.
Such splitting minimizes the amounts of data to re-transfer after temporary errors.
`vmbackup` relies on [instant snapshot](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282) properties:
- All the files in the snapshot are immutable.
- Old files are periodically merged into new files.
- Old files are periodically merged into new files.
- Smaller files have higher probability to be merged.
- Consecutive snapshots share many identical files.
@@ -118,18 +118,56 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-
`vmbackup` can work improperly or slowly when these properties are violated.
### Troubleshooting
## Troubleshooting
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
* Backups created from [single-node VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md) cannot be restored
at [cluster VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md) and vice versa.
* Backups created from [single-node VictoriaMetrics](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html) cannot be restored
at [cluster VictoriaMetrics](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html) and vice versa.
### Advanced usage
## Advanced usage
Run `vmbackup -help` in order to see all the available options:
* Obtaining credentials from a file.
Add flag `-credsFilePath=/etc/credentials` with the following content:
for s3 (aws, minio or other s3 compatible storages):
```bash
[default]
aws_access_key_id=theaccesskey
aws_secret_access_key=thesecretaccesskeyvalue
```
for gcs (Google Cloud Storage):
```json
{
"type": "service_account",
"project_id": "project-id",
"private_key_id": "key-id",
"private_key": "-----BEGIN PRIVATE KEY-----\nprivate-key\n-----END PRIVATE KEY-----\n",
"client_email": "service-account-email",
"client_id": "client-id",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://accounts.google.com/o/oauth2/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/service-account-email"
}
```
* Usage with s3 custom url endpoint. It is possible to use `vmbackup` with s3 compatible storages like minio, cloudian, etc.
You have to add a custom url endpoint via flag:
```
# for minio
-customS3Endpoint=http://localhost:9000
# for aws gov region
-customS3Endpoint=https://s3-fips.us-gov-west-1.amazonaws.com
```
* Run `vmbackup -help` in order to see all the available options:
```
-concurrency int
@@ -138,7 +176,7 @@ Run `vmbackup -help` in order to see all the available options:
Path to file with S3 configs. Configs are loaded from default location if not set.
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-configProfile string
Profile name for S3 configs (default "default")
Profile name for S3 configs. If not set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both are not set, DefaultSharedConfigProfile is used
-credsFilePath string
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
@@ -152,25 +190,31 @@ Run `vmbackup -help` in order to see all the available options:
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit (default 10)
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-maxBytesPerSecond int
-maxBytesPerSecond value
The maximum upload speed. There is no limit if it is set to 0
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
-origin string
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
-snapshot.createURL string
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup.Example: http://victoriametrics:8428/snaphsot/create
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create
-snapshot.deleteURL string
VictoriaMetrics delete snapshot url. Optional. Will be generated from snapshotCreateURL if not provided. All created snaphosts will be automatically deleted.Example: http://victoriametrics:8428/snaphsot/delete
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete
-snapshotName string
Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots
Name for the snapshot to backup. See https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots
-storageDataPath string
Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage (default "victoria-metrics-data")
-version
@@ -178,32 +222,32 @@ Run `vmbackup -help` in order to see all the available options:
```
### How to build from sources
## How to build from sources
It is recommended to use [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - see `vmutils-*` archives there.
#### Development build
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmbackup` from the root folder of the repository.
It builds `vmbackup` binary and puts it into the `bin` folder.
#### Production build
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmbackup-prod` from the root folder of the repository.
It builds `vmbackup-prod` binary and puts it into the `bin` folder.
#### Building docker images
### Building docker images
Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
By default the image is built on top of `scratch` image. It is possible to build the package on top of any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of `alpine:3.11` image:
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=alpine:3.11 make package-vmbackup
ROOT_IMAGE=scratch make package-vmbackup
```

View File

@@ -10,24 +10,26 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fslocal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fsnil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
var (
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots")
snapshotCreateURL = flag.String("snapshot.createURL", "", "VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup."+
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots")
snapshotCreateURL = flag.String("snapshot.createURL", "", "VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. "+
"Example: http://victoriametrics:8428/snaphsot/create")
snapshotDeleteURL = flag.String("snapshot.deleteURL", "", "VictoriaMetrics delete snapshot url. Optional. Will be generated from snapshotCreateURL if not provided. All created snaphosts will be automatically deleted."+
"Example: http://victoriametrics:8428/snaphsot/delete")
snapshotDeleteURL = flag.String("snapshot.deleteURL", "", "VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. "+
"All created snaphosts will be automatically deleted. Example: http://victoriametrics:8428/snaphsot/delete")
dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir\n"+
"-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded")
origin = flag.String("origin", "", "Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups")
concurrency = flag.Int("concurrency", 10, "The number of concurrent workers. Higher concurrency may reduce backup duration")
maxBytesPerSecond = flag.Int("maxBytesPerSecond", 0, "The maximum upload speed. There is no limit if it is set to 0")
maxBytesPerSecond = flagutil.NewBytes("maxBytesPerSecond", 0, "The maximum upload speed. There is no limit if it is set to 0")
)
func main() {
@@ -36,9 +38,10 @@ func main() {
flag.Usage = usage
envflag.Parse()
buildinfo.Init()
logger.Init()
if len(*snapshotCreateURL) > 0 {
logger.Infof("%s", "Snapshots enabled")
logger.Infof("Snapshots enabled")
logger.Infof("Snapshot create url %s", *snapshotCreateURL)
if len(*snapshotDeleteURL) <= 0 {
err := flag.Set("snapshot.deleteURL", strings.Replace(*snapshotCreateURL, "/create", "/delete", 1))
@@ -50,17 +53,17 @@ func main() {
name, err := snapshot.Create(*snapshotCreateURL)
if err != nil {
logger.Fatalf("%s", err)
logger.Fatalf("cannot create snapshot: %s", err)
}
err = flag.Set("snapshotName", name)
if err != nil {
logger.Fatalf("Failed to set snapshotName flag: %v", err)
logger.Fatalf("cannot set snapshotName flag: %v", err)
}
defer func() {
err := snapshot.Delete(*snapshotDeleteURL, name)
if err != nil {
logger.Fatalf("%s", err)
logger.Fatalf("cannot delete snapshot: %s", err)
}
}()
}
@@ -86,6 +89,9 @@ func main() {
if err := a.Run(); err != nil {
logger.Fatalf("cannot create backup: %s", err)
}
srcFS.MustStop()
dstFS.MustStop()
originFS.MustStop()
}
func usage() {
@@ -93,12 +99,9 @@ func usage() {
vmbackup performs backups for VictoriaMetrics data from instant snapshots to gcs, s3
or local filesystem. Backed up data can be restored with vmrestore.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md .
See the docs at https://victoriametrics.github.io/vbackup.html .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
flagutil.Usage(s)
}
func newSrcFS() (*fslocal.FS, error) {
@@ -110,12 +113,12 @@ func newSrcFS() (*fslocal.FS, error) {
// Verify the snapshot exists.
f, err := os.Open(snapshotPath)
if err != nil {
return nil, fmt.Errorf("cannot open snapshot at %q: %s", snapshotPath, err)
return nil, fmt.Errorf("cannot open snapshot at %q: %w", snapshotPath, err)
}
fi, err := f.Stat()
_ = f.Close()
if err != nil {
return nil, fmt.Errorf("cannot stat %q: %s", snapshotPath, err)
return nil, fmt.Errorf("cannot stat %q: %w", snapshotPath, err)
}
if !fi.IsDir() {
return nil, fmt.Errorf("snapshot %q must be a directory", snapshotPath)
@@ -123,10 +126,10 @@ func newSrcFS() (*fslocal.FS, error) {
fs := &fslocal.FS{
Dir: snapshotPath,
MaxBytesPerSecond: *maxBytesPerSecond,
MaxBytesPerSecond: maxBytesPerSecond.N,
}
if err := fs.Init(); err != nil {
return nil, fmt.Errorf("cannot initialize fs: %s", err)
return nil, fmt.Errorf("cannot initialize fs: %w", err)
}
return fs, nil
}
@@ -134,18 +137,18 @@ func newSrcFS() (*fslocal.FS, error) {
func newDstFS() (common.RemoteFS, error) {
fs, err := actions.NewRemoteFS(*dst)
if err != nil {
return nil, fmt.Errorf("cannot parse `-dst`=%q: %s", *dst, err)
return nil, fmt.Errorf("cannot parse `-dst`=%q: %w", *dst, err)
}
return fs, nil
}
func newOriginFS() (common.RemoteFS, error) {
func newOriginFS() (common.OriginFS, error) {
if len(*origin) == 0 {
return nil, nil
return &fsnil.FS{}, nil
}
fs, err := actions.NewRemoteFS(*origin)
if err != nil {
return nil, fmt.Errorf("cannot parse `-origin`=%q: %s", *origin, err)
return nil, fmt.Errorf("cannot parse `-origin`=%q: %w", *origin, err)
}
return fs, nil
}

View File

@@ -20,26 +20,27 @@ type snapshot struct {
// Create creates a snapshot at the provided api endpoint and returns
// the snapshot name
func Create(createSnapshotURL string) (string, error) {
logger.Infof("%s", "Creating snapshot")
logger.Infof("Creating snapshot")
u, err := url.Parse(createSnapshotURL)
if err != nil {
return "", err
}
resp, err := http.Get(u.String())
if err != nil {
return "", err
}
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return "", err
}
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("unexpected status code returned from %q; expecting %d; got %d; response body: %q", createSnapshotURL, resp.StatusCode, http.StatusOK, body)
}
snap := snapshot{}
err = json.Unmarshal(body, &snap)
if err != nil {
return "", err
return "", fmt.Errorf("cannot parse JSON response from %q: %w; response body: %q", createSnapshotURL, err, body)
}
if snap.Status == "ok" {
@@ -58,26 +59,26 @@ func Delete(deleteSnapshotURL string, snapshotName string) error {
formData := url.Values{
"snapshot": {snapshotName},
}
u, err := url.Parse(deleteSnapshotURL)
if err != nil {
return err
}
resp, err := http.PostForm(u.String(), formData)
if err != nil {
return err
}
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return err
}
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected status code returned from %q; expecting %d; got %d; response body: %q", deleteSnapshotURL, resp.StatusCode, http.StatusOK, body)
}
snap := snapshot{}
err = json.Unmarshal(body, &snap)
if err != nil {
return err
return fmt.Errorf("cannot parse JSON response from %q: %w; response body: %q", deleteSnapshotURL, err, body)
}
if snap.Status == "ok" {

73
app/vmctl/Makefile Normal file
View File

@@ -0,0 +1,73 @@
# All these commands must run from repository root.
vmctl:
APP_NAME=vmctl $(MAKE) app-local
vmctl-race:
APP_NAME=vmctl RACE=-race $(MAKE) app-local
vmctl-prod:
APP_NAME=vmctl $(MAKE) app-via-docker
vmctl-pure-prod:
APP_NAME=vmctl $(MAKE) app-via-docker-pure
vmctl-amd64-prod:
APP_NAME=vmctl $(MAKE) app-via-docker-amd64
vmctl-arm-prod:
APP_NAME=vmctl $(MAKE) app-via-docker-arm
vmctl-arm64-prod:
APP_NAME=vmctl $(MAKE) app-via-docker-arm64
vmctl-ppc64le-prod:
APP_NAME=vmctl $(MAKE) app-via-docker-ppc64le
vmctl-386-prod:
APP_NAME=vmctl $(MAKE) app-via-docker-386
package-vmctl:
APP_NAME=vmctl $(MAKE) package-via-docker
package-vmctl-pure:
APP_NAME=vmctl $(MAKE) package-via-docker-pure
package-vmctl-amd64:
APP_NAME=vmctl $(MAKE) package-via-docker-amd64
package-vmctl-arm:
APP_NAME=vmctl $(MAKE) package-via-docker-arm
package-vmctl-arm64:
APP_NAME=vmctl $(MAKE) package-via-docker-arm64
package-vmctl-ppc64le:
APP_NAME=vmctl $(MAKE) package-via-docker-ppc64le
package-vmctl-386:
APP_NAME=vmctl $(MAKE) package-via-docker-386
publish-vmctl:
APP_NAME=vmctl $(MAKE) publish-via-docker
vmctl-amd64:
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmctl-local-with-goarch
vmctl-arm:
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmctl-local-with-goarch
vmctl-arm64:
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmctl-local-with-goarch
vmctl-ppc64le:
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmctl-local-with-goarch
vmctl-386:
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmctl-local-with-goarch
vmctl-local-with-goarch:
APP_NAME=vmctl $(MAKE) app-local-with-goarch
vmctl-pure:
APP_NAME=vmctl $(MAKE) app-local-pure

473
app/vmctl/README.md Normal file
View File

@@ -0,0 +1,473 @@
# vmctl - VictoriaMetrics command-line tool
Features:
- [x] Prometheus: migrate data from Prometheus to VictoriaMetrics using snapshot API
- [x] Thanos: migrate data from Thanos to VictoriaMetrics
- [ ] ~~Prometheus: migrate data from Prometheus to VictoriaMetrics by query~~(discarded)
- [x] InfluxDB: migrate data from InfluxDB to VictoriaMetrics
- [ ] Storage Management: data re-balancing between nodes
# Table of contents
* [Articles](#articles)
* [How to build](#how-to-build)
* [Migrating data from InfluxDB 1.x](#migrating-data-from-influxdb-1x)
* [Data mapping](#data-mapping)
* [Configuration](#configuration)
* [Filtering](#filtering)
* [Migrating data from InfluxDB 2.x](#migrating-data-from-influxdb-2x)
* [Migrating data from Prometheus](#migrating-data-from-prometheus)
* [Data mapping](#data-mapping-1)
* [Configuration](#configuration-1)
* [Filtering](#filtering-1)
* [Migrating data from Thanos](#migrating-data-from-thanos)
* [Current data](#current-data)
* [Historical data](#historical-data)
* [Migrating data from VictoriaMetrics](#migrating-data-from-victoriametrics)
* [Native protocol](#native-protocol)
* [Tuning](#tuning)
* [Influx mode](#influx-mode)
* [Prometheus mode](#prometheus-mode)
* [VictoriaMetrics importer](#victoriametrics-importer)
* [Importer stats](#importer-stats)
* [Significant figures](#significant-figures)
* [Adding extra labels](#adding-extra-labels)
## Articles
* [How to migrate data from Prometheus](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043)
* [How to migrate data from Prometheus. Filtering and modifying time series](https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-filtering-and-modifying-time-series-6d40cea4bf21)
## How to build
It is recommended to use [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmctl` is located in `vmutils-*` archives there.
### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmctl` from the root folder of the repository.
It builds `vmctl` binary and puts it into the `bin` folder.
### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmctl-prod` from the root folder of the repository.
It builds `vmctl-prod` binary and puts it into the `bin` folder.
### Building docker images
Run `make package-vmctl`. It builds `victoriametrics/vmctl:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmctl`.
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=scratch make package-vmctl
```
### ARM build
ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://blog.cloudflare.com/arm-takes-wing/).
#### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmctl-arm` or `make vmctl-arm64` from the root folder of the repository.
It builds `vmctl-arm` or `vmctl-arm64` binary respectively and puts it into the `bin` folder.
#### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmctl-arm-prod` or `make vmctl-arm64-prod` from the root folder of the repository.
It builds `vmctl-arm-prod` or `vmctl-arm64-prod` binary respectively and puts it into the `bin` folder.
## Migrating data from InfluxDB (1.x)
`vmctl` supports the `influx` mode to migrate data from InfluxDB to VictoriaMetrics time-series database.
See `./vmctl influx --help` for details and full list of flags.
To use migration tool please specify the InfluxDB address `--influx-addr`, the database `--influx-database` and VictoriaMetrics address `--vm-addr`.
Flag `--vm-addr` for single-node VM is usually equal to `--httpListenAddr`, and for cluster version
is equal to `--httpListenAddr` flag of VMInsert component. Please note, that vmctl performs initial readiness check for the given address
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
See more details for cluster version [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
As soon as required flags are provided and all endpoints are accessible, `vmctl` will start the InfluxDB scheme exploration.
Basically, it just fetches all fields and timeseries from the provided database and builds up registry of all available timeseries.
Then `vmctl` sends fetch requests for each timeseries to InfluxDB one by one and passes the results to VM importer.
VM importer then accumulates received samples in batches and sends import requests to VM.
The importing process example for local installation of InfluxDB(`http://localhost:8086`)
and single-node VictoriaMetrics(`http://localhost:8428`):
```
./vmctl influx --influx-database benchmark
InfluxDB import mode
2020/01/18 20:47:11 Exploring scheme for database "benchmark"
2020/01/18 20:47:11 fetching fields: command: "show field keys"; database: "benchmark"; retention: "autogen"
2020/01/18 20:47:11 found 10 fields
2020/01/18 20:47:11 fetching series: command: "show series "; database: "benchmark"; retention: "autogen"
Found 40000 timeseries to import. Continue? [Y/n] y
40000 / 40000 [-----------------------------------------------------------------------------------------------------------------------------------------------] 100.00% 21 p/s
2020/01/18 21:19:00 Import finished!
2020/01/18 21:19:00 VictoriaMetrics importer stats:
idle duration: 13m51.461434876s;
time spent while importing: 17m56.923899847s;
total samples: 345600000;
samples/s: 320914.04;
total bytes: 5.9 GB;
bytes/s: 5.4 MB;
import requests: 40001;
2020/01/18 21:19:00 Total time: 31m48.467044016s
```
### Data mapping
Vmctl maps Influx data the same way as VictoriaMetrics does by using the following rules:
* `influx-database` arg is mapped into `db` label value unless `db` tag exists in the Influx line.
* Field names are mapped to time series names prefixed with {measurement}{separator} value,
where {separator} equals to _ by default.
It can be changed with `--influx-measurement-field-separator` command-line flag.
* Field values are mapped to time series values.
* Tags are mapped to Prometheus labels format as-is.
For example, the following Influx line:
```
foo,tag1=value1,tag2=value2 field1=12,field2=40
```
is converted into the following Prometheus format data points:
```
foo_field1{tag1="value1", tag2="value2"} 12
foo_field2{tag1="value1", tag2="value2"} 40
```
### Configuration
The configuration flags should contain self-explanatory descriptions.
### Filtering
The filtering consists of two parts: timeseries and time.
The first step of application is to select all available timeseries
for given database and retention. User may specify additional filtering
condition via `--influx-filter-series` flag. For example:
```
./vmctl influx --influx-database benchmark \
--influx-filter-series "on benchmark from cpu where hostname='host_1703'"
InfluxDB import mode
2020/01/26 14:23:29 Exploring scheme for database "benchmark"
2020/01/26 14:23:29 fetching fields: command: "show field keys"; database: "benchmark"; retention: "autogen"
2020/01/26 14:23:29 found 12 fields
2020/01/26 14:23:29 fetching series: command: "show series on benchmark from cpu where hostname='host_1703'"; database: "benchmark"; retention: "autogen"
Found 10 timeseries to import. Continue? [Y/n]
```
The timeseries select query would be following:
`fetching series: command: "show series on benchmark from cpu where hostname='host_1703'"; database: "benchmark"; retention: "autogen"`
The second step of filtering is a time filter and it applies when fetching the datapoints from Influx.
Time filtering may be configured with two flags:
* --influx-filter-time-start
* --influx-filter-time-end
Here's an example of importing timeseries for one day only:
`./vmctl influx --influx-database benchmark --influx-filter-series "where hostname='host_1703'" --influx-filter-time-start "2020-01-01T10:07:00Z" --influx-filter-time-end "2020-01-01T15:07:00Z"`
Please see more about time filtering [here](https://docs.influxdata.com/influxdb/v1.7/query_language/schema_exploration#filter-meta-queries-by-time).
## Migrating data from InfluxDB (2.x)
Migrating data from InfluxDB v2.x is not supported yet ([#32](https://github.com/VictoriaMetrics/vmctl/issues/32)).
You may find useful a 3rd party solution for this - https://github.com/jonppe/influx_to_victoriametrics.
## Migrating data from Prometheus
`vmctl` supports the `prometheus` mode for migrating data from Prometheus to VictoriaMetrics time-series database.
Migration is based on reading Prometheus snapshot, which is basically a hard-link to Prometheus data files.
See `./vmctl prometheus --help` for details and full list of flags.
To use migration tool please specify the path to Prometheus snapshot `--prom-snapshot` and VictoriaMetrics address `--vm-addr`.
More about Prometheus snapshots may be found [here](https://www.robustperception.io/taking-snapshots-of-prometheus-data).
Flag `--vm-addr` for single-node VM is usually equal to `--httpListenAddr`, and for cluster version
is equal to `--httpListenAddr` flag of VMInsert component. Please note, that vmctl performs initial readiness check for the given address
by checking `/health` endpoint. For cluster version it is additionally required to specify the `--vm-account-id` flag.
See more details for cluster version [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
As soon as required flags are provided and all endpoints are accessible, `vmctl` will start the Prometheus snapshot exploration.
Basically, it just fetches all available blocks in the provided snapshot and reads the metadata. It also does initial filtering by time
if flags `--prom-filter-time-start` or `--prom-filter-time-end` were set. The exploration procedure prints some stats from read blocks.
Please note that stats are not taking into account timeseries or samples filtering. This will be done during importing process.
The importing process takes the snapshot blocks revealed from Explore procedure and processes them one by one
accumulating timeseries and samples. Please note, that `vmctl` relies on responses from Influx on this stage,
so ensure that Explore queries are executed without errors or limits. Please see this
[issue](https://github.com/VictoriaMetrics/vmctl/issues/30) for details.
The data is processed in chunks and then sent to VM.
The importing process example for local installation of Prometheus
and single-node VictoriaMetrics(`http://localhost:8428`):
```
./vmctl prometheus --prom-snapshot=/path/to/snapshot \
--vm-concurrency=1 \
--vm-batch-size=200000 \
--prom-concurrency=3
Prometheus import mode
Prometheus snapshot stats:
blocks found: 14;
blocks skipped: 0;
min time: 1581288163058 (2020-02-09T22:42:43Z);
max time: 1582409128139 (2020-02-22T22:05:28Z);
samples: 32549106;
series: 27289.
Found 14 blocks to import. Continue? [Y/n] y
14 / 14 [-------------------------------------------------------------------------------------------] 100.00% 0 p/s
2020/02/23 15:50:03 Import finished!
2020/02/23 15:50:03 VictoriaMetrics importer stats:
idle duration: 6.152953029s;
time spent while importing: 44.908522491s;
total samples: 32549106;
samples/s: 724786.84;
total bytes: 669.1 MB;
bytes/s: 14.9 MB;
import requests: 323;
import requests retries: 0;
2020/02/23 15:50:03 Total time: 51.077451066s
```
### Data mapping
VictoriaMetrics has very similar data model to Prometheus and supports [RemoteWrite integration](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage).
So no data changes will be applied.
### Configuration
The configuration flags should contain self-explanatory descriptions.
### Filtering
The filtering consists of two parts: by timeseries and by time.
Filtering by time may be configured via flags `--prom-filter-time-start` and `--prom-filter-time-end`
in RFC3339 format. This filter is applied twice: to drop blocks out of range and to filter timeseries in blocks with
overlapping time range.
Example of applying time filter:
```
./vmctl prometheus --prom-snapshot=/path/to/snapshot \
--prom-filter-time-start=2020-02-07T00:07:01Z \
--prom-filter-time-end=2020-02-11T00:07:01Z
Prometheus import mode
Prometheus snapshot stats:
blocks found: 2;
blocks skipped: 12;
min time: 1581288163058 (2020-02-09T22:42:43Z);
max time: 1581328800000 (2020-02-10T10:00:00Z);
samples: 1657698;
series: 3930.
Found 2 blocks to import. Continue? [Y/n] y
```
Please notice, that total amount of blocks in provided snapshot is 14, but only 2 of them were in provided
time range. So other 12 blocks were marked as `skipped`. The amount of samples and series is not taken into account,
since this is heavy operation and will be done during import process.
Filtering by timeseries is configured with following flags:
* `--prom-filter-label` - the label name, e.g. `__name__` or `instance`;
* `--prom-filter-label-value` - the regular expression to filter the label value. By default matches all `.*`
For example:
```
./vmctl prometheus --prom-snapshot=/path/to/snapshot \
--prom-filter-label="__name__" \
--prom-filter-label-value="promhttp.*" \
--prom-filter-time-start=2020-02-07T00:07:01Z \
--prom-filter-time-end=2020-02-11T00:07:01Z
Prometheus import mode
Prometheus snapshot stats:
blocks found: 2;
blocks skipped: 12;
min time: 1581288163058 (2020-02-09T22:42:43Z);
max time: 1581328800000 (2020-02-10T10:00:00Z);
samples: 1657698;
series: 3930.
Found 2 blocks to import. Continue? [Y/n] y
14 / 14 [------------------------------------------------------------------------------------------------------------------------------------------------------] 100.00% ? p/s
2020/02/23 15:51:07 Import finished!
2020/02/23 15:51:07 VictoriaMetrics importer stats:
idle duration: 0s;
time spent while importing: 37.415461ms;
total samples: 10128;
samples/s: 270690.24;
total bytes: 195.2 kB;
bytes/s: 5.2 MB;
import requests: 2;
import requests retries: 0;
2020/02/23 15:51:07 Total time: 7.153158218s
```
## Migrating data from Thanos
Thanos uses the same storage engine as Prometheus and the data layout on-disk should be the same. That means
`vmctl` in mode `prometheus` may be used for Thanos historical data migration as well.
These instructions may vary based on the details of your Thanos configuration.
Please read carefully and verify as you go. We assume you're using Thanos Sidecar on your Prometheus pods,
and that you have a separate Thanos Store installation.
### Current data
1. For now, keep your Thanos Sidecar and Thanos-related Prometheus configuration, but add this to also stream
metrics to VictoriaMetrics:
```
remote_write:
- url: http://victoria-metrics:8428/api/v1/write
```
2. Make sure VM is running, of course. Now check the logs to make sure that Prometheus is sending and VM is receiving.
In Prometheus, make sure there are no errors. On the VM side, you should see messages like this:
```
2020-04-27T18:38:46.474Z info VictoriaMetrics/lib/storage/partition.go:207 creating a partition "2020_04" with smallPartsPath="/victoria-metrics-data/data/small/2020_04", bigPartsPath="/victoria-metrics-data/data/big/2020_04"
2020-04-27T18:38:46.506Z info VictoriaMetrics/lib/storage/partition.go:222 partition "2020_04" has been created
```
3. Now just wait. Within two hours, Prometheus should finish its current data file and hand it off to Thanos Store for long term
storage.
### Historical data
Let's assume your data is stored on S3 served by minio. You first need to copy that out to a local filesystem,
then import it into VM using `vmctl` in `prometheus` mode.
1. Copy data from minio.
1. Run the `minio/mc` Docker container.
1. `mc config host add minio http://minio:9000 accessKey secretKey`, substituting appropriate values for the last 3 items.
1. `mc cp -r minio/prometheus thanos-data`
1. Import using `vmctl`.
1. Follow the [instructions](#how-to-build) to compile `vmctl` on your machine.
1. Use [prometheus](#migrating-data-from-prometheus) mode to import data:
```
vmctl prometheus --prom-snapshot thanos-data --vm-addr http://victoria-metrics:8428
```
## Migrating data from VictoriaMetrics
### Native protocol
The [native binary protocol](https://victoriametrics.github.io/#how-to-export-data-in-native-format)
was introduced in [1.42.0 release](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.42.0)
and provides the most efficient way to migrate data between VM instances: single to single, cluster to cluster,
single to cluster and vice versa. Please note that both instances (source and destination) should be of v1.42.0
or higher.
See `./vmctl vm-native --help` for details and full list of flags.
In this mode `vmctl` acts as a proxy between two VM instances, where time series filtering is done by "source" (`src`)
and processing is done by "destination" (`dst`). Because of that, `vmctl` doesn't actually know how much data will be
processed and can't show the progress bar. It will show the current processing speed and total number of processed bytes:
```
./vmctl vm-native --vm-native-src-addr=http://localhost:8528 \
--vm-native-dst-addr=http://localhost:8428 \
--vm-native-filter-match='{job="vmagent"}' \
--vm-native-filter-time-start='2020-01-01T20:07:00Z'
VictoriaMetrics Native import mode
Initing export pipe from "http://localhost:8528" with filters:
filter: match[]={job="vmagent"}
Initing import process to "http://localhost:8428":
Total: 336.75 KiB ↖ Speed: 454.46 KiB p/s
2020/10/13 17:04:59 Total time: 952.143376ms
```
Importing tips:
1. Migrating all the metrics from one VM to another may collide with existing application metrics
(prefixed with `vm_`) at destination and lead to confusion when using
[official Grafana dashboards](https://grafana.com/orgs/victoriametrics/dashboards).
To avoid such situation try to filter out VM process metrics via `--vm-native-filter-match` flag.
2. Migration is a backfilling process, so it is recommended to read
[Backfilling tips](https://github.com/VictoriaMetrics/VictoriaMetrics#backfilling) section.
3. `vmctl` doesn't provide relabeling or other types of labels management in this mode.
Instead, use [relabeling in VictoriaMetrics](https://github.com/VictoriaMetrics/vmctl/issues/4#issuecomment-683424375).
## Tuning
### Influx mode
The flag `--influx-concurrency` controls how many concurrent requests may be sent to InfluxDB while fetching
timeseries. Please set it wisely to avoid InfluxDB overwhelming.
The flag `--influx-chunk-size` controls the max amount of datapoints to return in single chunk from fetch requests.
Please see more details [here](https://docs.influxdata.com/influxdb/v1.7/guides/querying_data/#chunking).
The chunk size is used to control InfluxDB memory usage, so it won't OOM on processing large timeseries with
billions of datapoints.
### Prometheus mode
The flag `--prom-concurrency` controls how many concurrent readers will be reading the blocks in snapshot.
Since snapshots are just files on disk it would be hard to overwhelm the system. Please go with value equal
to number of free CPU cores.
### VictoriaMetrics importer
The flag `--vm-concurrency` controls the number of concurrent workers that process the input from InfluxDB query results.
Please note that each import request can load up to a single vCPU core on VictoriaMetrics. So try to set it according
to allocated CPU resources of your VictoriaMetrics installation.
The flag `--vm-batch-size` controls max amount of samples collected before sending the import request.
For example, if `--influx-chunk-size=500` and `--vm-batch-size=2000` then importer will process not more
than 4 chunks before sending the request.
### Importer stats
After successful import `vmctl` prints some statistics for details.
The important numbers to watch are following:
- `idle duration` - shows time that importer spent while waiting for data from InfluxDB/Prometheus
to fill up `--vm-batch-size` batch size. Value shows total duration across all workers configured
via `--vm-concurrency`. High value may be a sign of too slow InfluxDB/Prometheus fetches or too
high `--vm-concurrency` value. Try to improve it by increasing `--<mode>-concurrency` value or
decreasing `--vm-concurrency` value.
- `import requests` - shows how many import requests were issued to VM server.
The import request is issued once the batch size(`--vm-batch-size`) is full and ready to be sent.
Please prefer big batch sizes (50k-500k) to improve performance.
- `import requests retries` - shows number of unsuccessful import requests. Non-zero value may be
a sign of network issues or VM being overloaded. See the logs during import for error messages.
### Silent mode
By default `vmctl` waits for confirmation from the user before starting the import. If this is unwanted
behavior and no user interaction is required - pass `-s` flag to enable "silence" mode:
```
-s Whether to run in silent mode. If set to true no confirmation prompts will appear. (default: false)
```
### Significant figures
`vmctl` allows to limit the number of [significant figures](https://en.wikipedia.org/wiki/Significant_figures)
before importing. For example, the average value for response size is `102.342305` bytes and it has 9 significant figures.
If you ask a human to pronounce this value then with high probability value will be rounded to first 4 or 5 figures
because the rest aren't really that important to mention. In most cases, such a high precision is too much.
Moreover, such values may be just a result of [floating point arithmetic](https://en.wikipedia.org/wiki/Floating-point_arithmetic),
create a [false precision](https://en.wikipedia.org/wiki/False_precision) and result into bad compression ratio
according to [information theory](https://en.wikipedia.org/wiki/Information_theory).
`vmctl` provides the following flags for improving data compression:
* `--vm-round-digits` flag for rounding processed values to the given number of decimal digits after the point.
For example, `--vm-round-digits=2` would round `1.2345` to `1.23`. By default the rounding is disabled.
* `--vm-significant-figures` flag for limiting the number of significant figures in processed values. It takes no effect if set
to 0 (by default), but set `--vm-significant-figures=5` and `102.342305` will be rounded to `102.34`.
The most common case for using these flags is to improve data compression for time series storing aggregation
results such as `average`, `rate`, etc.
### Adding extra labels
`vmctl` allows adding extra labels to all imported series. It can be achieved with flag `--vm-extra-label label=value`.
If multiple labels need to be added, set the flag for each label, for example, `--vm-extra-label label1=value1 --vm-extra-label label2=value2`.
If a timeseries already has a label that is also set via the `--vm-extra-label` flag, the flag has priority and overrides the label value from the timeseries.

View File

@@ -0,0 +1,6 @@
# Image for the vmctl production binary.
# base_image is a build argument selecting the image to build on top of.
ARG base_image
FROM $base_image
# The container runs the binary copied below.
ENTRYPOINT ["/vmctl-prod"]
# src_binary is a build argument holding the path of the prebuilt binary.
ARG src_binary
# NOTE(review): the destination is relative while ENTRYPOINT is absolute; this
# matches only if the base image's working directory is / - confirm.
COPY $src_binary ./vmctl-prod

292
app/vmctl/flags.go Normal file
View File

@@ -0,0 +1,292 @@
package main
import (
"fmt"
"github.com/urfave/cli/v2"
)
// globalSilent is the name of the flag that switches off confirmation prompts.
const globalSilent = "s"

// globalFlags holds the flags that are not tied to a particular import mode.
// Currently it contains only the silent-mode switch.
var globalFlags = []cli.Flag{
	&cli.BoolFlag{
		Name:  globalSilent,
		Usage: "Whether to run in silent mode. If set to true no confirmation prompts will appear.",
		Value: false,
	},
}
// Names of the command-line flags that configure the VictoriaMetrics importer.
const (
	vmAddr               = "vm-addr"
	vmUser               = "vm-user"
	vmPassword           = "vm-password"
	vmAccountID          = "vm-account-id"
	vmConcurrency        = "vm-concurrency"
	vmCompress           = "vm-compress"
	vmBatchSize          = "vm-batch-size"
	vmSignificantFigures = "vm-significant-figures"
	vmRoundDigits        = "vm-round-digits"
	vmExtraLabel         = "vm-extra-label"
)

// vmFlags describes the flags of the VictoriaMetrics importer: destination
// address, credentials, tenant, concurrency, batching, value rounding and
// extra labels.
var (
	vmFlags = []cli.Flag{
		&cli.StringFlag{
			Name:  vmAddr,
			Value: "http://localhost:8428",
			Usage: "VictoriaMetrics address to perform import requests. \n" +
				"Should be the same as --httpListenAddr value for single-node version or VMInsert component. \n" +
				"Please note, that `vmctl` performs initial readiness check for the given address by checking `/health` endpoint.",
		},
		&cli.StringFlag{
			Name:    vmUser,
			Usage:   "VictoriaMetrics username for basic auth",
			EnvVars: []string{"VM_USERNAME"},
		},
		&cli.StringFlag{
			Name:    vmPassword,
			Usage:   "VictoriaMetrics password for basic auth",
			EnvVars: []string{"VM_PASSWORD"},
		},
		&cli.StringFlag{
			Name: vmAccountID,
			Usage: "AccountID is an arbitrary 32-bit integer identifying namespace for data ingestion (aka tenant). \n" +
				"It is possible to set it as accountID:projectID, where projectID is also arbitrary 32-bit integer. \n" +
				"If projectID isn't set, then it equals to 0",
		},
		&cli.UintFlag{
			Name:  vmConcurrency,
			Usage: "Number of workers concurrently performing import requests to VM",
			Value: 2,
		},
		&cli.BoolFlag{
			Name:  vmCompress,
			Value: true,
			Usage: "Whether to apply gzip compression to import requests",
		},
		&cli.IntFlag{
			Name:  vmBatchSize,
			Value: 200e3,
			Usage: "How many samples importer collects before sending the import request to VM",
		},
		&cli.IntFlag{
			Name:  vmSignificantFigures,
			Value: 0,
			Usage: "The number of significant figures to leave in metric values before importing. " +
				"See https://en.wikipedia.org/wiki/Significant_figures. Zero value saves all the significant figures. " +
				"This option may be used for increasing on-disk compression level for the stored metrics. " +
				"See also --vm-round-digits option",
		},
		&cli.IntFlag{
			Name: vmRoundDigits,
			// NOTE(review): presumably 100 decimal digits leaves values
			// untouched, i.e. rounding is off by default - confirm.
			Value: 100,
			Usage: "Round metric values to the given number of decimal digits after the point. " +
				"This option may be used for increasing on-disk compression level for the stored metrics",
		},
		&cli.StringSliceFlag{
			Name:  vmExtraLabel,
			Value: nil,
			// The trailing space after "flag" keeps the two concatenated
			// halves from running together in the help output.
			Usage: "Extra labels, that will be added to imported timeseries. In case of collision, label value defined by flag " +
				"will have priority. Flag can be set multiple times, to add few additional labels.",
		},
	}
)
// Names of the command-line flags that configure the InfluxDB source.
const (
	influxAddr                      = "influx-addr"
	influxUser                      = "influx-user"
	influxPassword                  = "influx-password"
	influxDB                        = "influx-database"
	influxRetention                 = "influx-retention-policy"
	influxChunkSize                 = "influx-chunk-size"
	influxConcurrency               = "influx-concurrency"
	influxFilterSeries              = "influx-filter-series"
	influxFilterTimeStart           = "influx-filter-time-start"
	influxFilterTimeEnd             = "influx-filter-time-end"
	influxMeasurementFieldSeparator = "influx-measurement-field-separator"
)

// influxFlags describes the flags of the `influx` mode: server address,
// credentials, database and retention policy, fetch tuning, series/time
// filters and the measurement-field separator.
var (
	influxFlags = []cli.Flag{
		&cli.StringFlag{
			Name:  influxAddr,
			Value: "http://localhost:8086",
			Usage: "Influx server addr",
		},
		&cli.StringFlag{
			Name:    influxUser,
			Usage:   "Influx user",
			EnvVars: []string{"INFLUX_USERNAME"},
		},
		&cli.StringFlag{
			Name:    influxPassword,
			Usage:   "Influx user password",
			EnvVars: []string{"INFLUX_PASSWORD"},
		},
		&cli.StringFlag{
			Name:     influxDB,
			Usage:    "Influx database",
			Required: true,
		},
		&cli.StringFlag{
			Name:  influxRetention,
			Usage: "Influx retention policy",
			Value: "autogen",
		},
		&cli.IntFlag{
			Name: influxChunkSize,
			// The chunk size limits datapoints, not series, per chunk; the
			// help text now agrees with the README's description of the flag.
			Usage: "The chunkSize defines max amount of datapoints to be returned in one chunk",
			Value: 10e3,
		},
		&cli.IntFlag{
			Name:  influxConcurrency,
			Usage: "Number of concurrently running fetch queries to InfluxDB",
			Value: 1,
		},
		&cli.StringFlag{
			Name: influxFilterSeries,
			Usage: "Influx filter expression to select series. E.g. \"from cpu where arch='x86' AND hostname='host_2753'\".\n" +
				"See for details https://docs.influxdata.com/influxdb/v1.7/query_language/schema_exploration#show-series",
		},
		&cli.StringFlag{
			Name:  influxFilterTimeStart,
			Usage: "The time filter to select timeseries with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'",
		},
		&cli.StringFlag{
			Name:  influxFilterTimeEnd,
			Usage: "The time filter to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'",
		},
		&cli.StringFlag{
			Name:  influxMeasurementFieldSeparator,
			Usage: "The {separator} symbol used to concatenate {measurement} and {field} names into series name {measurement}{separator}{field}.",
			Value: "_",
		},
	}
)
// Names of the command-line flags that configure the Prometheus snapshot source.
const (
	promSnapshot         = "prom-snapshot"
	promConcurrency      = "prom-concurrency"
	promFilterTimeStart  = "prom-filter-time-start"
	promFilterTimeEnd    = "prom-filter-time-end"
	promFilterLabel      = "prom-filter-label"
	promFilterLabelValue = "prom-filter-label-value"
)

// promFlags describes the flags of the `prometheus` mode: snapshot location,
// reader concurrency, and the time and label filters.
var promFlags = []cli.Flag{
	// Snapshot location; the only mandatory flag of this set.
	&cli.StringFlag{
		Name:     promSnapshot,
		Required: true,
		Usage:    "Path to Prometheus snapshot. Pls see for details https://www.robustperception.io/taking-snapshots-of-prometheus-data",
	},
	&cli.IntFlag{
		Name:  promConcurrency,
		Value: 1,
		Usage: "Number of concurrently running snapshot readers",
	},
	// Time range filter.
	&cli.StringFlag{
		Name:  promFilterTimeStart,
		Usage: "The time filter in RFC3339 format to select timeseries with timestamp equal or higher than provided value. E.g. '2020-01-01T20:07:00Z'",
	},
	&cli.StringFlag{
		Name:  promFilterTimeEnd,
		Usage: "The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'",
	},
	// Label filter: a label name plus a regular expression for its value.
	&cli.StringFlag{
		Name:  promFilterLabel,
		Usage: "Prometheus label name to filter timeseries by. E.g. '__name__' will filter timeseries by name.",
	},
	&cli.StringFlag{
		Name:  promFilterLabelValue,
		Value: ".*",
		Usage: fmt.Sprintf("Prometheus regular expression to filter label from %q flag.", promFilterLabel),
	},
}
// Names of the command-line flags that configure the `vm-native` mode.
const (
	vmNativeFilterMatch     = "vm-native-filter-match"
	vmNativeFilterTimeStart = "vm-native-filter-time-start"
	vmNativeFilterTimeEnd   = "vm-native-filter-time-end"

	vmNativeSrcAddr     = "vm-native-src-addr"
	vmNativeSrcUser     = "vm-native-src-user"
	vmNativeSrcPassword = "vm-native-src-password"

	vmNativeDstAddr     = "vm-native-dst-addr"
	vmNativeDstUser     = "vm-native-dst-user"
	vmNativeDstPassword = "vm-native-dst-password"
)

// vmNativeFlags describes the flags of the `vm-native` mode: the export
// filters, the source and destination VictoriaMetrics endpoints with their
// credentials, and the extra labels to attach to imported series.
var (
	vmNativeFlags = []cli.Flag{
		&cli.StringFlag{
			Name: vmNativeFilterMatch,
			Usage: "Time series selector to match series for export. For example, select {instance!=\"localhost\"} will " +
				"match all series with \"instance\" label different to \"localhost\".\n" +
				" See more details here https://github.com/VictoriaMetrics/VictoriaMetrics#how-to-export-data-in-native-format",
			Value: `{__name__!=""}`,
		},
		&cli.StringFlag{
			Name:  vmNativeFilterTimeStart,
			Usage: "The time filter may contain either unix timestamp in seconds or RFC3339 values. E.g. '2020-01-01T20:07:00Z'",
		},
		&cli.StringFlag{
			Name:  vmNativeFilterTimeEnd,
			Usage: "The time filter may contain either unix timestamp in seconds or RFC3339 values. E.g. '2020-01-01T20:07:00Z'",
		},
		&cli.StringFlag{
			Name: vmNativeSrcAddr,
			Usage: "VictoriaMetrics address to perform export from. \n" +
				" Should be the same as --httpListenAddr value for single-node version or VMSelect component." +
				" If exporting from cluster version - include the tenant token in address.",
			Required: true,
		},
		&cli.StringFlag{
			Name:    vmNativeSrcUser,
			Usage:   "VictoriaMetrics username for basic auth",
			EnvVars: []string{"VM_NATIVE_SRC_USERNAME"},
		},
		&cli.StringFlag{
			Name:    vmNativeSrcPassword,
			Usage:   "VictoriaMetrics password for basic auth",
			EnvVars: []string{"VM_NATIVE_SRC_PASSWORD"},
		},
		&cli.StringFlag{
			Name: vmNativeDstAddr,
			Usage: "VictoriaMetrics address to perform import to. \n" +
				" Should be the same as --httpListenAddr value for single-node version or VMInsert component." +
				" If importing into cluster version - include the tenant token in address.",
			Required: true,
		},
		&cli.StringFlag{
			Name:    vmNativeDstUser,
			Usage:   "VictoriaMetrics username for basic auth",
			EnvVars: []string{"VM_NATIVE_DST_USERNAME"},
		},
		&cli.StringFlag{
			Name:    vmNativeDstPassword,
			Usage:   "VictoriaMetrics password for basic auth",
			EnvVars: []string{"VM_NATIVE_DST_PASSWORD"},
		},
		&cli.StringSliceFlag{
			Name:  vmExtraLabel,
			Value: nil,
			// The trailing space after "flag" keeps the two concatenated
			// halves from running together in the help output.
			Usage: "Extra labels, that will be added to imported timeseries. In case of collision, label value defined by flag " +
				"will have priority. Flag can be set multiple times, to add few additional labels.",
		},
	}
)
// mergeFlags concatenates the given groups of flags, in the order they are
// passed, into a single flat slice.
func mergeFlags(flags ...[]cli.Flag) []cli.Flag {
	var merged []cli.Flag
	for i := range flags {
		merged = append(merged, flags[i]...)
	}
	return merged
}

146
app/vmctl/influx.go Normal file
View File

@@ -0,0 +1,146 @@
package main
import (
"fmt"
"io"
"log"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/cheggaaa/pb/v3"
)
// influxProcessor migrates timeseries from InfluxDB
// into VictoriaMetrics.
type influxProcessor struct {
	// ic explores the InfluxDB schema and fetches datapoints
	ic *influx.Client
	// im receives the fetched timeseries for import
	im *vm.Importer
	// cc stands for concurrency and defines the number
	// of concurrently running fetch workers
	cc int
	// separator is put between the measurement and the field name
	// when the name of the imported timeseries is built
	separator string
}
// newInfluxProcessor returns a processor which reads timeseries via ic
// and writes them via im using cc concurrent workers.
// A cc value below 1 is replaced with 1.
func newInfluxProcessor(ic *influx.Client, im *vm.Importer, cc int, separator string) *influxProcessor {
	ip := &influxProcessor{
		ic:        ic,
		im:        im,
		cc:        1,
		separator: separator,
	}
	if cc >= 1 {
		ip.cc = cc
	}
	return ip
}
// run explores the InfluxDB schema, asks the user for confirmation
// (unless silent is set) and imports every discovered timeseries
// using concurrently running workers.
//
// The first error reported by a worker or by the importer
// aborts the import and is returned.
func (ip *influxProcessor) run(silent bool) error {
	series, err := ip.ic.Explore()
	if err != nil {
		return fmt.Errorf("explore query failed: %s", err)
	}
	if len(series) < 1 {
		return fmt.Errorf("found no timeseries to import")
	}

	question := fmt.Sprintf("Found %d timeseries to import. Continue?", len(series))
	if !silent && !prompt(question) {
		return nil
	}

	// At least one worker is required, otherwise
	// nobody would read from seriesCh.
	workers := ip.cc
	if workers < 1 {
		workers = 1
	}

	bar := pb.StartNew(len(series))
	seriesCh := make(chan *influx.Series)
	// Every worker sends at most one error before exiting, so a buffer
	// of `workers` elements guarantees that a failing worker never blocks
	// on the send, even when nobody is receiving from errCh anymore.
	errCh := make(chan error, workers)
	ip.im.ResetStats()

	var wg sync.WaitGroup
	wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			for s := range seriesCh {
				if err := ip.do(s); err != nil {
					errCh <- fmt.Errorf("request failed for %q.%q: %s", s.Measurement, s.Field, err)
					return
				}
				bar.Increment()
			}
		}()
	}

	// any error breaks the import
	for _, s := range series {
		select {
		case infErr := <-errCh:
			// let the remaining workers exit
			close(seriesCh)
			return fmt.Errorf("influx error: %s", infErr)
		case vmErr := <-ip.im.Errors():
			close(seriesCh)
			return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
		case seriesCh <- s:
		}
	}

	close(seriesCh)
	wg.Wait()

	// A worker may have failed while processing one of the last
	// dispatched series, after the loop above stopped reading errCh.
	select {
	case infErr := <-errCh:
		return fmt.Errorf("influx error: %s", infErr)
	default:
	}

	ip.im.Close()
	// drain import errors channel
	for vmErr := range ip.im.Errors() {
		return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
	}
	bar.Finish()
	log.Println("Import finished!")
	log.Print(ip.im.Stats())
	return nil
}
// dbLabel is the name of the label which holds the name of the source
// InfluxDB database. It is added to every imported timeseries
// unless the series already has a tag with this name.
const dbLabel = "db"

// do fetches all the datapoints of the series s from InfluxDB
// chunk by chunk and passes every non-empty chunk to the importer.
func (ip *influxProcessor) do(s *influx.Series) error {
	cr, err := ip.ic.FetchDataPoints(s)
	if err != nil {
		return fmt.Errorf("failed to fetch datapoints: %s", err)
	}
	defer func() {
		// the response has been read already, so the close error is not actionable
		_ = cr.Close()
	}()

	name := s.Field
	if s.Measurement != "" {
		name = fmt.Sprintf("%s%s%s", s.Measurement, ip.separator, s.Field)
	}

	// Copy all the tags of the series, including the `db` tag and the tags
	// following it. Stopping at the `db` tag would lose the remaining tags
	// and leave empty label pairs in the result.
	labels := make([]vm.LabelPair, 0, len(s.LabelPairs)+1)
	var containsDBLabel bool
	for _, lp := range s.LabelPairs {
		if lp.Name == dbLabel {
			containsDBLabel = true
		}
		labels = append(labels, vm.LabelPair{
			Name:  lp.Name,
			Value: lp.Value,
		})
	}
	if !containsDBLabel {
		labels = append(labels, vm.LabelPair{
			Name:  dbLabel,
			Value: ip.ic.Database(),
		})
	}

	for {
		timestamps, values, err := cr.Next()
		if err != nil {
			if err == io.EOF {
				// the series has been read entirely
				return nil
			}
			return err
		}
		// skip empty results
		if len(timestamps) < 1 {
			continue
		}
		ip.im.Input() <- &vm.TimeSeries{
			Name:       name,
			LabelPairs: labels,
			Timestamps: timestamps,
			Values:     values,
		}
	}
}

362
app/vmctl/influx/influx.go Normal file
View File

@@ -0,0 +1,362 @@
package influx
import (
"fmt"
"io"
"log"
"strings"
"time"
influx "github.com/influxdata/influxdb/client/v2"
)
// Client represents a wrapper over
// influx HTTP client
type Client struct {
	influx.Client

	// database and retention are sent as Database and RetentionPolicy
	// with the queries issued by the client
	database  string
	retention string
	// chunkSize is the ChunkSize of the chunked "show series" query
	chunkSize int

	// filterSeries is appended to the "show series" command as is
	filterSeries string
	// filterTime is the condition on time added to the where clause
	// of datapoints queries; it is empty if no time bounds are configured
	filterTime string
}
// Config contains fields required
// for Client configuration
type Config struct {
	// Addr, Username and Password are passed
	// to the underlying influx HTTP client
	Addr     string
	Username string
	Password string
	// Database and Retention are the database and the retention policy
	// the queries are executed against
	Database  string
	Retention string
	// ChunkSize is the chunk size of the series exploration query.
	// NewClient replaces values below 1 with 10e3.
	ChunkSize int

	// Filter restricts the set of migrated data
	Filter Filter
}
// Filter contains configuration for filtering
// the timeseries
type Filter struct {
	// Series is appended as is to the "show series" command
	Series string
	// TimeStart, if set, produces the condition time >= '<TimeStart>'
	TimeStart string
	// TimeEnd, if set, produces the condition time <= '<TimeEnd>'
	TimeEnd string
}
// Series identifies a single time series: one field
// of a measurement together with the set of its tags.
type Series struct {
	Measurement string
	Field       string
	LabelPairs  []LabelPair
}

// valueEscaper escapes backslashes and single quotes,
// so a tag value can be put into a single-quoted string literal.
var valueEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`)

// fetchQuery builds the select query returning the datapoints of s.
// Every tag of s becomes an equality condition of the where clause;
// a non-empty timeFilter is added as is as the last condition.
func (s Series) fetchQuery(timeFilter string) string {
	conditions := make([]string, 0, len(s.LabelPairs)+1)
	for _, lp := range s.LabelPairs {
		escaped := valueEscaper.Replace(lp.Value)
		conditions = append(conditions, fmt.Sprintf("%q='%s'", lp.Name, escaped))
	}
	if timeFilter != "" {
		conditions = append(conditions, timeFilter)
	}

	query := fmt.Sprintf("select %q from %q", s.Field, s.Measurement)
	if len(conditions) == 0 {
		return query
	}
	return query + " where " + strings.Join(conditions, " and ")
}

// LabelPair is a single name-value label (tag)
// of a time series.
type LabelPair struct {
	Name  string
	Value string
}
// NewClient returns a Client built from cfg.
// The InfluxDB server is pinged before the client is returned,
// so an unreachable server is reported as an error.
func NewClient(cfg Config) (*Client, error) {
	hc, err := influx.NewHTTPClient(influx.HTTPConfig{
		Addr:               cfg.Addr,
		Username:           cfg.Username,
		Password:           cfg.Password,
		InsecureSkipVerify: true,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to establish conn: %s", err)
	}
	if _, _, err = hc.Ping(time.Second); err != nil {
		return nil, fmt.Errorf("ping failed: %s", err)
	}

	c := &Client{
		Client:       hc,
		database:     cfg.Database,
		retention:    cfg.Retention,
		chunkSize:    cfg.ChunkSize,
		filterTime:   timeFilter(cfg.Filter.TimeStart, cfg.Filter.TimeEnd),
		filterSeries: cfg.Filter.Series,
	}
	// fall back to the default chunk size
	if c.chunkSize < 1 {
		c.chunkSize = 10e3
	}
	return c, nil
}
// Database returns the name of the database the client is configured with.
//
// The pointer receiver matches the other methods of Client
// and avoids copying the whole struct on every call.
func (c *Client) Database() string {
	return c.database
}
// timeFilter builds the condition on time for the where clause
// from the optional start and end bounds. Both bounds are inclusive.
// An empty string is returned when neither bound is set.
func timeFilter(start, end string) string {
	var bounds []string
	if start != "" {
		bounds = append(bounds, fmt.Sprintf("time >= '%s'", start))
	}
	if end != "" {
		bounds = append(bounds, fmt.Sprintf("time <= '%s'", end))
	}
	return strings.Join(bounds, " and ")
}
// Explore discovers the data schema of the influx database:
// it fetches the field keys of every measurement and the list of series,
// and returns one Series per (series, field) combination.
// Knowing the exact combinations allows querying a single field
// of a single time series at once, which reduces the load on influx
// compared to fetching all of the values over and over.
//
// The result may contain non-existing time series.
func (c *Client) Explore() ([]*Series, error) {
	log.Printf("Exploring scheme for database %q", c.database)

	fieldsOf, err := c.fieldsByMeasurement()
	if err != nil {
		return nil, fmt.Errorf("failed to get field keys: %s", err)
	}
	found, err := c.getSeries()
	if err != nil {
		return nil, fmt.Errorf("failed to get series: %s", err)
	}

	var expanded []*Series
	for _, sr := range found {
		fieldNames, known := fieldsOf[sr.Measurement]
		if !known {
			return nil, fmt.Errorf("can't find field keys for measurement %q", sr.Measurement)
		}
		for i := range fieldNames {
			expanded = append(expanded, &Series{
				Measurement: sr.Measurement,
				Field:       fieldNames[i],
				LabelPairs:  sr.LabelPairs,
			})
		}
	}
	return expanded, nil
}
// ChunkedResponse is a wrapper over influx.ChunkedResponse.
// Used for better memory usage control while iterating
// over huge time series.
type ChunkedResponse struct {
	// cr is the wrapped influx response the chunks are read from
	cr *influx.ChunkedResponse
	// iq is the query which produced the response;
	// its command is used in error messages
	iq influx.Query
	// field is the name of the column holding the values
	// of the requested field
	field string
}
// Close closes the wrapped influx chunked response
// and returns the error reported by it, if any.
func (cr *ChunkedResponse) Close() error {
	return cr.cr.Close()
}
// Next reads the next part/chunk of time series and returns
// its timestamps in milliseconds together with the matching values.
// Returns io.EOF when time series was read entirely.
//
// A chunk without series is not an error: nil slices and a nil error
// are returned for it, so the caller must be ready to skip empty results.
func (cr *ChunkedResponse) Next() ([]int64, []float64, error) {
	resp, err := cr.cr.NextResponse()
	if err != nil {
		return nil, nil, err
	}
	if resp.Error() != nil {
		return nil, nil, fmt.Errorf("response error for %s: %s", cr.iq.Command, resp.Error())
	}
	if len(resp.Results) != 1 {
		return nil, nil, fmt.Errorf("unexpected number of results in response: %d", len(resp.Results))
	}
	results, err := parseResult(resp.Results[0])
	if err != nil {
		return nil, nil, err
	}
	if len(results) < 1 {
		return nil, nil, nil
	}
	r := results[0]

	const key = "time"
	timestamps, ok := r.values[key]
	if !ok {
		return nil, nil, fmt.Errorf("response doesn't contain field %q", key)
	}
	fieldValues, ok := r.values[cr.field]
	if !ok {
		return nil, nil, fmt.Errorf("response doesn't contain field %q", cr.field)
	}

	values := make([]float64, len(fieldValues))
	for i, fv := range fieldValues {
		v, err := toFloat64(fv)
		if err != nil {
			// report the original value, since v holds no useful data on error
			return nil, nil, fmt.Errorf("failed to convert value %q.%v to float64: %s",
				cr.field, fv, err)
		}
		values[i] = v
	}

	ts := make([]int64, len(timestamps))
	for i, v := range timestamps {
		dateStr, ok := v.(string)
		if !ok {
			// return an error instead of panicking on unexpected response contents
			return nil, nil, fmt.Errorf("unexpected type %T of timestamp %v; expected string", v, v)
		}
		t, err := parseDate(dateStr)
		if err != nil {
			return nil, nil, err
		}
		ts[i] = t
	}
	return ts, values, nil
}
// FetchDataPoints issues the chunked SELECT query for the field
// of the series s, restricted by the configured time filter,
// and returns the response to read the datapoints from.
func (c *Client) FetchDataPoints(s *Series) (*ChunkedResponse, error) {
	query := influx.Query{
		Command:         s.fetchQuery(c.filterTime),
		Database:        c.database,
		RetentionPolicy: c.retention,
		Chunked:         true,
		ChunkSize:       1e4,
	}
	resp, err := c.QueryAsChunk(query)
	if err != nil {
		return nil, fmt.Errorf("query %q err: %s", query.Command, err)
	}
	return &ChunkedResponse{
		cr:    resp,
		iq:    query,
		field: s.Field,
	}, nil
}
// fieldsByMeasurement executes "show field keys" and returns
// the names of the fields grouped by the measurement name.
// Fields of the "string" type are skipped, since they can't be imported.
func (c *Client) fieldsByMeasurement() (map[string][]string, error) {
	query := influx.Query{
		Command:         "show field keys",
		Database:        c.database,
		RetentionPolicy: c.retention,
	}
	log.Printf("fetching fields: %s", stringify(query))
	rows, err := c.do(query)
	if err != nil {
		return nil, fmt.Errorf("error while executing query %q: %s", query.Command, err)
	}

	const (
		fKey  = "fieldKey"
		fType = "fieldType"
	)
	var total, skipped int
	byMeasurement := make(map[string][]string, len(rows))
	for _, row := range rows {
		kinds := row.values[fType]
		names := row.values[fKey]
		kept := make([]string, 0)
		for i := range names {
			if kinds[i].(string) == "string" {
				skipped++
				continue
			}
			kept = append(kept, names[i].(string))
			total++
		}
		byMeasurement[row.name] = kept
	}

	switch {
	case skipped > 0:
		log.Printf("found %d fields; skipped %d non-numeric fields", total, skipped)
	default:
		log.Printf("found %d fields", total)
	}
	return byMeasurement, nil
}
// getSeries executes the chunked "show series" query, extended with
// the configured series filter if any, and returns the parsed series.
func (c *Client) getSeries() ([]*Series, error) {
	com := "show series"
	if c.filterSeries != "" {
		com = fmt.Sprintf("%s %s", com, c.filterSeries)
	}
	q := influx.Query{
		Command:         com,
		Database:        c.database,
		RetentionPolicy: c.retention,
		Chunked:         true,
		ChunkSize:       c.chunkSize,
	}

	log.Printf("fetching series: %s", stringify(q))
	cr, err := c.QueryAsChunk(q)
	if err != nil {
		return nil, fmt.Errorf("error while executing query %q: %s", q.Command, err)
	}
	// release the response on every return path
	defer func() {
		_ = cr.Close()
	}()

	const key = "key"
	var result []*Series
	for {
		resp, err := cr.NextResponse()
		if err != nil {
			if err == io.EOF {
				break
			}
			return nil, err
		}
		if resp.Error() != nil {
			return nil, fmt.Errorf("response error for query %q: %s", q.Command, resp.Error())
		}
		// guard the index below against a response without results
		if len(resp.Results) < 1 {
			return nil, fmt.Errorf("query %q returned a response with 0 results", q.Command)
		}
		qValues, err := parseResult(resp.Results[0])
		if err != nil {
			return nil, err
		}
		for _, qv := range qValues {
			for _, v := range qv.values[key] {
				seriesKey, ok := v.(string)
				if !ok {
					return nil, fmt.Errorf("unexpected type %T of series key %v; expected string", v, v)
				}
				s := &Series{}
				if err := s.unmarshal(seriesKey); err != nil {
					return nil, err
				}
				result = append(result, s)
			}
		}
	}
	log.Printf("found %d series", len(result))
	return result, nil
}
// do executes the query q and returns the parsed first result
// of the response. A response without results is reported as an error.
func (c *Client) do(q influx.Query) ([]queryValues, error) {
	resp, err := c.Query(q)
	switch {
	case err != nil:
		return nil, fmt.Errorf("query %q err: %s", q.Command, err)
	case len(resp.Results) < 1:
		return nil, fmt.Errorf("exploration query %q returned 0 results", q.Command)
	}
	return parseResult(resp.Results[0])
}

View File

@@ -0,0 +1,127 @@
package influx
import "testing"
func TestFetchQuery(t *testing.T) {
testCases := []struct {
s Series
timeFilter string
expected string
}{
{
s: Series{
Measurement: "cpu",
Field: "value",
LabelPairs: []LabelPair{
{
Name: "foo",
Value: "bar",
},
},
},
expected: `select "value" from "cpu" where "foo"='bar'`,
},
{
s: Series{
Measurement: "cpu",
Field: "value",
LabelPairs: []LabelPair{
{
Name: "foo",
Value: "bar",
},
{
Name: "baz",
Value: "qux",
},
},
},
expected: `select "value" from "cpu" where "foo"='bar' and "baz"='qux'`,
},
{
s: Series{
Measurement: "cpu",
Field: "value",
LabelPairs: []LabelPair{
{
Name: "foo",
Value: "b'ar",
},
},
},
timeFilter: "time >= now()",
expected: `select "value" from "cpu" where "foo"='b\'ar' and time >= now()`,
},
{
s: Series{
Measurement: "cpu",
Field: "value",
LabelPairs: []LabelPair{
{
Name: "name",
Value: `dev-mapper-centos\x2dswap.swap`,
},
{
Name: "state",
Value: "dev-mapp'er-c'en'tos",
},
},
},
timeFilter: "time >= now()",
expected: `select "value" from "cpu" where "name"='dev-mapper-centos\\x2dswap.swap' and "state"='dev-mapp\'er-c\'en\'tos' and time >= now()`,
},
{
s: Series{
Measurement: "cpu",
Field: "value",
},
timeFilter: "time >= now()",
expected: `select "value" from "cpu" where time >= now()`,
},
{
s: Series{
Measurement: "cpu",
Field: "value",
},
expected: `select "value" from "cpu"`,
},
}
for _, tc := range testCases {
query := tc.s.fetchQuery(tc.timeFilter)
if query != tc.expected {
t.Fatalf("got: \n%s;\nexpected: \n%s", query, tc.expected)
}
}
}
// TestTimeFilter verifies the time condition built
// for every combination of the start and end bounds.
func TestTimeFilter(t *testing.T) {
	const (
		from = "2020-01-01T20:07:00Z"
		to   = "2020-01-01T21:07:00Z"
	)
	testCases := []struct {
		start, end string
		expected   string
	}{
		{start: from, end: to, expected: "time >= '2020-01-01T20:07:00Z' and time <= '2020-01-01T21:07:00Z'"},
		{expected: ""},
		{start: from, expected: "time >= '2020-01-01T20:07:00Z'"},
		{end: to, expected: "time <= '2020-01-01T21:07:00Z'"},
	}

	for i := range testCases {
		tc := testCases[i]
		if f := timeFilter(tc.start, tc.end); f != tc.expected {
			t.Fatalf("got: \n%q;\nexpected: \n%q", f, tc.expected)
		}
	}
}

191
app/vmctl/influx/parser.go Normal file
View File

@@ -0,0 +1,191 @@
package influx
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
influx "github.com/influxdata/influxdb/client/v2"
)
// queryValues holds the parsed contents
// of a single series of a query result.
type queryValues struct {
	// name is the name of the series
	name string
	// values maps the name of the column to the values
	// of this column collected from the rows of the series
	values map[string][]interface{}
}
// parseResult converts the row-oriented influx result r into
// column-oriented queryValues, one per series of the result.
// An error is returned if the result itself carries an error.
func parseResult(r influx.Result) ([]queryValues, error) {
	if len(r.Err) > 0 {
		return nil, fmt.Errorf("result error: %s", r.Err)
	}

	parsed := make([]queryValues, 0, len(r.Series))
	for _, series := range r.Series {
		columns := make(map[string][]interface{}, len(series.Values))
		for _, record := range series.Values {
			for col, cell := range record {
				name := series.Columns[col]
				columns[name] = append(columns[name], cell)
			}
		}
		parsed = append(parsed, queryValues{
			name:   series.Name,
			values: columns,
		})
	}
	return parsed, nil
}
// toFloat64 converts the value v of a query result to float64.
//
// Numeric types are converted directly, strings are parsed as floats
// and booleans are converted to 1 (true) and 0 (false).
// An error naming the type of v is returned for any other value,
// including nil.
func toFloat64(v interface{}) (float64, error) {
	switch i := v.(type) {
	case json.Number:
		return i.Float64()
	case float64:
		return i, nil
	case float32:
		return float64(i), nil
	case int64:
		return float64(i), nil
	case int32:
		return float64(i), nil
	case int16:
		return float64(i), nil
	case int8:
		return float64(i), nil
	case int:
		return float64(i), nil
	case uint64:
		return float64(i), nil
	case uint32:
		return float64(i), nil
	case uint16:
		return float64(i), nil
	case uint8:
		return float64(i), nil
	case uint:
		return float64(i), nil
	case string:
		return strconv.ParseFloat(i, 64)
	case bool:
		if i {
			return 1, nil
		}
		return 0, nil
	default:
		// print the type as well, since the value alone
		// doesn't explain why the conversion failed
		return 0, fmt.Errorf("unexpected value type %T: %v", v, v)
	}
}
// parseDate parses the RFC3339 date dateStr and returns it
// as a unix timestamp in milliseconds.
func parseDate(dateStr string) (int64, error) {
	parsed, err := time.Parse(time.RFC3339, dateStr)
	if err == nil {
		return parsed.UnixNano() / int64(time.Millisecond), nil
	}
	return 0, fmt.Errorf("cannot parse %q: %s", dateStr, err)
}
// stringify returns the human-readable description of the query q
// used in log messages. Every part of the query is quoted.
func stringify(q influx.Query) string {
	return "command: " + strconv.Quote(q.Command) +
		"; database: " + strconv.Quote(q.Database) +
		"; retention: " + strconv.Quote(q.RetentionPolicy)
}
// unmarshal parses the series key v, which consists of the measurement
// name optionally followed by comma-separated tags, into s.
// The first unescaped comma separates the measurement from the tags.
func (s *Series) unmarshal(v string) error {
	// the cheaper code paths are used if v has no backslashes at all
	plain := !strings.Contains(v, `\`)

	sep := nextUnescapedChar(v, ',', plain)
	if sep < 0 {
		// there are no tags
		s.Measurement = unescapeTagValue(v, plain)
		return nil
	}

	s.Measurement = unescapeTagValue(v[:sep], plain)
	tags, err := unmarshalTags(v[sep+1:], plain)
	s.LabelPairs = tags
	if err != nil {
		return fmt.Errorf("failed to unmarhsal tags: %s", err)
	}
	return nil
}
// unmarshalTags parses the comma-separated list of name=value tags in s.
// noEscapeChars must be set only if s contains no backslashes.
//
// Tags with an empty name or an empty value are skipped, no matter
// where they are located in s. An error is returned for a tag
// without the '=' separator.
func unmarshalTags(s string, noEscapeChars bool) ([]LabelPair, error) {
	var result []LabelPair
	for {
		n := nextUnescapedChar(s, ',', noEscapeChars)
		last := n < 0
		tag := s
		if !last {
			tag = s[:n]
			s = s[n+1:]
		}

		var lp LabelPair
		if err := lp.unmarshal(tag, noEscapeChars); err != nil {
			return nil, err
		}
		if len(lp.Name) > 0 && len(lp.Value) > 0 {
			result = append(result, lp)
		}
		if last {
			// An empty last tag must not discard the tags collected so far.
			return result, nil
		}
	}
}
// unmarshal parses a single name=value tag from s into lp.
// The name and the value are split at the first unescaped '=';
// an error is returned if s has none.
func (lp *LabelPair) unmarshal(s string, noEscapeChars bool) error {
	eq := nextUnescapedChar(s, '=', noEscapeChars)
	if eq < 0 {
		return fmt.Errorf("missing tag value for %q", s)
	}
	name, value := s[:eq], s[eq+1:]
	lp.Name = unescapeTagValue(name, noEscapeChars)
	lp.Value = unescapeTagValue(value, noEscapeChars)
	return nil
}
// unescapeTagValue removes the escaping backslashes from s.
// A backslash is removed only if it precedes a space, a comma,
// an equals sign or another backslash; any other backslash,
// including a trailing one, is kept as is.
// s is returned unchanged if noEscapeChars is set.
func unescapeTagValue(s string, noEscapeChars bool) string {
	if noEscapeChars || strings.IndexByte(s, '\\') < 0 {
		// Fast path - nothing to unescape.
		return s
	}

	// Slow path - copy s byte by byte dropping the escaping backslashes.
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c != '\\' {
			b.WriteByte(c)
			continue
		}
		if i == len(s)-1 {
			// trailing backslash escapes nothing
			b.WriteByte('\\')
			break
		}
		// consume the escaped char together with the backslash
		i++
		next := s[i]
		switch next {
		case ' ', ',', '=', '\\':
			// the backslash is an escape char, drop it
		default:
			b.WriteByte('\\')
		}
		b.WriteByte(next)
	}
	return b.String()
}
// nextUnescapedChar returns the index of the first occurrence of ch in s
// which isn't escaped, or -1 if there is no such occurrence.
// An occurrence is escaped if it is preceded by an odd number
// of backslashes.
func nextUnescapedChar(s string, ch byte, noEscapeChars bool) int {
	if noEscapeChars {
		// Fast path: just search for ch in s, since s has no escape chars.
		return strings.IndexByte(s, ch)
	}

	// offset is the position in s the search continues from
	offset := 0
	for {
		rest := s[offset:]
		pos := strings.IndexByte(rest, ch)
		if pos < 0 {
			return -1
		}
		// count the backslashes located right before ch
		slashes := 0
		for j := pos; j > 0 && rest[j-1] == '\\'; j-- {
			slashes++
		}
		if slashes%2 == 0 {
			// the backslashes, if any, escape each other, not ch
			return offset + pos
		}
		// ch is escaped, continue the search after it
		offset += pos + 1
	}
}

View File

@@ -0,0 +1,60 @@
package influx
import (
"reflect"
"testing"
)
// TestSeries_Unmarshal verifies the parsing of series keys,
// including the keys with escaped commas, equals signs and spaces.
func TestSeries_Unmarshal(t *testing.T) {
	// lp builds a single tag
	lp := func(name, value string) LabelPair {
		return LabelPair{Name: name, Value: value}
	}
	// expect builds the expected series without the field
	expect := func(measurement string, tags ...LabelPair) Series {
		return Series{Measurement: measurement, LabelPairs: tags}
	}

	testCases := []struct {
		got  string
		want Series
	}{
		{got: "cpu", want: expect("cpu")},
		{got: "cpu,host=localhost", want: expect("cpu", lp("host", "localhost"))},
		{
			got:  "cpu,host=localhost,instance=instance",
			want: expect("cpu", lp("host", "localhost"), lp("instance", "instance")),
		},
		{
			got:  `fo\,bar\=baz,x\=\b=\\a\,\=\q\ `,
			want: expect("fo,bar=baz", lp(`x=\b`, `\a,=\q `)),
		},
		{
			got:  "cpu,host=192.168.0.1,instance=fe80::fdc8:5e36:c2c6:baac%utun1",
			want: expect("cpu", lp("host", "192.168.0.1"), lp("instance", "fe80::fdc8:5e36:c2c6:baac%utun1")),
		},
		{
			got: `cpu,db=db1,host=localhost,server=host\=localhost\ user\=user\ `,
			want: expect("cpu", lp("db", "db1"),
				lp("host", "localhost"), lp("server", "host=localhost user=user ")),
		},
	}

	for i := range testCases {
		tc := testCases[i]
		var s Series
		if err := s.unmarshal(tc.got); err != nil {
			t.Fatalf("%q: unmarshal err: %s", tc.got, err)
		}
		if !reflect.DeepEqual(s, tc.want) {
			t.Fatalf("%q: expected\n%#v\nto be equal\n%#v", tc.got, s, tc.want)
		}
	}
}

159
app/vmctl/main.go Normal file
View File

@@ -0,0 +1,159 @@
package main
import (
"fmt"
"log"
"os"
"os/signal"
"strings"
"syscall"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/urfave/cli/v2"
)
// main builds the vmctl command-line application with the influx,
// prometheus and vm-native commands and runs it with the process arguments.
// The process exits when SIGINT or SIGTERM is received.
func main() {
	start := time.Now()

	// influxAction migrates timeseries from InfluxDB
	influxAction := func(c *cli.Context) error {
		fmt.Println("InfluxDB import mode")

		influxClient, err := influx.NewClient(influx.Config{
			Addr:      c.String(influxAddr),
			Username:  c.String(influxUser),
			Password:  c.String(influxPassword),
			Database:  c.String(influxDB),
			Retention: c.String(influxRetention),
			Filter: influx.Filter{
				Series:    c.String(influxFilterSeries),
				TimeStart: c.String(influxFilterTimeStart),
				TimeEnd:   c.String(influxFilterTimeEnd),
			},
			ChunkSize: c.Int(influxChunkSize),
		})
		if err != nil {
			return fmt.Errorf("failed to create influx client: %s", err)
		}

		importer, err := vm.NewImporter(initConfigVM(c))
		if err != nil {
			return fmt.Errorf("failed to create VM importer: %s", err)
		}

		processor := newInfluxProcessor(influxClient, importer,
			c.Int(influxConcurrency), c.String(influxMeasurementFieldSeparator))
		return processor.run(c.Bool(globalSilent))
	}

	// prometheusAction migrates timeseries from a Prometheus snapshot
	prometheusAction := func(c *cli.Context) error {
		fmt.Println("Prometheus import mode")

		importer, err := vm.NewImporter(initConfigVM(c))
		if err != nil {
			return fmt.Errorf("failed to create VM importer: %s", err)
		}

		cl, err := prometheus.NewClient(prometheus.Config{
			Snapshot: c.String(promSnapshot),
			Filter: prometheus.Filter{
				TimeMin:    c.String(promFilterTimeStart),
				TimeMax:    c.String(promFilterTimeEnd),
				Label:      c.String(promFilterLabel),
				LabelValue: c.String(promFilterLabelValue),
			},
		})
		if err != nil {
			return fmt.Errorf("failed to create prometheus client: %s", err)
		}

		pp := prometheusProcessor{
			cl: cl,
			im: importer,
			cc: c.Int(promConcurrency),
		}
		return pp.run(c.Bool(globalSilent))
	}

	// vmNativeAction migrates timeseries between VictoriaMetrics installations
	vmNativeAction := func(c *cli.Context) error {
		fmt.Println("VictoriaMetrics Native import mode")

		match := c.String(vmNativeFilterMatch)
		if match == "" {
			return fmt.Errorf("flag %q can't be empty", vmNativeFilterMatch)
		}

		p := vmNativeProcessor{
			filter: filter{
				match:     match,
				timeStart: c.String(vmNativeFilterTimeStart),
				timeEnd:   c.String(vmNativeFilterTimeEnd),
			},
			src: &vmNativeClient{
				addr:     strings.Trim(c.String(vmNativeSrcAddr), "/"),
				user:     c.String(vmNativeSrcUser),
				password: c.String(vmNativeSrcPassword),
			},
			dst: &vmNativeClient{
				addr:        strings.Trim(c.String(vmNativeDstAddr), "/"),
				user:        c.String(vmNativeDstUser),
				password:    c.String(vmNativeDstPassword),
				extraLabels: c.StringSlice(vmExtraLabel),
			},
		}
		return p.run()
	}

	app := &cli.App{
		Name:    "vmctl",
		Usage:   "Victoria metrics command-line tool",
		Version: buildinfo.Version,
		Commands: []*cli.Command{
			{
				Name:   "influx",
				Usage:  "Migrate timeseries from InfluxDB",
				Flags:  mergeFlags(globalFlags, influxFlags, vmFlags),
				Action: influxAction,
			},
			{
				Name:   "prometheus",
				Usage:  "Migrate timeseries from Prometheus",
				Flags:  mergeFlags(globalFlags, promFlags, vmFlags),
				Action: prometheusAction,
			},
			{
				Name:   "vm-native",
				Usage:  "Migrate time series between VictoriaMetrics installations via native binary format",
				Flags:  vmNativeFlags,
				Action: vmNativeAction,
			},
		},
	}

	// stop the process on the termination signal
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		<-sigCh
		fmt.Println("\r- Execution cancelled")
		os.Exit(0)
	}()

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
	log.Printf("Total time: %v", time.Since(start))
}
// initConfigVM builds the configuration of the VictoriaMetrics importer
// from the command-line flags of the context c.
//
// The concurrency is clamped to the [0, 255] range before the conversion
// to uint8. A plain conversion would silently wrap the value around,
// e.g. 256 would become 0 and -1 would become 255.
func initConfigVM(c *cli.Context) vm.Config {
	// the biggest value which fits into uint8
	const maxConcurrency = 255

	concurrency := c.Int(vmConcurrency)
	switch {
	case concurrency < 0:
		concurrency = 0
	case concurrency > maxConcurrency:
		concurrency = maxConcurrency
	}

	return vm.Config{
		Addr:               c.String(vmAddr),
		User:               c.String(vmUser),
		Password:           c.String(vmPassword),
		Concurrency:        uint8(concurrency),
		Compress:           c.Bool(vmCompress),
		AccountID:          c.String(vmAccountID),
		BatchSize:          c.Int(vmBatchSize),
		SignificantFigures: c.Int(vmSignificantFigures),
		RoundDigits:        c.Int(vmRoundDigits),
		ExtraLabels:        c.StringSlice(vmExtraLabel),
	}
}

131
app/vmctl/prometheus.go Normal file
View File

@@ -0,0 +1,131 @@
package main
import (
"fmt"
"log"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/cheggaaa/pb/v3"
"github.com/prometheus/prometheus/tsdb"
)
// prometheusProcessor migrates timeseries from the blocks
// of a Prometheus snapshot into VictoriaMetrics.
type prometheusProcessor struct {
	// prometheus client fetches and reads
	// snapshot blocks
	cl *prometheus.Client
	// importer performs import requests
	// for timeseries data returned from
	// snapshot blocks
	im *vm.Importer
	// cc stands for concurrency
	// and defines number of concurrently
	// running snapshot block readers
	cc int
}
// run explores the blocks of the snapshot, asks the user for confirmation
// (unless silent is set) and imports the timeseries of every block
// using concurrently running block readers.
//
// The first error reported by a reader or by the importer
// aborts the import and is returned.
func (pp *prometheusProcessor) run(silent bool) error {
	blocks, err := pp.cl.Explore()
	if err != nil {
		return fmt.Errorf("explore failed: %s", err)
	}
	if len(blocks) < 1 {
		return fmt.Errorf("found no blocks to import")
	}
	question := fmt.Sprintf("Found %d blocks to import. Continue?", len(blocks))
	if !silent && !prompt(question) {
		return nil
	}

	// At least one reader is required: without readers the send
	// to blockReadersCh blocks forever, while a negative value
	// makes the channel creation and wg.Add panic.
	workers := pp.cc
	if workers < 1 {
		workers = 1
	}

	bar := pb.StartNew(len(blocks))
	blockReadersCh := make(chan tsdb.BlockReader)
	// every reader sends at most one error before exiting,
	// so the sends to errCh never block
	errCh := make(chan error, workers)
	pp.im.ResetStats()

	var wg sync.WaitGroup
	wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			for br := range blockReadersCh {
				if err := pp.do(br); err != nil {
					errCh <- fmt.Errorf("read failed for block %q: %s", br.Meta().ULID, err)
					return
				}
				bar.Increment()
			}
		}()
	}

	// any error breaks the import
	for _, br := range blocks {
		select {
		case promErr := <-errCh:
			close(blockReadersCh)
			return fmt.Errorf("prometheus error: %s", promErr)
		case vmErr := <-pp.im.Errors():
			close(blockReadersCh)
			return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
		case blockReadersCh <- br:
		}
	}

	close(blockReadersCh)
	wg.Wait()

	// A reader may have failed on one of the last dispatched blocks,
	// after the loop above stopped reading errCh. Such an error
	// must not be reported as a successful import.
	select {
	case promErr := <-errCh:
		return fmt.Errorf("prometheus error: %s", promErr)
	default:
	}

	// wait for all buffers to flush
	pp.im.Close()
	// drain import errors channel
	for vmErr := range pp.im.Errors() {
		return fmt.Errorf("Import process failed: \n%s", wrapErr(vmErr))
	}
	bar.Finish()
	log.Println("Import finished!")
	log.Print(pp.im.Stats())
	return nil
}
// do reads all the series of the block b and passes each of them,
// with all of its samples, to the importer. The value of the `__name__`
// label becomes the name of the timeseries; a series without
// this label is reported as an error.
func (pp *prometheusProcessor) do(b tsdb.BlockReader) error {
	set, err := pp.cl.Read(b)
	if err != nil {
		return fmt.Errorf("failed to read block: %s", err)
	}

	for set.Next() {
		var (
			metricName string
			pairs      []vm.LabelPair
		)
		cur := set.At()
		for _, l := range cur.Labels() {
			switch l.Name {
			case "__name__":
				metricName = l.Value
			default:
				pairs = append(pairs, vm.LabelPair{
					Name:  l.Name,
					Value: l.Value,
				})
			}
		}
		if metricName == "" {
			return fmt.Errorf("failed to find `__name__` label in labelset for block %v", b.Meta().ULID)
		}

		// collect all the samples of the series
		var (
			ts   []int64
			vals []float64
		)
		samples := cur.Iterator()
		for samples.Next() {
			t, v := samples.At()
			ts = append(ts, t)
			vals = append(vals, v)
		}
		if err := samples.Err(); err != nil {
			return err
		}

		pp.im.Input() <- &vm.TimeSeries{
			Name:       metricName,
			LabelPairs: pairs,
			Timestamps: ts,
			Values:     vals,
		}
	}
	return set.Err()
}

Some files were not shown because too many files have changed in this diff Show More